unicodeobject.c revision f3fd733f928752c9e35f8f5141a54cd21c0993b5
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5Unicode Integration Proposal (see file Misc/unicode.txt).
6
7Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
10Copyright (c) Corporation for National Research Initiatives.
11
12--------------------------------------------------------------------
13The original string type implementation is:
14
15  Copyright (c) 1999 by Secret Labs AB
16  Copyright (c) 1999 by Fredrik Lundh
17
18By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
41
42#define PY_SSIZE_T_CLEAN
43#include "Python.h"
44#include "ucnhash.h"
45
46#ifdef MS_WINDOWS
47#include <windows.h>
48#endif
49
/* Upper bound on the number of Unicode objects parked on the free list. */

#define PyUnicode_MAXFREELIST       1024

/* Threshold for the free list "keep alive" optimization.

   When a Unicode object is put on the free list, its character buffer
   is kept allocated as long as the object's length is below this limit
   (unicode_dealloc() releases the buffer once length reaches the
   limit).  This saves malloc() calls for small Unicode objects.

   The price is unused memory: at worst PyUnicode_MAXFREELIST objects,
   each holding on to a buffer of just under KEEPALIVE_SIZE_LIMIT
   characters plus the malloc() overhead.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Byte order switches; little endian is assumed unless WORDS_BIGENDIAN
   is defined. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
80
81/* --- Globals ------------------------------------------------------------
82
83   The globals are initialized by the _PyUnicode_Init() API and should
84   not be used before calling that API.
85
86*/
87
88
89#ifdef __cplusplus
90extern "C" {
91#endif
92
93/* This dictionary holds all interned unicode strings.  Note that references
94   to strings in this dictionary are *not* counted in the string's ob_refcnt.
95   When the interned string reaches a refcnt of 0 the string deallocation
96   function will delete the reference from this dictionary.
97
98   Another way to look at this is that to say that the actual reference
99   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
100*/
101static PyObject *interned;
102
103/* Free list for Unicode objects */
104static PyUnicodeObject *free_list;
105static int numfree;
106
107/* The empty Unicode object is shared to improve performance. */
108static PyUnicodeObject *unicode_empty;
109
110/* Single character Unicode strings in the Latin-1 range are being
111   shared as well. */
112static PyUnicodeObject *unicode_latin1[256];
113
/* Fast detection of the most frequent whitespace characters: one entry
   per ASCII code point (0x00..0x7F), 1 for whitespace and 0 otherwise.
   Each row below covers eight consecutive code points. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
             0x0B LINE TABULATION, 0x0C FORM FEED,
             0x0D CARRIAGE RETURN */
    /* 0x08 */ 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
             0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    /* 0x18 */ 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20: 0x20 SPACE */
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x38 */ 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x48 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x58 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x68 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x78 */ 0, 0, 0, 0, 0, 0, 0, 0
};
144
145static PyObject *
146unicode_encode_call_errorhandler(const char *errors,
147       PyObject **errorHandler,const char *encoding, const char *reason,
148       const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
149       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
150
151static void
152raise_encode_exception(PyObject **exceptionObject,
153		       const char *encoding,
154		       const Py_UNICODE *unicode, Py_ssize_t size,
155		       Py_ssize_t startpos, Py_ssize_t endpos,
156		       const char *reason);
157
/* Fast detection of line break characters in the ASCII range: one
   entry per code point (0x00..0x7F), 1 for a line break and 0
   otherwise.  Each row below covers eight consecutive code points. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08: 0x0A LINE FEED, 0x0B LINE TABULATION,
             0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    /* 0x08 */ 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
             0x1E RECORD SEPARATOR */
    /* 0x18 */ 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x38 */ 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x48 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x58 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x68 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x78 */ 0, 0, 0, 0, 0, 0, 0, 0
};
185
186
187Py_UNICODE
188PyUnicode_GetMax(void)
189{
190#ifdef Py_UNICODE_WIDE
191    return 0x10FFFF;
192#else
193    /* This is actually an illegal character, so it should
194       not be passed to unichr. */
195    return 0xFFFF;
196#endif
197}
198
199/* --- Bloom Filters ----------------------------------------------------- */
200
201/* stuff to implement simple "bloom filters" for Unicode characters.
202   to keep things simple, we use a single bitmask, using the least 5
203   bits from each unicode characters as the bit index. */
204
205/* the linebreak mask is set up by Unicode_Init below */
206
207#if LONG_BIT >= 128
208#define BLOOM_WIDTH 128
209#elif LONG_BIT >= 64
210#define BLOOM_WIDTH 64
211#elif LONG_BIT >= 32
212#define BLOOM_WIDTH 32
213#else
214#error "LONG_BIT is smaller than 32"
215#endif
216
217#define BLOOM_MASK unsigned long
218
219static BLOOM_MASK bloom_linebreak;
220
221#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
222#define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
223
224#define BLOOM_LINEBREAK(ch)                                             \
225    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
226     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
227
228Py_LOCAL_INLINE(BLOOM_MASK)
229make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
230{
231    /* calculate simple bloom-style bitmask for a given unicode string */
232
233    BLOOM_MASK mask;
234    Py_ssize_t i;
235
236    mask = 0;
237    for (i = 0; i < len; i++)
238        BLOOM_ADD(mask, ptr[i]);
239
240    return mask;
241}
242
243Py_LOCAL_INLINE(int)
244unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
245{
246    Py_ssize_t i;
247
248    for (i = 0; i < setlen; i++)
249        if (set[i] == chr)
250            return 1;
251
252    return 0;
253}
254
/* Membership test with a bloom pre-filter: the linear search in
   unicode_member() only runs when the mask says chr may be present.
   The expansion is fully parenthesized so the macro behaves as a single
   operand next to !, || and similar operators.  chr is evaluated
   twice. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
257
258/* --- Unicode Object ----------------------------------------------------- */
259
260static int
261unicode_resize(register PyUnicodeObject *unicode,
262	       Py_ssize_t length)
263{
264    void *oldstr;
265
266    /* Shortcut if there's nothing much to do. */
267    if (unicode->length == length)
268        goto reset;
269
270    /* Resizing shared object (unicode_empty or single character
271       objects) in-place is not allowed. Use PyUnicode_Resize()
272       instead ! */
273
274    if (unicode == unicode_empty ||
275        (unicode->length == 1 &&
276         unicode->str[0] < 256U &&
277         unicode_latin1[unicode->str[0]] == unicode)) {
278        PyErr_SetString(PyExc_SystemError,
279                        "can't resize shared str objects");
280        return -1;
281    }
282
283    /* We allocate one more byte to make sure the string is Ux0000 terminated.
284       The overallocation is also used by fastsearch, which assumes that it's
285       safe to look at str[length] (without making any assumptions about what
286       it contains). */
287
288    oldstr = unicode->str;
289    unicode->str = PyObject_REALLOC(unicode->str,
290                                    sizeof(Py_UNICODE) * (length + 1));
291    if (!unicode->str) {
292        unicode->str = (Py_UNICODE *)oldstr;
293        PyErr_NoMemory();
294        return -1;
295    }
296    unicode->str[length] = 0;
297    unicode->length = length;
298
299  reset:
300    /* Reset the object caches */
301    if (unicode->defenc) {
302        Py_CLEAR(unicode->defenc);
303    }
304    unicode->hash = -1;
305
306    return 0;
307}
308
309/* We allocate one more byte to make sure the string is
310   Ux0000 terminated; some code (e.g. new_identifier)
311   relies on that.
312
313   XXX This allocator could further be enhanced by assuring that the
314   free list never reduces its size below 1.
315
316*/
317
318static PyUnicodeObject *
319_PyUnicode_New(Py_ssize_t length)
320{
321    register PyUnicodeObject *unicode;
322
323    /* Optimization for empty strings */
324    if (length == 0 && unicode_empty != NULL) {
325        Py_INCREF(unicode_empty);
326        return unicode_empty;
327    }
328
329    /* Ensure we won't overflow the size. */
330    if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
331        return (PyUnicodeObject *)PyErr_NoMemory();
332    }
333
334    /* Unicode freelist & memory allocation */
335    if (free_list) {
336        unicode = free_list;
337        free_list = *(PyUnicodeObject **)unicode;
338        numfree--;
339        if (unicode->str) {
340            /* Keep-Alive optimization: we only upsize the buffer,
341               never downsize it. */
342            if ((unicode->length < length) &&
343                unicode_resize(unicode, length) < 0) {
344                PyObject_DEL(unicode->str);
345                unicode->str = NULL;
346            }
347        }
348        else {
349            size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
350            unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
351        }
352        PyObject_INIT(unicode, &PyUnicode_Type);
353    }
354    else {
355        size_t new_size;
356        unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
357        if (unicode == NULL)
358            return NULL;
359        new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
360        unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
361    }
362
363    if (!unicode->str) {
364        PyErr_NoMemory();
365        goto onError;
366    }
367    /* Initialize the first element to guard against cases where
368     * the caller fails before initializing str -- unicode_resize()
369     * reads str[0], and the Keep-Alive optimization can keep memory
370     * allocated for str alive across a call to unicode_dealloc(unicode).
371     * We don't want unicode_resize to read uninitialized memory in
372     * that case.
373     */
374    unicode->str[0] = 0;
375    unicode->str[length] = 0;
376    unicode->length = length;
377    unicode->hash = -1;
378    unicode->state = 0;
379    unicode->defenc = NULL;
380    return unicode;
381
382  onError:
383    /* XXX UNREF/NEWREF interface should be more symmetrical */
384    _Py_DEC_REFTOTAL;
385    _Py_ForgetReference((PyObject *)unicode);
386    PyObject_Del(unicode);
387    return NULL;
388}
389
390static void
391unicode_dealloc(register PyUnicodeObject *unicode)
392{
393    switch (PyUnicode_CHECK_INTERNED(unicode)) {
394    case SSTATE_NOT_INTERNED:
395        break;
396
397    case SSTATE_INTERNED_MORTAL:
398        /* revive dead object temporarily for DelItem */
399        Py_REFCNT(unicode) = 3;
400        if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
401            Py_FatalError(
402                "deletion of interned string failed");
403        break;
404
405    case SSTATE_INTERNED_IMMORTAL:
406        Py_FatalError("Immortal interned string died.");
407
408    default:
409        Py_FatalError("Inconsistent interned string state.");
410    }
411
412    if (PyUnicode_CheckExact(unicode) &&
413        numfree < PyUnicode_MAXFREELIST) {
414        /* Keep-Alive optimization */
415        if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
416            PyObject_DEL(unicode->str);
417            unicode->str = NULL;
418            unicode->length = 0;
419        }
420        if (unicode->defenc) {
421            Py_CLEAR(unicode->defenc);
422        }
423        /* Add to free list */
424        *(PyUnicodeObject **)unicode = free_list;
425        free_list = unicode;
426        numfree++;
427    }
428    else {
429        PyObject_DEL(unicode->str);
430        Py_XDECREF(unicode->defenc);
431        Py_TYPE(unicode)->tp_free((PyObject *)unicode);
432    }
433}
434
435static int
436_PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
437{
438    register PyUnicodeObject *v;
439
440    /* Argument checks */
441    if (unicode == NULL) {
442        PyErr_BadInternalCall();
443        return -1;
444    }
445    v = *unicode;
446    if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
447        PyErr_BadInternalCall();
448        return -1;
449    }
450
451    /* Resizing unicode_empty and single character objects is not
452       possible since these are being shared. We simply return a fresh
453       copy with the same Unicode content. */
454    if (v->length != length &&
455        (v == unicode_empty || v->length == 1)) {
456        PyUnicodeObject *w = _PyUnicode_New(length);
457        if (w == NULL)
458            return -1;
459        Py_UNICODE_COPY(w->str, v->str,
460                        length < v->length ? length : v->length);
461        Py_DECREF(*unicode);
462        *unicode = w;
463        return 0;
464    }
465
466    /* Note that we don't have to modify *unicode for unshared Unicode
467       objects, since we can modify them in-place. */
468    return unicode_resize(v, length);
469}
470
471int
472PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
473{
474    return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
475}
476
477PyObject *
478PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
479{
480    PyUnicodeObject *unicode;
481
482    /* If the Unicode data is known at construction time, we can apply
483       some optimizations which share commonly used objects. */
484    if (u != NULL) {
485
486        /* Optimization for empty strings */
487        if (size == 0 && unicode_empty != NULL) {
488            Py_INCREF(unicode_empty);
489            return (PyObject *)unicode_empty;
490        }
491
492        /* Single character Unicode objects in the Latin-1 range are
493           shared when using this constructor */
494        if (size == 1 && *u < 256) {
495            unicode = unicode_latin1[*u];
496            if (!unicode) {
497                unicode = _PyUnicode_New(1);
498                if (!unicode)
499                    return NULL;
500                unicode->str[0] = *u;
501                unicode_latin1[*u] = unicode;
502            }
503            Py_INCREF(unicode);
504            return (PyObject *)unicode;
505        }
506    }
507
508    unicode = _PyUnicode_New(size);
509    if (!unicode)
510        return NULL;
511
512    /* Copy the Unicode data into the new object */
513    if (u != NULL)
514        Py_UNICODE_COPY(unicode->str, u, size);
515
516    return (PyObject *)unicode;
517}
518
519PyObject *
520PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
521{
522    PyUnicodeObject *unicode;
523
524    if (size < 0) {
525        PyErr_SetString(PyExc_SystemError,
526                        "Negative size passed to PyUnicode_FromStringAndSize");
527        return NULL;
528    }
529
530    /* If the Unicode data is known at construction time, we can apply
531       some optimizations which share commonly used objects.
532       Also, this means the input must be UTF-8, so fall back to the
533       UTF-8 decoder at the end. */
534    if (u != NULL) {
535
536        /* Optimization for empty strings */
537        if (size == 0 && unicode_empty != NULL) {
538            Py_INCREF(unicode_empty);
539            return (PyObject *)unicode_empty;
540        }
541
542        /* Single characters are shared when using this constructor.
543           Restrict to ASCII, since the input must be UTF-8. */
544        if (size == 1 && Py_CHARMASK(*u) < 128) {
545            unicode = unicode_latin1[Py_CHARMASK(*u)];
546            if (!unicode) {
547                unicode = _PyUnicode_New(1);
548                if (!unicode)
549                    return NULL;
550                unicode->str[0] = Py_CHARMASK(*u);
551                unicode_latin1[Py_CHARMASK(*u)] = unicode;
552            }
553            Py_INCREF(unicode);
554            return (PyObject *)unicode;
555        }
556
557        return PyUnicode_DecodeUTF8(u, size, NULL);
558    }
559
560    unicode = _PyUnicode_New(size);
561    if (!unicode)
562        return NULL;
563
564    return (PyObject *)unicode;
565}
566
567PyObject *
568PyUnicode_FromString(const char *u)
569{
570    size_t size = strlen(u);
571    if (size > PY_SSIZE_T_MAX) {
572        PyErr_SetString(PyExc_OverflowError, "input too long");
573        return NULL;
574    }
575
576    return PyUnicode_FromStringAndSize(u, size);
577}
578
579#ifdef HAVE_WCHAR_H
580
581#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
582# define CONVERT_WCHAR_TO_SURROGATES
583#endif
584
585#ifdef CONVERT_WCHAR_TO_SURROGATES
586
587/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
588   to convert from UTF32 to UTF16. */
589
590PyObject *
591PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
592{
593    PyUnicodeObject *unicode;
594    register Py_ssize_t i;
595    Py_ssize_t alloc;
596    const wchar_t *orig_w;
597
598    if (w == NULL) {
599        if (size == 0)
600            return PyUnicode_FromStringAndSize(NULL, 0);
601        PyErr_BadInternalCall();
602        return NULL;
603    }
604
605    if (size == -1) {
606        size = wcslen(w);
607    }
608
609    alloc = size;
610    orig_w = w;
611    for (i = size; i > 0; i--) {
612        if (*w > 0xFFFF)
613            alloc++;
614        w++;
615    }
616    w = orig_w;
617    unicode = _PyUnicode_New(alloc);
618    if (!unicode)
619        return NULL;
620
621    /* Copy the wchar_t data into the new object */
622    {
623        register Py_UNICODE *u;
624        u = PyUnicode_AS_UNICODE(unicode);
625        for (i = size; i > 0; i--) {
626            if (*w > 0xFFFF) {
627                wchar_t ordinal = *w++;
628                ordinal -= 0x10000;
629                *u++ = 0xD800 | (ordinal >> 10);
630                *u++ = 0xDC00 | (ordinal & 0x3FF);
631            }
632            else
633                *u++ = *w++;
634        }
635    }
636    return (PyObject *)unicode;
637}
638
639#else
640
641PyObject *
642PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
643{
644    PyUnicodeObject *unicode;
645
646    if (w == NULL) {
647        if (size == 0)
648            return PyUnicode_FromStringAndSize(NULL, 0);
649        PyErr_BadInternalCall();
650        return NULL;
651    }
652
653    if (size == -1) {
654        size = wcslen(w);
655    }
656
657    unicode = _PyUnicode_New(size);
658    if (!unicode)
659        return NULL;
660
661    /* Copy the wchar_t data into the new object */
662#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
663    memcpy(unicode->str, w, size * sizeof(wchar_t));
664#else
665    {
666        register Py_UNICODE *u;
667        register Py_ssize_t i;
668        u = PyUnicode_AS_UNICODE(unicode);
669        for (i = size; i > 0; i--)
670            *u++ = *w++;
671    }
672#endif
673
674    return (PyObject *)unicode;
675}
676
677#endif /* CONVERT_WCHAR_TO_SURROGATES */
678
679#undef CONVERT_WCHAR_TO_SURROGATES
680
681static void
682makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
683        int zeropad, int width, int precision, char c)
684{
685    *fmt++ = '%';
686    if (width) {
687        if (zeropad)
688            *fmt++ = '0';
689        fmt += sprintf(fmt, "%d", width);
690    }
691    if (precision)
692        fmt += sprintf(fmt, ".%d", precision);
693    if (longflag)
694        *fmt++ = 'l';
695    else if (longlongflag) {
696        /* longlongflag should only ever be nonzero on machines with
697           HAVE_LONG_LONG defined */
698#ifdef HAVE_LONG_LONG
699        char *f = PY_FORMAT_LONG_LONG;
700        while (*f)
701            *fmt++ = *f++;
702#else
703        /* we shouldn't ever get here */
704        assert(0);
705        *fmt++ = 'l';
706#endif
707    }
708    else if (size_tflag) {
709        char *f = PY_FORMAT_SIZE_T;
710        while (*f)
711            *fmt++ = *f++;
712    }
713    *fmt++ = c;
714    *fmt = '\0';
715}
716
/* helper for PyUnicode_FromFormatV()

   `f` must point at the '%' that starts a format specification.  The
   optional width, ".precision" and length modifier ("l", "ll" when
   HAVE_LONG_LONG is defined, or "z") are parsed; a length modifier is
   only recognized when directly followed by 'd', 'u' or 'i'.  Each
   parsed value is stored through the matching output pointer unless
   that pointer is NULL.

   Returns a pointer to the conversion character.  For malformed input
   the pointer is moved back one character in two cases: when the
   precision is directly followed by '%', and when the end of the string
   is reached. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;

    /* Handle %ld, %lu, %lld and %llu. */
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        f += 1;
    }

    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
781
/* Copy the NUL-terminated string `string` to the output position `s`,
   advancing `s` past the copied characters (the NUL itself is not
   copied).  Relies on variables named `copy` and `s` being in scope at
   the point of use. */
#define appendstring(string) {for (copy = (string);*copy;) *s++ = *copy++;}

/* size of fixed-size buffer for formatting single arguments */
#define ITEM_BUFFER_LEN 21
/* maximum number of characters required for output of %ld.  21 characters
   allows for 64-bit integers (in decimal) and an optional sign. */
#define MAX_LONG_CHARS 21
/* maximum number of characters required for output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
793
794PyObject *
795PyUnicode_FromFormatV(const char *format, va_list vargs)
796{
797    va_list count;
798    Py_ssize_t callcount = 0;
799    PyObject **callresults = NULL;
800    PyObject **callresult = NULL;
801    Py_ssize_t n = 0;
802    int width = 0;
803    int precision = 0;
804    int zeropad;
805    const char* f;
806    Py_UNICODE *s;
807    PyObject *string;
808    /* used by sprintf */
809    char buffer[ITEM_BUFFER_LEN+1];
810    /* use abuffer instead of buffer, if we need more space
811     * (which can happen if there's a format specifier with width). */
812    char *abuffer = NULL;
813    char *realbuffer;
814    Py_ssize_t abuffersize = 0;
815    char fmt[61]; /* should be enough for %0width.precisionlld */
816    const char *copy;
817
818    Py_VA_COPY(count, vargs);
819    /* step 1: count the number of %S/%R/%A/%s format specifications
820     * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
821     * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
822     * result in an array) */
823    for (f = format; *f; f++) {
824         if (*f == '%') {
825             /* skip width or width.precision (eg. "1.2" of "%1.2f") */
826             f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
827             if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
828                 ++callcount;
829         }
830         else if (128 <= (unsigned char)*f) {
831             PyErr_Format(PyExc_ValueError,
832                "PyUnicode_FromFormatV() expects an ASCII-encoded format "
833                "string, got a non-ASCII byte: 0x%02x",
834                (unsigned char)*f);
835             return NULL;
836         }
837    }
838    /* step 2: allocate memory for the results of
839     * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
840    if (callcount) {
841        callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
842        if (!callresults) {
843            PyErr_NoMemory();
844            return NULL;
845        }
846        callresult = callresults;
847    }
848    /* step 3: figure out how large a buffer we need */
849    for (f = format; *f; f++) {
850        if (*f == '%') {
851#ifdef HAVE_LONG_LONG
852            int longlongflag;
853#endif
854            const char* p;
855
856            p = f;
857            f = parse_format_flags(f, &width, NULL,
858                                   NULL, &longlongflag, NULL);
859
860            switch (*f) {
861            case 'c':
862            {
863#ifndef Py_UNICODE_WIDE
864                int ordinal = va_arg(count, int);
865                if (ordinal > 0xffff)
866                    n += 2;
867                else
868                    n++;
869#else
870                (void)va_arg(count, int);
871                n++;
872#endif
873                break;
874            }
875            case '%':
876                n++;
877                break;
878            case 'd': case 'u': case 'i': case 'x':
879                (void) va_arg(count, int);
880#ifdef HAVE_LONG_LONG
881                if (longlongflag) {
882                    if (width < MAX_LONG_LONG_CHARS)
883                        width = MAX_LONG_LONG_CHARS;
884                }
885                else
886#endif
887                    /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
888                       including sign.  Decimal takes the most space.  This
889                       isn't enough for octal.  If a width is specified we
890                       need more (which we allocate later). */
891                    if (width < MAX_LONG_CHARS)
892                        width = MAX_LONG_CHARS;
893                n += width;
894                /* XXX should allow for large precision here too. */
895                if (abuffersize < width)
896                    abuffersize = width;
897                break;
898            case 's':
899            {
900                /* UTF-8 */
901                const char *s = va_arg(count, const char*);
902                PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
903                if (!str)
904                    goto fail;
905                n += PyUnicode_GET_SIZE(str);
906                /* Remember the str and switch to the next slot */
907                *callresult++ = str;
908                break;
909            }
910            case 'U':
911            {
912                PyObject *obj = va_arg(count, PyObject *);
913                assert(obj && PyUnicode_Check(obj));
914                n += PyUnicode_GET_SIZE(obj);
915                break;
916            }
917            case 'V':
918            {
919                PyObject *obj = va_arg(count, PyObject *);
920                const char *str = va_arg(count, const char *);
921                PyObject *str_obj;
922                assert(obj || str);
923                assert(!obj || PyUnicode_Check(obj));
924                if (obj) {
925                    n += PyUnicode_GET_SIZE(obj);
926                    *callresult++ = NULL;
927                }
928                else {
929                    str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
930                    if (!str_obj)
931                        goto fail;
932                    n += PyUnicode_GET_SIZE(str_obj);
933                    *callresult++ = str_obj;
934                }
935                break;
936            }
937            case 'S':
938            {
939                PyObject *obj = va_arg(count, PyObject *);
940                PyObject *str;
941                assert(obj);
942                str = PyObject_Str(obj);
943                if (!str)
944                    goto fail;
945                n += PyUnicode_GET_SIZE(str);
946                /* Remember the str and switch to the next slot */
947                *callresult++ = str;
948                break;
949            }
950            case 'R':
951            {
952                PyObject *obj = va_arg(count, PyObject *);
953                PyObject *repr;
954                assert(obj);
955                repr = PyObject_Repr(obj);
956                if (!repr)
957                    goto fail;
958                n += PyUnicode_GET_SIZE(repr);
959                /* Remember the repr and switch to the next slot */
960                *callresult++ = repr;
961                break;
962            }
963            case 'A':
964            {
965                PyObject *obj = va_arg(count, PyObject *);
966                PyObject *ascii;
967                assert(obj);
968                ascii = PyObject_ASCII(obj);
969                if (!ascii)
970                    goto fail;
971                n += PyUnicode_GET_SIZE(ascii);
972                /* Remember the repr and switch to the next slot */
973                *callresult++ = ascii;
974                break;
975            }
976            case 'p':
977                (void) va_arg(count, int);
978                /* maximum 64-bit pointer representation:
979                 * 0xffffffffffffffff
980                 * so 19 characters is enough.
981                 * XXX I count 18 -- what's the extra for?
982                 */
983                n += 19;
984                break;
985            default:
986                /* if we stumble upon an unknown
987                   formatting code, copy the rest of
988                   the format string to the output
989                   string. (we cannot just skip the
990                   code, since there's no way to know
991                   what's in the argument list) */
992                n += strlen(p);
993                goto expand;
994            }
995        } else
996            n++;
997    }
998  expand:
999    if (abuffersize > ITEM_BUFFER_LEN) {
1000        /* add 1 for sprintf's trailing null byte */
1001        abuffer = PyObject_Malloc(abuffersize + 1);
1002        if (!abuffer) {
1003            PyErr_NoMemory();
1004            goto fail;
1005        }
1006        realbuffer = abuffer;
1007    }
1008    else
1009        realbuffer = buffer;
1010    /* step 4: fill the buffer */
1011    /* Since we've analyzed how much space we need for the worst case,
1012       we don't have to resize the string.
1013       There can be no errors beyond this point. */
1014    string = PyUnicode_FromUnicode(NULL, n);
1015    if (!string)
1016        goto fail;
1017
1018    s = PyUnicode_AS_UNICODE(string);
1019    callresult = callresults;
1020
1021    for (f = format; *f; f++) {
1022        if (*f == '%') {
1023            const char* p;
1024            int longflag;
1025            int longlongflag;
1026            int size_tflag;
1027
1028            p = f;
1029            zeropad = (f[1] == '0');
1030            f = parse_format_flags(f, &width, &precision,
1031                                   &longflag, &longlongflag, &size_tflag);
1032
1033            switch (*f) {
1034            case 'c':
1035            {
1036                int ordinal = va_arg(vargs, int);
1037#ifndef Py_UNICODE_WIDE
1038                if (ordinal > 0xffff) {
1039                    ordinal -= 0x10000;
1040                    *s++ = 0xD800 | (ordinal >> 10);
1041                    *s++ = 0xDC00 | (ordinal & 0x3FF);
1042                } else
1043#endif
1044                *s++ = ordinal;
1045                break;
1046            }
1047            case 'i':
1048            case 'd':
1049                makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1050                        width, precision, *f);
1051                if (longflag)
1052                    sprintf(realbuffer, fmt, va_arg(vargs, long));
1053#ifdef HAVE_LONG_LONG
1054                else if (longlongflag)
1055                    sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1056#endif
1057                else if (size_tflag)
1058                    sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1059                else
1060                    sprintf(realbuffer, fmt, va_arg(vargs, int));
1061                appendstring(realbuffer);
1062                break;
1063            case 'u':
1064                makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1065                        width, precision, 'u');
1066                if (longflag)
1067                    sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
1068#ifdef HAVE_LONG_LONG
1069                else if (longlongflag)
1070                    sprintf(realbuffer, fmt, va_arg(vargs,
1071                                                    unsigned PY_LONG_LONG));
1072#endif
1073                else if (size_tflag)
1074                    sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1075                else
1076                    sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1077                appendstring(realbuffer);
1078                break;
1079            case 'x':
1080                makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
1081                sprintf(realbuffer, fmt, va_arg(vargs, int));
1082                appendstring(realbuffer);
1083                break;
1084            case 's':
1085            {
1086                /* unused, since we already have the result */
1087                (void) va_arg(vargs, char *);
1088                Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1089                                PyUnicode_GET_SIZE(*callresult));
1090                s += PyUnicode_GET_SIZE(*callresult);
1091                /* We're done with the unicode()/repr() => forget it */
1092                Py_DECREF(*callresult);
1093                /* switch to next unicode()/repr() result */
1094                ++callresult;
1095                break;
1096            }
1097            case 'U':
1098            {
1099                PyObject *obj = va_arg(vargs, PyObject *);
1100                Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1101                Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1102                s += size;
1103                break;
1104            }
1105            case 'V':
1106            {
1107                PyObject *obj = va_arg(vargs, PyObject *);
1108                va_arg(vargs, const char *);
1109                if (obj) {
1110                    Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1111                    Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1112                    s += size;
1113                } else {
1114                    Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1115                                    PyUnicode_GET_SIZE(*callresult));
1116                    s += PyUnicode_GET_SIZE(*callresult);
1117                    Py_DECREF(*callresult);
1118                }
1119                ++callresult;
1120                break;
1121            }
1122            case 'S':
1123            case 'R':
1124            case 'A':
1125            {
1126                Py_UNICODE *ucopy;
1127                Py_ssize_t usize;
1128                Py_ssize_t upos;
1129                /* unused, since we already have the result */
1130                (void) va_arg(vargs, PyObject *);
1131                ucopy = PyUnicode_AS_UNICODE(*callresult);
1132                usize = PyUnicode_GET_SIZE(*callresult);
1133                for (upos = 0; upos<usize;)
1134                    *s++ = ucopy[upos++];
1135                /* We're done with the unicode()/repr() => forget it */
1136                Py_DECREF(*callresult);
1137                /* switch to next unicode()/repr() result */
1138                ++callresult;
1139                break;
1140            }
1141            case 'p':
1142                sprintf(buffer, "%p", va_arg(vargs, void*));
1143                /* %p is ill-defined:  ensure leading 0x. */
1144                if (buffer[1] == 'X')
1145                    buffer[1] = 'x';
1146                else if (buffer[1] != 'x') {
1147                    memmove(buffer+2, buffer, strlen(buffer)+1);
1148                    buffer[0] = '0';
1149                    buffer[1] = 'x';
1150                }
1151                appendstring(buffer);
1152                break;
1153            case '%':
1154                *s++ = '%';
1155                break;
1156            default:
1157                appendstring(p);
1158                goto end;
1159            }
1160        }
1161        else
1162            *s++ = *f;
1163    }
1164
1165  end:
1166    if (callresults)
1167        PyObject_Free(callresults);
1168    if (abuffer)
1169        PyObject_Free(abuffer);
1170    PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1171    return string;
1172  fail:
1173    if (callresults) {
1174        PyObject **callresult2 = callresults;
1175        while (callresult2 < callresult) {
1176            Py_XDECREF(*callresult2);
1177            ++callresult2;
1178        }
1179        PyObject_Free(callresults);
1180    }
1181    if (abuffer)
1182        PyObject_Free(abuffer);
1183    return NULL;
1184}
1185
1186#undef appendstring
1187
1188PyObject *
1189PyUnicode_FromFormat(const char *format, ...)
1190{
1191    PyObject* ret;
1192    va_list vargs;
1193
1194#ifdef HAVE_STDARG_PROTOTYPES
1195    va_start(vargs, format);
1196#else
1197    va_start(vargs);
1198#endif
1199    ret = PyUnicode_FromFormatV(format, vargs);
1200    va_end(vargs);
1201    return ret;
1202}
1203
1204/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1205   convert a Unicode object to a wide character string.
1206
1207   - If w is NULL: return the number of wide characters (including the nul
1208     character) required to convert the unicode object. Ignore size argument.
1209
1210   - Otherwise: return the number of wide characters (excluding the nul
1211     character) written into w. Write at most size wide characters (including
1212     the nul character). */
1213static Py_ssize_t
1214unicode_aswidechar(PyUnicodeObject *unicode,
1215                   wchar_t *w,
1216                   Py_ssize_t size)
1217{
1218#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
1219    Py_ssize_t res;
1220    if (w != NULL) {
1221        res = PyUnicode_GET_SIZE(unicode);
1222        if (size > res)
1223            size = res + 1;
1224        else
1225            res = size;
1226        memcpy(w, unicode->str, size * sizeof(wchar_t));
1227        return res;
1228    }
1229    else
1230        return PyUnicode_GET_SIZE(unicode) + 1;
1231#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4
1232    register const Py_UNICODE *u;
1233    const Py_UNICODE *uend;
1234    const wchar_t *worig, *wend;
1235    Py_ssize_t nchar;
1236
1237    u = PyUnicode_AS_UNICODE(unicode);
1238    uend = u + PyUnicode_GET_SIZE(unicode);
1239    if (w != NULL) {
1240        worig = w;
1241        wend = w + size;
1242        while (u != uend && w != wend) {
1243            if (0xD800 <= u[0] && u[0] <= 0xDBFF
1244                && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1245            {
1246                *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000;
1247                u += 2;
1248            }
1249            else {
1250                *w = *u;
1251                u++;
1252            }
1253            w++;
1254        }
1255        if (w != wend)
1256            *w = L'\0';
1257        return w - worig;
1258    }
1259    else {
1260        nchar = 1; /* nul character at the end */
1261        while (u != uend) {
1262            if (0xD800 <= u[0] && u[0] <= 0xDBFF
1263                && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1264                u += 2;
1265            else
1266                u++;
1267            nchar++;
1268        }
1269    }
1270    return nchar;
1271#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2
1272    register Py_UNICODE *u, *uend, ordinal;
1273    register Py_ssize_t i;
1274    wchar_t *worig, *wend;
1275    Py_ssize_t nchar;
1276
1277    u = PyUnicode_AS_UNICODE(unicode);
1278    uend = u + PyUnicode_GET_SIZE(u);
1279    if (w != NULL) {
1280        worig = w;
1281        wend = w + size;
1282        while (u != uend && w != wend) {
1283            ordinal = *u;
1284            if (ordinal > 0xffff) {
1285                ordinal -= 0x10000;
1286                *w++ = 0xD800 | (ordinal >> 10);
1287                *w++ = 0xDC00 | (ordinal & 0x3FF);
1288            }
1289            else
1290                *w++ = ordinal;
1291            u++;
1292        }
1293        if (w != wend)
1294            *w = 0;
1295        return w - worig;
1296    }
1297    else {
1298        nchar = 1; /* nul character */
1299        while (u != uend) {
1300            if (*u > 0xffff)
1301                nchar += 2;
1302            else
1303                nchar++;
1304            u++;
1305        }
1306        return nchar;
1307    }
1308#else
1309#  error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670"
1310#endif
1311}
1312
1313Py_ssize_t
1314PyUnicode_AsWideChar(PyObject *unicode,
1315                     wchar_t *w,
1316                     Py_ssize_t size)
1317{
1318    if (unicode == NULL) {
1319        PyErr_BadInternalCall();
1320        return -1;
1321    }
1322    return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
1323}
1324
1325wchar_t*
1326PyUnicode_AsWideCharString(PyObject *unicode,
1327                           Py_ssize_t *size)
1328{
1329    wchar_t* buffer;
1330    Py_ssize_t buflen;
1331
1332    if (unicode == NULL) {
1333        PyErr_BadInternalCall();
1334        return NULL;
1335    }
1336
1337    buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
1338    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
1339        PyErr_NoMemory();
1340        return NULL;
1341    }
1342
1343    buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1344    if (buffer == NULL) {
1345        PyErr_NoMemory();
1346        return NULL;
1347    }
1348    buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
1349    if (size != NULL)
1350        *size = buflen;
1351    return buffer;
1352}
1353
1354#endif
1355
1356PyObject *
1357PyUnicode_FromOrdinal(int ordinal)
1358{
1359    Py_UNICODE s[2];
1360
1361    if (ordinal < 0 || ordinal > 0x10ffff) {
1362        PyErr_SetString(PyExc_ValueError,
1363                        "chr() arg not in range(0x110000)");
1364        return NULL;
1365    }
1366
1367#ifndef Py_UNICODE_WIDE
1368    if (ordinal > 0xffff) {
1369        ordinal -= 0x10000;
1370        s[0] = 0xD800 | (ordinal >> 10);
1371        s[1] = 0xDC00 | (ordinal & 0x3FF);
1372        return PyUnicode_FromUnicode(s, 2);
1373    }
1374#endif
1375
1376    s[0] = (Py_UNICODE)ordinal;
1377    return PyUnicode_FromUnicode(s, 1);
1378}
1379
1380PyObject *
1381PyUnicode_FromObject(register PyObject *obj)
1382{
1383    /* XXX Perhaps we should make this API an alias of
1384       PyObject_Str() instead ?! */
1385    if (PyUnicode_CheckExact(obj)) {
1386        Py_INCREF(obj);
1387        return obj;
1388    }
1389    if (PyUnicode_Check(obj)) {
1390        /* For a Unicode subtype that's not a Unicode object,
1391           return a true Unicode object with the same data. */
1392        return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1393                                     PyUnicode_GET_SIZE(obj));
1394    }
1395    PyErr_Format(PyExc_TypeError,
1396                 "Can't convert '%.100s' object to str implicitly",
1397                 Py_TYPE(obj)->tp_name);
1398    return NULL;
1399}
1400
1401PyObject *
1402PyUnicode_FromEncodedObject(register PyObject *obj,
1403			    const char *encoding,
1404			    const char *errors)
1405{
1406    Py_buffer buffer;
1407    PyObject *v;
1408
1409    if (obj == NULL) {
1410        PyErr_BadInternalCall();
1411        return NULL;
1412    }
1413
1414    /* Decoding bytes objects is the most common case and should be fast */
1415    if (PyBytes_Check(obj)) {
1416        if (PyBytes_GET_SIZE(obj) == 0) {
1417            Py_INCREF(unicode_empty);
1418            v = (PyObject *) unicode_empty;
1419        }
1420        else {
1421            v = PyUnicode_Decode(
1422                    PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1423                    encoding, errors);
1424        }
1425        return v;
1426    }
1427
1428    if (PyUnicode_Check(obj)) {
1429        PyErr_SetString(PyExc_TypeError,
1430                        "decoding str is not supported");
1431        return NULL;
1432    }
1433
1434    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1435    if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1436        PyErr_Format(PyExc_TypeError,
1437                     "coercing to str: need bytes, bytearray "
1438                     "or buffer-like object, %.80s found",
1439                     Py_TYPE(obj)->tp_name);
1440        return NULL;
1441    }
1442
1443    if (buffer.len == 0) {
1444        Py_INCREF(unicode_empty);
1445        v = (PyObject *) unicode_empty;
1446    }
1447    else
1448        v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
1449
1450    PyBuffer_Release(&buffer);
1451    return v;
1452}
1453
/* Copy `encoding` into `lower`, converting upper case letters to lower case
   and replacing '_' with '-', so that spellings such as UTF_8 are caught.

   `lower` has room for `lower_len` chars including the terminating nul.
   Return 1 on success (`lower` is nul-terminated), 0 if `encoding` is longer
   than lower_len-1 characters (in which case `lower` is left unterminated). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src = encoding;
    char *dst = lower;
    /* Last slot of the output buffer, reserved for the nul terminator. */
    char *const limit = lower + (lower_len - 1);

    for (; *src != '\0'; src++, dst++) {
        char ch;

        if (dst == limit)
            return 0;

        ch = *src;
        if (Py_ISUPPER(ch))
            *dst = Py_TOLOWER(ch);
        else if (ch == '_')
            *dst = '-';
        else
            *dst = ch;
    }

    *dst = '\0';
    return 1;
}
1486
1487PyObject *
1488PyUnicode_Decode(const char *s,
1489		 Py_ssize_t size,
1490		 const char *encoding,
1491		 const char *errors)
1492{
1493    PyObject *buffer = NULL, *unicode;
1494    Py_buffer info;
1495    char lower[11];  /* Enough for any encoding shortcut */
1496
1497    if (encoding == NULL)
1498        return PyUnicode_DecodeUTF8(s, size, errors);
1499
1500    /* Shortcuts for common default encodings */
1501    if (normalize_encoding(encoding, lower, sizeof(lower))) {
1502        if ((strcmp(lower, "utf-8") == 0) ||
1503            (strcmp(lower, "utf8") == 0))
1504            return PyUnicode_DecodeUTF8(s, size, errors);
1505        else if ((strcmp(lower, "latin-1") == 0) ||
1506                 (strcmp(lower, "latin1") == 0) ||
1507                 (strcmp(lower, "iso-8859-1") == 0))
1508            return PyUnicode_DecodeLatin1(s, size, errors);
1509#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1510        else if (strcmp(lower, "mbcs") == 0)
1511            return PyUnicode_DecodeMBCS(s, size, errors);
1512#endif
1513        else if (strcmp(lower, "ascii") == 0)
1514            return PyUnicode_DecodeASCII(s, size, errors);
1515        else if (strcmp(lower, "utf-16") == 0)
1516            return PyUnicode_DecodeUTF16(s, size, errors, 0);
1517        else if (strcmp(lower, "utf-32") == 0)
1518            return PyUnicode_DecodeUTF32(s, size, errors, 0);
1519    }
1520
1521    /* Decode via the codec registry */
1522    buffer = NULL;
1523    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
1524        goto onError;
1525    buffer = PyMemoryView_FromBuffer(&info);
1526    if (buffer == NULL)
1527        goto onError;
1528    unicode = PyCodec_Decode(buffer, encoding, errors);
1529    if (unicode == NULL)
1530        goto onError;
1531    if (!PyUnicode_Check(unicode)) {
1532        PyErr_Format(PyExc_TypeError,
1533                     "decoder did not return a str object (type=%.400s)",
1534                     Py_TYPE(unicode)->tp_name);
1535        Py_DECREF(unicode);
1536        goto onError;
1537    }
1538    Py_DECREF(buffer);
1539    return unicode;
1540
1541  onError:
1542    Py_XDECREF(buffer);
1543    return NULL;
1544}
1545
1546PyObject *
1547PyUnicode_AsDecodedObject(PyObject *unicode,
1548			  const char *encoding,
1549			  const char *errors)
1550{
1551    PyObject *v;
1552
1553    if (!PyUnicode_Check(unicode)) {
1554        PyErr_BadArgument();
1555        goto onError;
1556    }
1557
1558    if (encoding == NULL)
1559        encoding = PyUnicode_GetDefaultEncoding();
1560
1561    /* Decode via the codec registry */
1562    v = PyCodec_Decode(unicode, encoding, errors);
1563    if (v == NULL)
1564        goto onError;
1565    return v;
1566
1567  onError:
1568    return NULL;
1569}
1570
1571PyObject *
1572PyUnicode_AsDecodedUnicode(PyObject *unicode,
1573			   const char *encoding,
1574			   const char *errors)
1575{
1576    PyObject *v;
1577
1578    if (!PyUnicode_Check(unicode)) {
1579        PyErr_BadArgument();
1580        goto onError;
1581    }
1582
1583    if (encoding == NULL)
1584        encoding = PyUnicode_GetDefaultEncoding();
1585
1586    /* Decode via the codec registry */
1587    v = PyCodec_Decode(unicode, encoding, errors);
1588    if (v == NULL)
1589        goto onError;
1590    if (!PyUnicode_Check(v)) {
1591        PyErr_Format(PyExc_TypeError,
1592                     "decoder did not return a str object (type=%.400s)",
1593                     Py_TYPE(v)->tp_name);
1594        Py_DECREF(v);
1595        goto onError;
1596    }
1597    return v;
1598
1599  onError:
1600    return NULL;
1601}
1602
1603PyObject *
1604PyUnicode_Encode(const Py_UNICODE *s,
1605		 Py_ssize_t size,
1606		 const char *encoding,
1607		 const char *errors)
1608{
1609    PyObject *v, *unicode;
1610
1611    unicode = PyUnicode_FromUnicode(s, size);
1612    if (unicode == NULL)
1613        return NULL;
1614    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1615    Py_DECREF(unicode);
1616    return v;
1617}
1618
1619PyObject *
1620PyUnicode_AsEncodedObject(PyObject *unicode,
1621			  const char *encoding,
1622			  const char *errors)
1623{
1624    PyObject *v;
1625
1626    if (!PyUnicode_Check(unicode)) {
1627        PyErr_BadArgument();
1628        goto onError;
1629    }
1630
1631    if (encoding == NULL)
1632        encoding = PyUnicode_GetDefaultEncoding();
1633
1634    /* Encode via the codec registry */
1635    v = PyCodec_Encode(unicode, encoding, errors);
1636    if (v == NULL)
1637        goto onError;
1638    return v;
1639
1640  onError:
1641    return NULL;
1642}
1643
1644PyObject *
1645PyUnicode_EncodeFSDefault(PyObject *unicode)
1646{
1647#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1648    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1649                                PyUnicode_GET_SIZE(unicode),
1650                                NULL);
1651#elif defined(__APPLE__)
1652    return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1653                                PyUnicode_GET_SIZE(unicode),
1654                                "surrogateescape");
1655#else
1656    if (Py_FileSystemDefaultEncoding) {
1657        return PyUnicode_AsEncodedString(unicode,
1658                                         Py_FileSystemDefaultEncoding,
1659                                         "surrogateescape");
1660    }
1661    else {
1662        /* locale encoding with surrogateescape */
1663        wchar_t *wchar;
1664        char *bytes;
1665        PyObject *bytes_obj;
1666        size_t error_pos;
1667
1668        wchar = PyUnicode_AsWideCharString(unicode, NULL);
1669        if (wchar == NULL)
1670            return NULL;
1671        bytes = _Py_wchar2char(wchar, &error_pos);
1672        if (bytes == NULL) {
1673            if (error_pos != (size_t)-1) {
1674                char *errmsg = strerror(errno);
1675                PyObject *exc = NULL;
1676                if (errmsg == NULL)
1677                    errmsg = "Py_wchar2char() failed";
1678                raise_encode_exception(&exc,
1679                    "filesystemencoding",
1680                    PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
1681                    error_pos, error_pos+1,
1682                    errmsg);
1683                Py_XDECREF(exc);
1684            }
1685            else
1686                PyErr_NoMemory();
1687            PyMem_Free(wchar);
1688            return NULL;
1689        }
1690        PyMem_Free(wchar);
1691
1692        bytes_obj = PyBytes_FromString(bytes);
1693        PyMem_Free(bytes);
1694        return bytes_obj;
1695    }
1696#endif
1697}
1698
1699PyObject *
1700PyUnicode_AsEncodedString(PyObject *unicode,
1701			  const char *encoding,
1702			  const char *errors)
1703{
1704    PyObject *v;
1705    char lower[11];  /* Enough for any encoding shortcut */
1706
1707    if (!PyUnicode_Check(unicode)) {
1708        PyErr_BadArgument();
1709        return NULL;
1710    }
1711
1712    if (encoding == NULL)
1713        return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1714                                    PyUnicode_GET_SIZE(unicode),
1715                                    errors);
1716
1717    /* Shortcuts for common default encodings */
1718    if (normalize_encoding(encoding, lower, sizeof(lower))) {
1719        if ((strcmp(lower, "utf-8") == 0) ||
1720            (strcmp(lower, "utf8") == 0))
1721            return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1722                                        PyUnicode_GET_SIZE(unicode),
1723                                        errors);
1724        else if ((strcmp(lower, "latin-1") == 0) ||
1725                 (strcmp(lower, "latin1") == 0) ||
1726                 (strcmp(lower, "iso-8859-1") == 0))
1727            return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1728                                          PyUnicode_GET_SIZE(unicode),
1729                                          errors);
1730#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1731        else if (strcmp(lower, "mbcs") == 0)
1732            return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1733                                        PyUnicode_GET_SIZE(unicode),
1734                                        errors);
1735#endif
1736        else if (strcmp(lower, "ascii") == 0)
1737            return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1738                                         PyUnicode_GET_SIZE(unicode),
1739                                         errors);
1740    }
1741
1742    /* Encode via the codec registry */
1743    v = PyCodec_Encode(unicode, encoding, errors);
1744    if (v == NULL)
1745        return NULL;
1746
1747    /* The normal path */
1748    if (PyBytes_Check(v))
1749        return v;
1750
1751    /* If the codec returns a buffer, raise a warning and convert to bytes */
1752    if (PyByteArray_Check(v)) {
1753        int error;
1754        PyObject *b;
1755
1756        error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
1757            "encoder %s returned bytearray instead of bytes",
1758            encoding);
1759        if (error) {
1760            Py_DECREF(v);
1761            return NULL;
1762        }
1763
1764        b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1765        Py_DECREF(v);
1766        return b;
1767    }
1768
1769    PyErr_Format(PyExc_TypeError,
1770                 "encoder did not return a bytes object (type=%.400s)",
1771                 Py_TYPE(v)->tp_name);
1772    Py_DECREF(v);
1773    return NULL;
1774}
1775
1776PyObject *
1777PyUnicode_AsEncodedUnicode(PyObject *unicode,
1778			   const char *encoding,
1779			   const char *errors)
1780{
1781    PyObject *v;
1782
1783    if (!PyUnicode_Check(unicode)) {
1784        PyErr_BadArgument();
1785        goto onError;
1786    }
1787
1788    if (encoding == NULL)
1789        encoding = PyUnicode_GetDefaultEncoding();
1790
1791    /* Encode via the codec registry */
1792    v = PyCodec_Encode(unicode, encoding, errors);
1793    if (v == NULL)
1794        goto onError;
1795    if (!PyUnicode_Check(v)) {
1796        PyErr_Format(PyExc_TypeError,
1797                     "encoder did not return an str object (type=%.400s)",
1798                     Py_TYPE(v)->tp_name);
1799        Py_DECREF(v);
1800        goto onError;
1801    }
1802    return v;
1803
1804  onError:
1805    return NULL;
1806}
1807
1808PyObject *
1809_PyUnicode_AsDefaultEncodedString(PyObject *unicode)
1810{
1811    PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1812    if (v)
1813        return v;
1814    v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1815                             PyUnicode_GET_SIZE(unicode),
1816                             NULL);
1817    if (!v)
1818        return NULL;
1819    ((PyUnicodeObject *)unicode)->defenc = v;
1820    return v;
1821}
1822
1823PyObject*
1824PyUnicode_DecodeFSDefault(const char *s) {
1825    Py_ssize_t size = (Py_ssize_t)strlen(s);
1826    return PyUnicode_DecodeFSDefaultAndSize(s, size);
1827}
1828
1829PyObject*
1830PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1831{
1832#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1833    return PyUnicode_DecodeMBCS(s, size, NULL);
1834#elif defined(__APPLE__)
1835    return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
1836#else
1837    /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1838       can be undefined. If it is case, decode using UTF-8. The following assumes
1839       that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1840       bootstrapping process where the codecs aren't ready yet.
1841    */
1842    if (Py_FileSystemDefaultEncoding) {
1843        return PyUnicode_Decode(s, size,
1844                                Py_FileSystemDefaultEncoding,
1845                                "surrogateescape");
1846    }
1847    else {
1848        /* locale encoding with surrogateescape */
1849        wchar_t *wchar;
1850        PyObject *unicode;
1851        size_t len;
1852
1853        if (s[size] != '\0' || size != strlen(s)) {
1854            PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1855            return NULL;
1856        }
1857
1858        wchar = _Py_char2wchar(s, &len);
1859        if (wchar == NULL)
1860            return PyErr_NoMemory();
1861
1862        unicode = PyUnicode_FromWideChar(wchar, len);
1863        PyMem_Free(wchar);
1864        return unicode;
1865    }
1866#endif
1867}
1868
1869
1870int
1871PyUnicode_FSConverter(PyObject* arg, void* addr)
1872{
1873    PyObject *output = NULL;
1874    Py_ssize_t size;
1875    void *data;
1876    if (arg == NULL) {
1877        Py_DECREF(*(PyObject**)addr);
1878        return 1;
1879    }
1880    if (PyBytes_Check(arg)) {
1881        output = arg;
1882        Py_INCREF(output);
1883    }
1884    else {
1885        arg = PyUnicode_FromObject(arg);
1886        if (!arg)
1887            return 0;
1888        output = PyUnicode_EncodeFSDefault(arg);
1889        Py_DECREF(arg);
1890        if (!output)
1891            return 0;
1892        if (!PyBytes_Check(output)) {
1893            Py_DECREF(output);
1894            PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1895            return 0;
1896        }
1897    }
1898    size = PyBytes_GET_SIZE(output);
1899    data = PyBytes_AS_STRING(output);
1900    if (size != strlen(data)) {
1901        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1902        Py_DECREF(output);
1903        return 0;
1904    }
1905    *(PyObject**)addr = output;
1906    return Py_CLEANUP_SUPPORTED;
1907}
1908
1909
1910int
1911PyUnicode_FSDecoder(PyObject* arg, void* addr)
1912{
1913    PyObject *output = NULL;
1914    Py_ssize_t size;
1915    void *data;
1916    if (arg == NULL) {
1917        Py_DECREF(*(PyObject**)addr);
1918        return 1;
1919    }
1920    if (PyUnicode_Check(arg)) {
1921        output = arg;
1922        Py_INCREF(output);
1923    }
1924    else {
1925        arg = PyBytes_FromObject(arg);
1926        if (!arg)
1927            return 0;
1928        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
1929                                                  PyBytes_GET_SIZE(arg));
1930        Py_DECREF(arg);
1931        if (!output)
1932            return 0;
1933        if (!PyUnicode_Check(output)) {
1934            Py_DECREF(output);
1935            PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
1936            return 0;
1937        }
1938    }
1939    size = PyUnicode_GET_SIZE(output);
1940    data = PyUnicode_AS_UNICODE(output);
1941    if (size != Py_UNICODE_strlen(data)) {
1942        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1943        Py_DECREF(output);
1944        return 0;
1945    }
1946    *(PyObject**)addr = output;
1947    return Py_CLEANUP_SUPPORTED;
1948}
1949
1950
1951char*
1952_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
1953{
1954    PyObject *bytes;
1955    if (!PyUnicode_Check(unicode)) {
1956        PyErr_BadArgument();
1957        return NULL;
1958    }
1959    bytes = _PyUnicode_AsDefaultEncodedString(unicode);
1960    if (bytes == NULL)
1961        return NULL;
1962    if (psize != NULL)
1963        *psize = PyBytes_GET_SIZE(bytes);
1964    return PyBytes_AS_STRING(bytes);
1965}
1966
1967char*
1968_PyUnicode_AsString(PyObject *unicode)
1969{
1970    return _PyUnicode_AsStringAndSize(unicode, NULL);
1971}
1972
1973Py_UNICODE *
1974PyUnicode_AsUnicode(PyObject *unicode)
1975{
1976    if (!PyUnicode_Check(unicode)) {
1977        PyErr_BadArgument();
1978        goto onError;
1979    }
1980    return PyUnicode_AS_UNICODE(unicode);
1981
1982  onError:
1983    return NULL;
1984}
1985
1986Py_ssize_t
1987PyUnicode_GetSize(PyObject *unicode)
1988{
1989    if (!PyUnicode_Check(unicode)) {
1990        PyErr_BadArgument();
1991        goto onError;
1992    }
1993    return PyUnicode_GET_SIZE(unicode);
1994
1995  onError:
1996    return -1;
1997}
1998
/* Return the name of the default encoding; the value is fixed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
2004
2005/* create or adjust a UnicodeDecodeError */
2006static void
2007make_decode_exception(PyObject **exceptionObject,
2008                      const char *encoding,
2009                      const char *input, Py_ssize_t length,
2010                      Py_ssize_t startpos, Py_ssize_t endpos,
2011                      const char *reason)
2012{
2013    if (*exceptionObject == NULL) {
2014        *exceptionObject = PyUnicodeDecodeError_Create(
2015            encoding, input, length, startpos, endpos, reason);
2016    }
2017    else {
2018        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2019            goto onError;
2020        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2021            goto onError;
2022        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2023            goto onError;
2024    }
2025    return;
2026
2027onError:
2028    Py_DECREF(*exceptionObject);
2029    *exceptionObject = NULL;
2030}
2031
2032/* error handling callback helper:
2033   build arguments, call the callback and check the arguments,
2034   if no exception occurred, copy the replacement to the output
2035   and adjust various state variables.
2036   return 0 on success, -1 on error
2037*/
2038
2039static int
2040unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
2041				 const char *encoding, const char *reason,
2042				 const char **input, const char **inend, Py_ssize_t *startinpos,
2043				 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2044				 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
2045{
2046    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
2047
2048    PyObject *restuple = NULL;
2049    PyObject *repunicode = NULL;
2050    Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
2051    Py_ssize_t insize;
2052    Py_ssize_t requiredsize;
2053    Py_ssize_t newpos;
2054    Py_UNICODE *repptr;
2055    PyObject *inputobj = NULL;
2056    Py_ssize_t repsize;
2057    int res = -1;
2058
2059    if (*errorHandler == NULL) {
2060        *errorHandler = PyCodec_LookupError(errors);
2061        if (*errorHandler == NULL)
2062            goto onError;
2063    }
2064
2065    make_decode_exception(exceptionObject,
2066        encoding,
2067        *input, *inend - *input,
2068        *startinpos, *endinpos,
2069        reason);
2070    if (*exceptionObject == NULL)
2071        goto onError;
2072
2073    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2074    if (restuple == NULL)
2075        goto onError;
2076    if (!PyTuple_Check(restuple)) {
2077        PyErr_SetString(PyExc_TypeError, &argparse[4]);
2078        goto onError;
2079    }
2080    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
2081        goto onError;
2082
2083    /* Copy back the bytes variables, which might have been modified by the
2084       callback */
2085    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2086    if (!inputobj)
2087        goto onError;
2088    if (!PyBytes_Check(inputobj)) {
2089        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
2090    }
2091    *input = PyBytes_AS_STRING(inputobj);
2092    insize = PyBytes_GET_SIZE(inputobj);
2093    *inend = *input + insize;
2094    /* we can DECREF safely, as the exception has another reference,
2095       so the object won't go away. */
2096    Py_DECREF(inputobj);
2097
2098    if (newpos<0)
2099        newpos = insize+newpos;
2100    if (newpos<0 || newpos>insize) {
2101        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2102        goto onError;
2103    }
2104
2105    /* need more space? (at least enough for what we
2106       have+the replacement+the rest of the string (starting
2107       at the new input position), so we won't have to check space
2108       when there are no errors in the rest of the string) */
2109    repptr = PyUnicode_AS_UNICODE(repunicode);
2110    repsize = PyUnicode_GET_SIZE(repunicode);
2111    requiredsize = *outpos + repsize + insize-newpos;
2112    if (requiredsize > outsize) {
2113        if (requiredsize<2*outsize)
2114            requiredsize = 2*outsize;
2115        if (_PyUnicode_Resize(output, requiredsize) < 0)
2116            goto onError;
2117        *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
2118    }
2119    *endinpos = newpos;
2120    *inptr = *input + newpos;
2121    Py_UNICODE_COPY(*outptr, repptr, repsize);
2122    *outptr += repsize;
2123    *outpos += repsize;
2124
2125    /* we made it! */
2126    res = 0;
2127
2128  onError:
2129    Py_XDECREF(restuple);
2130    return res;
2131}
2132
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Helper macros for the modified base-64 alphabet.  Each macro may
   evaluate its argument more than once, so pass side-effect-free
   expressions only. */

/* True if c is one of the 64 base-64 alphabet characters. */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ((c) >= '0' && (c) <= '9') ||          \
     ((c) >= 'A' && (c) <= 'Z') ||          \
     ((c) >= 'a' && (c) <= 'z'))

/* Value (0..63) of the base-64 character c.  The caller must have
   checked IS_BASE64(c); anything not otherwise matched maps to 63. */

#define FROM_BASE64(c)                                      \
    ((c) == '+' ? 62 :                                      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :               \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :          \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :          \
     63)

/* Base-64 character for the low 6 bits of n. */

#define TO_BASE64(n)                        \
    (("ABCDEFGHIJKLMNOPQRSTUVWXYZ"          \
      "abcdefghijklmnopqrstuvwxyz"          \
      "0123456789+/")[(n) & 0x3f])

/* DECODE_DIRECT: true if a byte found in UTF-7 input outside a shift
 * sequence stands for itself.  Decoding is permissive: every ASCII
 * byte qualifies except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)                    \
    ((c) != '+' && (c) <= 127)
2167
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is read-only lookup data, hence const.
 */

static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        : character code; only 1..127 can ever be direct.
 * directO  : non-zero if Set O characters are written as themselves.
 * directWS : non-zero if whitespace is written as itself.
 *
 * All three parameters are parenthesized in the expansion so that
 * compound argument expressions (e.g. `a || b`) keep their meaning.
 * c is evaluated several times; pass a side-effect-free expression. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
2213
2214PyObject *
2215PyUnicode_DecodeUTF7(const char *s,
2216		     Py_ssize_t size,
2217		     const char *errors)
2218{
2219    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
2220}
2221
2222/* The decoder.  The only state we preserve is our read position,
2223 * i.e. how many characters we have consumed.  So if we end in the
2224 * middle of a shift sequence we have to back off the read position
2225 * and the output to the beginning of the sequence, otherwise we lose
2226 * all the shift state (seen bits, number of bits seen, high
2227 * surrogate). */
2228
2229PyObject *
2230PyUnicode_DecodeUTF7Stateful(const char *s,
2231			     Py_ssize_t size,
2232			     const char *errors,
2233			     Py_ssize_t *consumed)
2234{
2235    const char *starts = s;
2236    Py_ssize_t startinpos;
2237    Py_ssize_t endinpos;
2238    Py_ssize_t outpos;
2239    const char *e;
2240    PyUnicodeObject *unicode;
2241    Py_UNICODE *p;
2242    const char *errmsg = "";
2243    int inShift = 0;
2244    Py_UNICODE *shiftOutStart;
2245    unsigned int base64bits = 0;
2246    unsigned long base64buffer = 0;
2247    Py_UNICODE surrogate = 0;
2248    PyObject *errorHandler = NULL;
2249    PyObject *exc = NULL;
2250
2251    unicode = _PyUnicode_New(size);
2252    if (!unicode)
2253        return NULL;
2254    if (size == 0) {
2255        if (consumed)
2256            *consumed = 0;
2257        return (PyObject *)unicode;
2258    }
2259
2260    p = unicode->str;
2261    shiftOutStart = p;
2262    e = s + size;
2263
2264    while (s < e) {
2265        Py_UNICODE ch;
2266      restart:
2267        ch = (unsigned char) *s;
2268
2269        if (inShift) { /* in a base-64 section */
2270            if (IS_BASE64(ch)) { /* consume a base-64 character */
2271                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2272                base64bits += 6;
2273                s++;
2274                if (base64bits >= 16) {
2275                    /* we have enough bits for a UTF-16 value */
2276                    Py_UNICODE outCh = (Py_UNICODE)
2277                                       (base64buffer >> (base64bits-16));
2278                    base64bits -= 16;
2279                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2280                    if (surrogate) {
2281                        /* expecting a second surrogate */
2282                        if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2283#ifdef Py_UNICODE_WIDE
2284                            *p++ = (((surrogate & 0x3FF)<<10)
2285                                    | (outCh & 0x3FF)) + 0x10000;
2286#else
2287                            *p++ = surrogate;
2288                            *p++ = outCh;
2289#endif
2290                            surrogate = 0;
2291                        }
2292                        else {
2293                            surrogate = 0;
2294                            errmsg = "second surrogate missing";
2295                            goto utf7Error;
2296                        }
2297                    }
2298                    else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2299                        /* first surrogate */
2300                        surrogate = outCh;
2301                    }
2302                    else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2303                        errmsg = "unexpected second surrogate";
2304                        goto utf7Error;
2305                    }
2306                    else {
2307                        *p++ = outCh;
2308                    }
2309                }
2310            }
2311            else { /* now leaving a base-64 section */
2312                inShift = 0;
2313                s++;
2314                if (surrogate) {
2315                    errmsg = "second surrogate missing at end of shift sequence";
2316                    goto utf7Error;
2317                }
2318                if (base64bits > 0) { /* left-over bits */
2319                    if (base64bits >= 6) {
2320                        /* We've seen at least one base-64 character */
2321                        errmsg = "partial character in shift sequence";
2322                        goto utf7Error;
2323                    }
2324                    else {
2325                        /* Some bits remain; they should be zero */
2326                        if (base64buffer != 0) {
2327                            errmsg = "non-zero padding bits in shift sequence";
2328                            goto utf7Error;
2329                        }
2330                    }
2331                }
2332                if (ch != '-') {
2333                    /* '-' is absorbed; other terminating
2334                       characters are preserved */
2335                    *p++ = ch;
2336                }
2337            }
2338        }
2339        else if ( ch == '+' ) {
2340            startinpos = s-starts;
2341            s++; /* consume '+' */
2342            if (s < e && *s == '-') { /* '+-' encodes '+' */
2343                s++;
2344                *p++ = '+';
2345            }
2346            else { /* begin base64-encoded section */
2347                inShift = 1;
2348                shiftOutStart = p;
2349                base64bits = 0;
2350            }
2351        }
2352        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
2353            *p++ = ch;
2354            s++;
2355        }
2356        else {
2357            startinpos = s-starts;
2358            s++;
2359            errmsg = "unexpected special character";
2360            goto utf7Error;
2361        }
2362        continue;
2363utf7Error:
2364        outpos = p-PyUnicode_AS_UNICODE(unicode);
2365        endinpos = s-starts;
2366        if (unicode_decode_call_errorhandler(
2367                errors, &errorHandler,
2368                "utf7", errmsg,
2369                &starts, &e, &startinpos, &endinpos, &exc, &s,
2370                &unicode, &outpos, &p))
2371            goto onError;
2372    }
2373
2374    /* end of string */
2375
2376    if (inShift && !consumed) { /* in shift sequence, no more to follow */
2377        /* if we're in an inconsistent state, that's an error */
2378        if (surrogate ||
2379                (base64bits >= 6) ||
2380                (base64bits > 0 && base64buffer != 0)) {
2381            outpos = p-PyUnicode_AS_UNICODE(unicode);
2382            endinpos = size;
2383            if (unicode_decode_call_errorhandler(
2384                    errors, &errorHandler,
2385                    "utf7", "unterminated shift sequence",
2386                    &starts, &e, &startinpos, &endinpos, &exc, &s,
2387                    &unicode, &outpos, &p))
2388                goto onError;
2389            if (s < e)
2390                goto restart;
2391        }
2392    }
2393
2394    /* return state */
2395    if (consumed) {
2396        if (inShift) {
2397            p = shiftOutStart; /* back off output */
2398            *consumed = startinpos;
2399        }
2400        else {
2401            *consumed = s-starts;
2402        }
2403    }
2404
2405    if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
2406        goto onError;
2407
2408    Py_XDECREF(errorHandler);
2409    Py_XDECREF(exc);
2410    return (PyObject *)unicode;
2411
2412  onError:
2413    Py_XDECREF(errorHandler);
2414    Py_XDECREF(exc);
2415    Py_DECREF(unicode);
2416    return NULL;
2417}
2418
2419
2420PyObject *
2421PyUnicode_EncodeUTF7(const Py_UNICODE *s,
2422		     Py_ssize_t size,
2423		     int base64SetO,
2424		     int base64WhiteSpace,
2425		     const char *errors)
2426{
2427    PyObject *v;
2428    /* It might be possible to tighten this worst case */
2429    Py_ssize_t allocated = 8 * size;
2430    int inShift = 0;
2431    Py_ssize_t i = 0;
2432    unsigned int base64bits = 0;
2433    unsigned long base64buffer = 0;
2434    char * out;
2435    char * start;
2436
2437    if (size == 0)
2438        return PyBytes_FromStringAndSize(NULL, 0);
2439
2440    if (allocated / 8 != size)
2441        return PyErr_NoMemory();
2442
2443    v = PyBytes_FromStringAndSize(NULL, allocated);
2444    if (v == NULL)
2445        return NULL;
2446
2447    start = out = PyBytes_AS_STRING(v);
2448    for (;i < size; ++i) {
2449        Py_UNICODE ch = s[i];
2450
2451        if (inShift) {
2452            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2453                /* shifting out */
2454                if (base64bits) { /* output remaining bits */
2455                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
2456                    base64buffer = 0;
2457                    base64bits = 0;
2458                }
2459                inShift = 0;
2460                /* Characters not in the BASE64 set implicitly unshift the sequence
2461                   so no '-' is required, except if the character is itself a '-' */
2462                if (IS_BASE64(ch) || ch == '-') {
2463                    *out++ = '-';
2464                }
2465                *out++ = (char) ch;
2466            }
2467            else {
2468                goto encode_char;
2469            }
2470        }
2471        else { /* not in a shift sequence */
2472            if (ch == '+') {
2473                *out++ = '+';
2474                        *out++ = '-';
2475            }
2476            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2477                *out++ = (char) ch;
2478            }
2479            else {
2480                *out++ = '+';
2481                inShift = 1;
2482                goto encode_char;
2483            }
2484        }
2485        continue;
2486encode_char:
2487#ifdef Py_UNICODE_WIDE
2488        if (ch >= 0x10000) {
2489            /* code first surrogate */
2490            base64bits += 16;
2491            base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2492            while (base64bits >= 6) {
2493                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2494                base64bits -= 6;
2495            }
2496            /* prepare second surrogate */
2497            ch =  0xDC00 | ((ch-0x10000) & 0x3FF);
2498        }
2499#endif
2500        base64bits += 16;
2501        base64buffer = (base64buffer << 16) | ch;
2502        while (base64bits >= 6) {
2503            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2504            base64bits -= 6;
2505        }
2506    }
2507    if (base64bits)
2508        *out++= TO_BASE64(base64buffer << (6-base64bits) );
2509    if (inShift)
2510        *out++ = '-';
2511    if (_PyBytes_Resize(&v, out - start) < 0)
2512        return NULL;
2513    return v;
2514}
2515
2516#undef IS_BASE64
2517#undef FROM_BASE64
2518#undef TO_BASE64
2519#undef DECODE_DIRECT
2520#undef ENCODE_DIRECT
2521
/* --- UTF-8 Codec -------------------------------------------------------- */

/* Map a UTF-8 encoded prefix (lead) byte to the length of the sequence
   it starts.  Zero means illegal prefix.  See RFC 3629 for details. */
static char utf8_code_length[256] = {
    /* 00-0F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 10-1F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 20-2F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 30-3F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 40-4F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 50-5F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 60-6F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 70-7F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 80-8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 90-9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* A0-AF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* B0-BF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* C0-C1 illegal, C2-CF */
                0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* D0-DF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* E0-EF */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* F0-F4, F5-FF illegal */
                4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
2545
2546PyObject *
2547PyUnicode_DecodeUTF8(const char *s,
2548		     Py_ssize_t size,
2549		     const char *errors)
2550{
2551    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2552}
2553
2554/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2555#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2556
2557/* Mask to quickly check whether a C 'long' contains a
2558   non-ASCII, UTF8-encoded char. */
2559#if (SIZEOF_LONG == 8)
2560# define ASCII_CHAR_MASK 0x8080808080808080L
2561#elif (SIZEOF_LONG == 4)
2562# define ASCII_CHAR_MASK 0x80808080L
2563#else
2564# error C 'long' size should be either 4 or 8!
2565#endif
2566
2567PyObject *
2568PyUnicode_DecodeUTF8Stateful(const char *s,
2569			     Py_ssize_t size,
2570			     const char *errors,
2571			     Py_ssize_t *consumed)
2572{
2573    const char *starts = s;
2574    int n;
2575    int k;
2576    Py_ssize_t startinpos;
2577    Py_ssize_t endinpos;
2578    Py_ssize_t outpos;
2579    const char *e, *aligned_end;
2580    PyUnicodeObject *unicode;
2581    Py_UNICODE *p;
2582    const char *errmsg = "";
2583    PyObject *errorHandler = NULL;
2584    PyObject *exc = NULL;
2585
2586    /* Note: size will always be longer than the resulting Unicode
2587       character count */
2588    unicode = _PyUnicode_New(size);
2589    if (!unicode)
2590        return NULL;
2591    if (size == 0) {
2592        if (consumed)
2593            *consumed = 0;
2594        return (PyObject *)unicode;
2595    }
2596
2597    /* Unpack UTF-8 encoded data */
2598    p = unicode->str;
2599    e = s + size;
2600    aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
2601
2602    while (s < e) {
2603        Py_UCS4 ch = (unsigned char)*s;
2604
2605        if (ch < 0x80) {
2606            /* Fast path for runs of ASCII characters. Given that common UTF-8
2607               input will consist of an overwhelming majority of ASCII
2608               characters, we try to optimize for this case by checking
2609               as many characters as a C 'long' can contain.
2610               First, check if we can do an aligned read, as most CPUs have
2611               a penalty for unaligned reads.
2612            */
2613            if (!((size_t) s & LONG_PTR_MASK)) {
2614                /* Help register allocation */
2615                register const char *_s = s;
2616                register Py_UNICODE *_p = p;
2617                while (_s < aligned_end) {
2618                    /* Read a whole long at a time (either 4 or 8 bytes),
2619                       and do a fast unrolled copy if it only contains ASCII
2620                       characters. */
2621                    unsigned long data = *(unsigned long *) _s;
2622                    if (data & ASCII_CHAR_MASK)
2623                        break;
2624                    _p[0] = (unsigned char) _s[0];
2625                    _p[1] = (unsigned char) _s[1];
2626                    _p[2] = (unsigned char) _s[2];
2627                    _p[3] = (unsigned char) _s[3];
2628#if (SIZEOF_LONG == 8)
2629                    _p[4] = (unsigned char) _s[4];
2630                    _p[5] = (unsigned char) _s[5];
2631                    _p[6] = (unsigned char) _s[6];
2632                    _p[7] = (unsigned char) _s[7];
2633#endif
2634                    _s += SIZEOF_LONG;
2635                    _p += SIZEOF_LONG;
2636                }
2637                s = _s;
2638                p = _p;
2639                if (s == e)
2640                    break;
2641                ch = (unsigned char)*s;
2642            }
2643        }
2644
2645        if (ch < 0x80) {
2646            *p++ = (Py_UNICODE)ch;
2647            s++;
2648            continue;
2649        }
2650
2651        n = utf8_code_length[ch];
2652
2653        if (s + n > e) {
2654            if (consumed)
2655                break;
2656            else {
2657                errmsg = "unexpected end of data";
2658                startinpos = s-starts;
2659                endinpos = startinpos+1;
2660                for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2661                    endinpos++;
2662                goto utf8Error;
2663            }
2664        }
2665
2666        switch (n) {
2667
2668        case 0:
2669            errmsg = "invalid start byte";
2670            startinpos = s-starts;
2671            endinpos = startinpos+1;
2672            goto utf8Error;
2673
2674        case 1:
2675            errmsg = "internal error";
2676            startinpos = s-starts;
2677            endinpos = startinpos+1;
2678            goto utf8Error;
2679
2680        case 2:
2681            if ((s[1] & 0xc0) != 0x80) {
2682                errmsg = "invalid continuation byte";
2683                startinpos = s-starts;
2684                endinpos = startinpos + 1;
2685                goto utf8Error;
2686            }
2687            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
2688            assert ((ch > 0x007F) && (ch <= 0x07FF));
2689            *p++ = (Py_UNICODE)ch;
2690            break;
2691
2692        case 3:
2693            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2694               will result in surrogates in range d800-dfff. Surrogates are
2695               not valid UTF-8 so they are rejected.
2696               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2697               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
2698            if ((s[1] & 0xc0) != 0x80 ||
2699                (s[2] & 0xc0) != 0x80 ||
2700                ((unsigned char)s[0] == 0xE0 &&
2701                 (unsigned char)s[1] < 0xA0) ||
2702                ((unsigned char)s[0] == 0xED &&
2703                 (unsigned char)s[1] > 0x9F)) {
2704                errmsg = "invalid continuation byte";
2705                startinpos = s-starts;
2706                endinpos = startinpos + 1;
2707
2708                /* if s[1] first two bits are 1 and 0, then the invalid
2709                   continuation byte is s[2], so increment endinpos by 1,
2710                   if not, s[1] is invalid and endinpos doesn't need to
2711                   be incremented. */
2712                if ((s[1] & 0xC0) == 0x80)
2713                    endinpos++;
2714                goto utf8Error;
2715            }
2716            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
2717            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2718            *p++ = (Py_UNICODE)ch;
2719            break;
2720
2721        case 4:
2722            if ((s[1] & 0xc0) != 0x80 ||
2723                (s[2] & 0xc0) != 0x80 ||
2724                (s[3] & 0xc0) != 0x80 ||
2725                ((unsigned char)s[0] == 0xF0 &&
2726                 (unsigned char)s[1] < 0x90) ||
2727                ((unsigned char)s[0] == 0xF4 &&
2728                 (unsigned char)s[1] > 0x8F)) {
2729                errmsg = "invalid continuation byte";
2730                startinpos = s-starts;
2731                endinpos = startinpos + 1;
2732                if ((s[1] & 0xC0) == 0x80) {
2733                    endinpos++;
2734                    if ((s[2] & 0xC0) == 0x80)
2735                        endinpos++;
2736                }
2737                goto utf8Error;
2738            }
2739            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2740                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2741            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2742
2743#ifdef Py_UNICODE_WIDE
2744            *p++ = (Py_UNICODE)ch;
2745#else
2746            /*  compute and append the two surrogates: */
2747
2748            /*  translate from 10000..10FFFF to 0..FFFF */
2749            ch -= 0x10000;
2750
2751            /*  high surrogate = top 10 bits added to D800 */
2752            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
2753
2754            /*  low surrogate = bottom 10 bits added to DC00 */
2755            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
2756#endif
2757            break;
2758        }
2759        s += n;
2760        continue;
2761
2762      utf8Error:
2763        outpos = p-PyUnicode_AS_UNICODE(unicode);
2764        if (unicode_decode_call_errorhandler(
2765                errors, &errorHandler,
2766                "utf8", errmsg,
2767                &starts, &e, &startinpos, &endinpos, &exc, &s,
2768                &unicode, &outpos, &p))
2769            goto onError;
2770        aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
2771    }
2772    if (consumed)
2773        *consumed = s-starts;
2774
2775    /* Adjust length */
2776    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2777        goto onError;
2778
2779    Py_XDECREF(errorHandler);
2780    Py_XDECREF(exc);
2781    return (PyObject *)unicode;
2782
2783  onError:
2784    Py_XDECREF(errorHandler);
2785    Py_XDECREF(exc);
2786    Py_DECREF(unicode);
2787    return NULL;
2788}
2789
2790#undef ASCII_CHAR_MASK
2791
2792#ifdef __APPLE__
2793
2794/* Simplified UTF-8 decoder using surrogateescape error handler,
2795   used to decode the command line arguments on Mac OS X. */
2796
2797wchar_t*
2798_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
2799{
2800    int n;
2801    const char *e;
2802    wchar_t *unicode, *p;
2803
2804    /* Note: size will always be longer than the resulting Unicode
2805       character count */
2806    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
2807        PyErr_NoMemory();
2808        return NULL;
2809    }
2810    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
2811    if (!unicode)
2812        return NULL;
2813
2814    /* Unpack UTF-8 encoded data */
2815    p = unicode;
2816    e = s + size;
2817    while (s < e) {
2818        Py_UCS4 ch = (unsigned char)*s;
2819
2820        if (ch < 0x80) {
2821            *p++ = (wchar_t)ch;
2822            s++;
2823            continue;
2824        }
2825
2826        n = utf8_code_length[ch];
2827        if (s + n > e) {
2828            goto surrogateescape;
2829        }
2830
2831        switch (n) {
2832        case 0:
2833        case 1:
2834            goto surrogateescape;
2835
2836        case 2:
2837            if ((s[1] & 0xc0) != 0x80)
2838                goto surrogateescape;
2839            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
2840            assert ((ch > 0x007F) && (ch <= 0x07FF));
2841            *p++ = (wchar_t)ch;
2842            break;
2843
2844        case 3:
2845            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2846               will result in surrogates in range d800-dfff. Surrogates are
2847               not valid UTF-8 so they are rejected.
2848               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2849               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
2850            if ((s[1] & 0xc0) != 0x80 ||
2851                (s[2] & 0xc0) != 0x80 ||
2852                ((unsigned char)s[0] == 0xE0 &&
2853                 (unsigned char)s[1] < 0xA0) ||
2854                ((unsigned char)s[0] == 0xED &&
2855                 (unsigned char)s[1] > 0x9F)) {
2856
2857                goto surrogateescape;
2858            }
2859            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
2860            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2861            *p++ = (Py_UNICODE)ch;
2862            break;
2863
2864        case 4:
2865            if ((s[1] & 0xc0) != 0x80 ||
2866                (s[2] & 0xc0) != 0x80 ||
2867                (s[3] & 0xc0) != 0x80 ||
2868                ((unsigned char)s[0] == 0xF0 &&
2869                 (unsigned char)s[1] < 0x90) ||
2870                ((unsigned char)s[0] == 0xF4 &&
2871                 (unsigned char)s[1] > 0x8F)) {
2872                goto surrogateescape;
2873            }
2874            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2875                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2876            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2877
2878#if SIZEOF_WCHAR_T == 4
2879            *p++ = (wchar_t)ch;
2880#else
2881            /*  compute and append the two surrogates: */
2882
2883            /*  translate from 10000..10FFFF to 0..FFFF */
2884            ch -= 0x10000;
2885
2886            /*  high surrogate = top 10 bits added to D800 */
2887            *p++ = (wchar_t)(0xD800 + (ch >> 10));
2888
2889            /*  low surrogate = bottom 10 bits added to DC00 */
2890            *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
2891#endif
2892            break;
2893        }
2894        s += n;
2895        continue;
2896
2897      surrogateescape:
2898        *p++ = 0xDC00 + ch;
2899        s++;
2900    }
2901    *p = L'\0';
2902    return unicode;
2903}
2904
2905#endif /* __APPLE__ */
2906
2907/* Allocation strategy:  if the string is short, convert into a stack buffer
2908   and allocate exactly as much space needed at the end.  Else allocate the
2909   maximum possible needed (4 result bytes per Unicode character), and return
2910   the excess memory at the end.
2911*/
2912PyObject *
2913PyUnicode_EncodeUTF8(const Py_UNICODE *s,
2914                     Py_ssize_t size,
2915                     const char *errors)
2916{
2917#define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
2918
2919    Py_ssize_t i;                /* index into s of next input byte */
2920    PyObject *result;            /* result string object */
2921    char *p;                     /* next free byte in output buffer */
2922    Py_ssize_t nallocated;      /* number of result bytes allocated */
2923    Py_ssize_t nneeded;            /* number of result bytes needed */
2924    char stackbuf[MAX_SHORT_UNICHARS * 4];
2925    PyObject *errorHandler = NULL;
2926    PyObject *exc = NULL;
2927
2928    assert(s != NULL);
2929    assert(size >= 0);
2930
2931    if (size <= MAX_SHORT_UNICHARS) {
2932        /* Write into the stack buffer; nallocated can't overflow.
2933         * At the end, we'll allocate exactly as much heap space as it
2934         * turns out we need.
2935         */
2936        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2937        result = NULL;   /* will allocate after we're done */
2938        p = stackbuf;
2939    }
2940    else {
2941        /* Overallocate on the heap, and give the excess back at the end. */
2942        nallocated = size * 4;
2943        if (nallocated / 4 != size)  /* overflow! */
2944            return PyErr_NoMemory();
2945        result = PyBytes_FromStringAndSize(NULL, nallocated);
2946        if (result == NULL)
2947            return NULL;
2948        p = PyBytes_AS_STRING(result);
2949    }
2950
2951    for (i = 0; i < size;) {
2952        Py_UCS4 ch = s[i++];
2953
2954        if (ch < 0x80)
2955            /* Encode ASCII */
2956            *p++ = (char) ch;
2957
2958        else if (ch < 0x0800) {
2959            /* Encode Latin-1 */
2960            *p++ = (char)(0xc0 | (ch >> 6));
2961            *p++ = (char)(0x80 | (ch & 0x3f));
2962        } else if (0xD800 <= ch && ch <= 0xDFFF) {
2963#ifndef Py_UNICODE_WIDE
2964            /* Special case: check for high and low surrogate */
2965            if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2966                Py_UCS4 ch2 = s[i];
2967                /* Combine the two surrogates to form a UCS4 value */
2968                ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2969                i++;
2970
2971                /* Encode UCS4 Unicode ordinals */
2972                *p++ = (char)(0xf0 | (ch >> 18));
2973                *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2974                *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2975                *p++ = (char)(0x80 | (ch & 0x3f));
2976            } else {
2977#endif
2978                Py_ssize_t newpos;
2979                PyObject *rep;
2980                Py_ssize_t repsize, k;
2981                rep = unicode_encode_call_errorhandler
2982                    (errors, &errorHandler, "utf-8", "surrogates not allowed",
2983                     s, size, &exc, i-1, i, &newpos);
2984                if (!rep)
2985                    goto error;
2986
2987                if (PyBytes_Check(rep))
2988                    repsize = PyBytes_GET_SIZE(rep);
2989                else
2990                    repsize = PyUnicode_GET_SIZE(rep);
2991
2992                if (repsize > 4) {
2993                    Py_ssize_t offset;
2994
2995                    if (result == NULL)
2996                        offset = p - stackbuf;
2997                    else
2998                        offset = p - PyBytes_AS_STRING(result);
2999
3000                    if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
3001                        /* integer overflow */
3002                        PyErr_NoMemory();
3003                        goto error;
3004                    }
3005                    nallocated += repsize - 4;
3006                    if (result != NULL) {
3007                        if (_PyBytes_Resize(&result, nallocated) < 0)
3008                            goto error;
3009                    } else {
3010                        result = PyBytes_FromStringAndSize(NULL, nallocated);
3011                        if (result == NULL)
3012                            goto error;
3013                        Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
3014                    }
3015                    p = PyBytes_AS_STRING(result) + offset;
3016                }
3017
3018                if (PyBytes_Check(rep)) {
3019                    char *prep = PyBytes_AS_STRING(rep);
3020                    for(k = repsize; k > 0; k--)
3021                        *p++ = *prep++;
3022                } else /* rep is unicode */ {
3023                    Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
3024                    Py_UNICODE c;
3025
3026                    for(k=0; k<repsize; k++) {
3027                        c = prep[k];
3028                        if (0x80 <= c) {
3029                            raise_encode_exception(&exc, "utf-8", s, size,
3030                                                   i-1, i, "surrogates not allowed");
3031                            goto error;
3032                        }
3033                        *p++ = (char)prep[k];
3034                    }
3035                }
3036                Py_DECREF(rep);
3037#ifndef Py_UNICODE_WIDE
3038            }
3039#endif
3040        } else if (ch < 0x10000) {
3041            *p++ = (char)(0xe0 | (ch >> 12));
3042            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3043            *p++ = (char)(0x80 | (ch & 0x3f));
3044        } else /* ch >= 0x10000 */ {
3045            /* Encode UCS4 Unicode ordinals */
3046            *p++ = (char)(0xf0 | (ch >> 18));
3047            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
3048            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3049            *p++ = (char)(0x80 | (ch & 0x3f));
3050        }
3051    }
3052
3053    if (result == NULL) {
3054        /* This was stack allocated. */
3055        nneeded = p - stackbuf;
3056        assert(nneeded <= nallocated);
3057        result = PyBytes_FromStringAndSize(stackbuf, nneeded);
3058    }
3059    else {
3060        /* Cut back to size actually needed. */
3061        nneeded = p - PyBytes_AS_STRING(result);
3062        assert(nneeded <= nallocated);
3063        _PyBytes_Resize(&result, nneeded);
3064    }
3065    Py_XDECREF(errorHandler);
3066    Py_XDECREF(exc);
3067    return result;
3068 error:
3069    Py_XDECREF(errorHandler);
3070    Py_XDECREF(exc);
3071    Py_XDECREF(result);
3072    return NULL;
3073
3074#undef MAX_SHORT_UNICHARS
3075}
3076
3077PyObject *
3078PyUnicode_AsUTF8String(PyObject *unicode)
3079{
3080    if (!PyUnicode_Check(unicode)) {
3081        PyErr_BadArgument();
3082        return NULL;
3083    }
3084    return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
3085                                PyUnicode_GET_SIZE(unicode),
3086                                NULL);
3087}
3088
3089/* --- UTF-32 Codec ------------------------------------------------------- */
3090
3091PyObject *
3092PyUnicode_DecodeUTF32(const char *s,
3093                      Py_ssize_t size,
3094                      const char *errors,
3095                      int *byteorder)
3096{
3097    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
3098}
3099
3100PyObject *
3101PyUnicode_DecodeUTF32Stateful(const char *s,
3102                              Py_ssize_t size,
3103                              const char *errors,
3104                              int *byteorder,
3105                              Py_ssize_t *consumed)
3106{
3107    const char *starts = s;
3108    Py_ssize_t startinpos;
3109    Py_ssize_t endinpos;
3110    Py_ssize_t outpos;
3111    PyUnicodeObject *unicode;
3112    Py_UNICODE *p;
3113#ifndef Py_UNICODE_WIDE
3114    int pairs = 0;
3115    const unsigned char *qq;
3116#else
3117    const int pairs = 0;
3118#endif
3119    const unsigned char *q, *e;
3120    int bo = 0;       /* assume native ordering by default */
3121    const char *errmsg = "";
3122    /* Offsets from q for retrieving bytes in the right order. */
3123#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3124    int iorder[] = {0, 1, 2, 3};
3125#else
3126    int iorder[] = {3, 2, 1, 0};
3127#endif
3128    PyObject *errorHandler = NULL;
3129    PyObject *exc = NULL;
3130
3131    q = (unsigned char *)s;
3132    e = q + size;
3133
3134    if (byteorder)
3135        bo = *byteorder;
3136
3137    /* Check for BOM marks (U+FEFF) in the input and adjust current
3138       byte order setting accordingly. In native mode, the leading BOM
3139       mark is skipped, in all other modes, it is copied to the output
3140       stream as-is (giving a ZWNBSP character). */
3141    if (bo == 0) {
3142        if (size >= 4) {
3143            const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
3144                (q[iorder[1]] << 8) | q[iorder[0]];
3145#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3146            if (bom == 0x0000FEFF) {
3147                q += 4;
3148                bo = -1;
3149            }
3150            else if (bom == 0xFFFE0000) {
3151                q += 4;
3152                bo = 1;
3153            }
3154#else
3155            if (bom == 0x0000FEFF) {
3156                q += 4;
3157                bo = 1;
3158            }
3159            else if (bom == 0xFFFE0000) {
3160                q += 4;
3161                bo = -1;
3162            }
3163#endif
3164        }
3165    }
3166
3167    if (bo == -1) {
3168        /* force LE */
3169        iorder[0] = 0;
3170        iorder[1] = 1;
3171        iorder[2] = 2;
3172        iorder[3] = 3;
3173    }
3174    else if (bo == 1) {
3175        /* force BE */
3176        iorder[0] = 3;
3177        iorder[1] = 2;
3178        iorder[2] = 1;
3179        iorder[3] = 0;
3180    }
3181
3182    /* On narrow builds we split characters outside the BMP into two
3183       codepoints => count how much extra space we need. */
3184#ifndef Py_UNICODE_WIDE
3185    for (qq = q; qq < e; qq += 4)
3186        if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
3187            pairs++;
3188#endif
3189
3190    /* This might be one to much, because of a BOM */
3191    unicode = _PyUnicode_New((size+3)/4+pairs);
3192    if (!unicode)
3193        return NULL;
3194    if (size == 0)
3195        return (PyObject *)unicode;
3196
3197    /* Unpack UTF-32 encoded data */
3198    p = unicode->str;
3199
3200    while (q < e) {
3201        Py_UCS4 ch;
3202        /* remaining bytes at the end? (size should be divisible by 4) */
3203        if (e-q<4) {
3204            if (consumed)
3205                break;
3206            errmsg = "truncated data";
3207            startinpos = ((const char *)q)-starts;
3208            endinpos = ((const char *)e)-starts;
3209            goto utf32Error;
3210            /* The remaining input chars are ignored if the callback
3211               chooses to skip the input */
3212        }
3213        ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
3214            (q[iorder[1]] << 8) | q[iorder[0]];
3215
3216        if (ch >= 0x110000)
3217        {
3218            errmsg = "codepoint not in range(0x110000)";
3219            startinpos = ((const char *)q)-starts;
3220            endinpos = startinpos+4;
3221            goto utf32Error;
3222        }
3223#ifndef Py_UNICODE_WIDE
3224        if (ch >= 0x10000)
3225        {
3226            *p++ = 0xD800 | ((ch-0x10000) >> 10);
3227            *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
3228        }
3229        else
3230#endif
3231            *p++ = ch;
3232        q += 4;
3233        continue;
3234      utf32Error:
3235        outpos = p-PyUnicode_AS_UNICODE(unicode);
3236        if (unicode_decode_call_errorhandler(
3237                errors, &errorHandler,
3238                "utf32", errmsg,
3239                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
3240                &unicode, &outpos, &p))
3241            goto onError;
3242    }
3243
3244    if (byteorder)
3245        *byteorder = bo;
3246
3247    if (consumed)
3248        *consumed = (const char *)q-starts;
3249
3250    /* Adjust length */
3251    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
3252        goto onError;
3253
3254    Py_XDECREF(errorHandler);
3255    Py_XDECREF(exc);
3256    return (PyObject *)unicode;
3257
3258  onError:
3259    Py_DECREF(unicode);
3260    Py_XDECREF(errorHandler);
3261    Py_XDECREF(exc);
3262    return NULL;
3263}
3264
3265PyObject *
3266PyUnicode_EncodeUTF32(const Py_UNICODE *s,
3267                      Py_ssize_t size,
3268                      const char *errors,
3269                      int byteorder)
3270{
3271    PyObject *v;
3272    unsigned char *p;
3273    Py_ssize_t nsize, bytesize;
3274#ifndef Py_UNICODE_WIDE
3275    Py_ssize_t i, pairs;
3276#else
3277    const int pairs = 0;
3278#endif
3279    /* Offsets from p for storing byte pairs in the right order. */
3280#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3281    int iorder[] = {0, 1, 2, 3};
3282#else
3283    int iorder[] = {3, 2, 1, 0};
3284#endif
3285
3286#define STORECHAR(CH)                           \
3287    do {                                        \
3288        p[iorder[3]] = ((CH) >> 24) & 0xff;     \
3289        p[iorder[2]] = ((CH) >> 16) & 0xff;     \
3290        p[iorder[1]] = ((CH) >> 8) & 0xff;      \
3291        p[iorder[0]] = (CH) & 0xff;             \
3292        p += 4;                                 \
3293    } while(0)
3294
3295    /* In narrow builds we can output surrogate pairs as one codepoint,
3296       so we need less space. */
3297#ifndef Py_UNICODE_WIDE
3298    for (i = pairs = 0; i < size-1; i++)
3299        if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
3300            0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
3301            pairs++;
3302#endif
3303    nsize = (size - pairs + (byteorder == 0));
3304    bytesize = nsize * 4;
3305    if (bytesize / 4 != nsize)
3306        return PyErr_NoMemory();
3307    v = PyBytes_FromStringAndSize(NULL, bytesize);
3308    if (v == NULL)
3309        return NULL;
3310
3311    p = (unsigned char *)PyBytes_AS_STRING(v);
3312    if (byteorder == 0)
3313        STORECHAR(0xFEFF);
3314    if (size == 0)
3315        goto done;
3316
3317    if (byteorder == -1) {
3318        /* force LE */
3319        iorder[0] = 0;
3320        iorder[1] = 1;
3321        iorder[2] = 2;
3322        iorder[3] = 3;
3323    }
3324    else if (byteorder == 1) {
3325        /* force BE */
3326        iorder[0] = 3;
3327        iorder[1] = 2;
3328        iorder[2] = 1;
3329        iorder[3] = 0;
3330    }
3331
3332    while (size-- > 0) {
3333        Py_UCS4 ch = *s++;
3334#ifndef Py_UNICODE_WIDE
3335        if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
3336            Py_UCS4 ch2 = *s;
3337            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3338                ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3339                s++;
3340                size--;
3341            }
3342        }
3343#endif
3344        STORECHAR(ch);
3345    }
3346
3347  done:
3348    return v;
3349#undef STORECHAR
3350}
3351
3352PyObject *
3353PyUnicode_AsUTF32String(PyObject *unicode)
3354{
3355    if (!PyUnicode_Check(unicode)) {
3356        PyErr_BadArgument();
3357        return NULL;
3358    }
3359    return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
3360                                 PyUnicode_GET_SIZE(unicode),
3361                                 NULL,
3362                                 0);
3363}
3364
3365/* --- UTF-16 Codec ------------------------------------------------------- */
3366
3367PyObject *
3368PyUnicode_DecodeUTF16(const char *s,
3369                      Py_ssize_t size,
3370                      const char *errors,
3371                      int *byteorder)
3372{
3373    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3374}
3375
3376/* Two masks for fast checking of whether a C 'long' may contain
3377   UTF16-encoded surrogate characters. This is an efficient heuristic,
3378   assuming that non-surrogate characters with a code point >= 0x8000 are
3379   rare in most input.
3380   FAST_CHAR_MASK is used when the input is in native byte ordering,
3381   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
3382*/
3383#if (SIZEOF_LONG == 8)
3384# define FAST_CHAR_MASK         0x8000800080008000L
3385# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3386#elif (SIZEOF_LONG == 4)
3387# define FAST_CHAR_MASK         0x80008000L
3388# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3389#else
3390# error C 'long' size should be either 4 or 8!
3391#endif
3392
3393PyObject *
3394PyUnicode_DecodeUTF16Stateful(const char *s,
3395                              Py_ssize_t size,
3396                              const char *errors,
3397                              int *byteorder,
3398                              Py_ssize_t *consumed)
3399{
3400    const char *starts = s;
3401    Py_ssize_t startinpos;
3402    Py_ssize_t endinpos;
3403    Py_ssize_t outpos;
3404    PyUnicodeObject *unicode;
3405    Py_UNICODE *p;
3406    const unsigned char *q, *e, *aligned_end;
3407    int bo = 0;       /* assume native ordering by default */
3408    int native_ordering = 0;
3409    const char *errmsg = "";
3410    /* Offsets from q for retrieving byte pairs in the right order. */
3411#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3412    int ihi = 1, ilo = 0;
3413#else
3414    int ihi = 0, ilo = 1;
3415#endif
3416    PyObject *errorHandler = NULL;
3417    PyObject *exc = NULL;
3418
3419    /* Note: size will always be longer than the resulting Unicode
3420       character count */
3421    unicode = _PyUnicode_New(size);
3422    if (!unicode)
3423        return NULL;
3424    if (size == 0)
3425        return (PyObject *)unicode;
3426
3427    /* Unpack UTF-16 encoded data */
3428    p = unicode->str;
3429    q = (unsigned char *)s;
3430    e = q + size - 1;
3431
3432    if (byteorder)
3433        bo = *byteorder;
3434
3435    /* Check for BOM marks (U+FEFF) in the input and adjust current
3436       byte order setting accordingly. In native mode, the leading BOM
3437       mark is skipped, in all other modes, it is copied to the output
3438       stream as-is (giving a ZWNBSP character). */
3439    if (bo == 0) {
3440        if (size >= 2) {
3441            const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
3442#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3443            if (bom == 0xFEFF) {
3444                q += 2;
3445                bo = -1;
3446            }
3447            else if (bom == 0xFFFE) {
3448                q += 2;
3449                bo = 1;
3450            }
3451#else
3452            if (bom == 0xFEFF) {
3453                q += 2;
3454                bo = 1;
3455            }
3456            else if (bom == 0xFFFE) {
3457                q += 2;
3458                bo = -1;
3459            }
3460#endif
3461        }
3462    }
3463
3464    if (bo == -1) {
3465        /* force LE */
3466        ihi = 1;
3467        ilo = 0;
3468    }
3469    else if (bo == 1) {
3470        /* force BE */
3471        ihi = 0;
3472        ilo = 1;
3473    }
3474#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3475    native_ordering = ilo < ihi;
3476#else
3477    native_ordering = ilo > ihi;
3478#endif
3479
3480    aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
3481    while (q < e) {
3482        Py_UNICODE ch;
3483        /* First check for possible aligned read of a C 'long'. Unaligned
3484           reads are more expensive, better to defer to another iteration. */
3485        if (!((size_t) q & LONG_PTR_MASK)) {
3486            /* Fast path for runs of non-surrogate chars. */
3487            register const unsigned char *_q = q;
3488            Py_UNICODE *_p = p;
3489            if (native_ordering) {
3490                /* Native ordering is simple: as long as the input cannot
3491                   possibly contain a surrogate char, do an unrolled copy
3492                   of several 16-bit code points to the target object.
3493                   The non-surrogate check is done on several input bytes
3494                   at a time (as many as a C 'long' can contain). */
3495                while (_q < aligned_end) {
3496                    unsigned long data = * (unsigned long *) _q;
3497                    if (data & FAST_CHAR_MASK)
3498                        break;
3499                    _p[0] = ((unsigned short *) _q)[0];
3500                    _p[1] = ((unsigned short *) _q)[1];
3501#if (SIZEOF_LONG == 8)
3502                    _p[2] = ((unsigned short *) _q)[2];
3503                    _p[3] = ((unsigned short *) _q)[3];
3504#endif
3505                    _q += SIZEOF_LONG;
3506                    _p += SIZEOF_LONG / 2;
3507                }
3508            }
3509            else {
3510                /* Byteswapped ordering is similar, but we must decompose
3511                   the copy bytewise, and take care of zero'ing out the
3512                   upper bytes if the target object is in 32-bit units
3513                   (that is, in UCS-4 builds). */
3514                while (_q < aligned_end) {
3515                    unsigned long data = * (unsigned long *) _q;
3516                    if (data & SWAPPED_FAST_CHAR_MASK)
3517                        break;
3518                    /* Zero upper bytes in UCS-4 builds */
3519#if (Py_UNICODE_SIZE > 2)
3520                    _p[0] = 0;
3521                    _p[1] = 0;
3522#if (SIZEOF_LONG == 8)
3523                    _p[2] = 0;
3524                    _p[3] = 0;
3525#endif
3526#endif
3527                    /* Issue #4916; UCS-4 builds on big endian machines must
3528                       fill the two last bytes of each 4-byte unit. */
3529#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3530# define OFF 2
3531#else
3532# define OFF 0
3533#endif
3534                    ((unsigned char *) _p)[OFF + 1] = _q[0];
3535                    ((unsigned char *) _p)[OFF + 0] = _q[1];
3536                    ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3537                    ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3538#if (SIZEOF_LONG == 8)
3539                    ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3540                    ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3541                    ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3542                    ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3543#endif
3544#undef OFF
3545                    _q += SIZEOF_LONG;
3546                    _p += SIZEOF_LONG / 2;
3547                }
3548            }
3549            p = _p;
3550            q = _q;
3551            if (q >= e)
3552                break;
3553        }
3554        ch = (q[ihi] << 8) | q[ilo];
3555
3556        q += 2;
3557
3558        if (ch < 0xD800 || ch > 0xDFFF) {
3559            *p++ = ch;
3560            continue;
3561        }
3562
3563        /* UTF-16 code pair: */
3564        if (q > e) {
3565            errmsg = "unexpected end of data";
3566            startinpos = (((const char *)q) - 2) - starts;
3567            endinpos = ((const char *)e) + 1 - starts;
3568            goto utf16Error;
3569        }
3570        if (0xD800 <= ch && ch <= 0xDBFF) {
3571            Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3572            q += 2;
3573            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3574#ifndef Py_UNICODE_WIDE
3575                *p++ = ch;
3576                *p++ = ch2;
3577#else
3578                *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3579#endif
3580                continue;
3581            }
3582            else {
3583                errmsg = "illegal UTF-16 surrogate";
3584                startinpos = (((const char *)q)-4)-starts;
3585                endinpos = startinpos+2;
3586                goto utf16Error;
3587            }
3588
3589        }
3590        errmsg = "illegal encoding";
3591        startinpos = (((const char *)q)-2)-starts;
3592        endinpos = startinpos+2;
3593        /* Fall through to report the error */
3594
3595      utf16Error:
3596        outpos = p - PyUnicode_AS_UNICODE(unicode);
3597        if (unicode_decode_call_errorhandler(
3598                errors,
3599                &errorHandler,
3600                "utf16", errmsg,
3601                &starts,
3602                (const char **)&e,
3603                &startinpos,
3604                &endinpos,
3605                &exc,
3606                (const char **)&q,
3607                &unicode,
3608                &outpos,
3609                &p))
3610            goto onError;
3611    }
3612    /* remaining byte at the end? (size should be even) */
3613    if (e == q) {
3614        if (!consumed) {
3615            errmsg = "truncated data";
3616            startinpos = ((const char *)q) - starts;
3617            endinpos = ((const char *)e) + 1 - starts;
3618            outpos = p - PyUnicode_AS_UNICODE(unicode);
3619            if (unicode_decode_call_errorhandler(
3620                    errors,
3621                    &errorHandler,
3622                    "utf16", errmsg,
3623                    &starts,
3624                    (const char **)&e,
3625                    &startinpos,
3626                    &endinpos,
3627                    &exc,
3628                    (const char **)&q,
3629                    &unicode,
3630                    &outpos,
3631                    &p))
3632                goto onError;
3633            /* The remaining input chars are ignored if the callback
3634               chooses to skip the input */
3635        }
3636    }
3637
3638    if (byteorder)
3639        *byteorder = bo;
3640
3641    if (consumed)
3642        *consumed = (const char *)q-starts;
3643
3644    /* Adjust length */
3645    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
3646        goto onError;
3647
3648    Py_XDECREF(errorHandler);
3649    Py_XDECREF(exc);
3650    return (PyObject *)unicode;
3651
3652  onError:
3653    Py_DECREF(unicode);
3654    Py_XDECREF(errorHandler);
3655    Py_XDECREF(exc);
3656    return NULL;
3657}
3658
3659#undef FAST_CHAR_MASK
3660#undef SWAPPED_FAST_CHAR_MASK
3661
3662PyObject *
3663PyUnicode_EncodeUTF16(const Py_UNICODE *s,
3664                      Py_ssize_t size,
3665                      const char *errors,
3666                      int byteorder)
3667{
3668    PyObject *v;
3669    unsigned char *p;
3670    Py_ssize_t nsize, bytesize;
3671#ifdef Py_UNICODE_WIDE
3672    Py_ssize_t i, pairs;
3673#else
3674    const int pairs = 0;
3675#endif
3676    /* Offsets from p for storing byte pairs in the right order. */
3677#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3678    int ihi = 1, ilo = 0;
3679#else
3680    int ihi = 0, ilo = 1;
3681#endif
3682
3683#define STORECHAR(CH)                           \
3684    do {                                        \
3685        p[ihi] = ((CH) >> 8) & 0xff;            \
3686        p[ilo] = (CH) & 0xff;                   \
3687        p += 2;                                 \
3688    } while(0)
3689
3690#ifdef Py_UNICODE_WIDE
3691    for (i = pairs = 0; i < size; i++)
3692        if (s[i] >= 0x10000)
3693            pairs++;
3694#endif
3695    /* 2 * (size + pairs + (byteorder == 0)) */
3696    if (size > PY_SSIZE_T_MAX ||
3697        size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
3698        return PyErr_NoMemory();
3699    nsize = size + pairs + (byteorder == 0);
3700    bytesize = nsize * 2;
3701    if (bytesize / 2 != nsize)
3702        return PyErr_NoMemory();
3703    v = PyBytes_FromStringAndSize(NULL, bytesize);
3704    if (v == NULL)
3705        return NULL;
3706
3707    p = (unsigned char *)PyBytes_AS_STRING(v);
3708    if (byteorder == 0)
3709        STORECHAR(0xFEFF);
3710    if (size == 0)
3711        goto done;
3712
3713    if (byteorder == -1) {
3714        /* force LE */
3715        ihi = 1;
3716        ilo = 0;
3717    }
3718    else if (byteorder == 1) {
3719        /* force BE */
3720        ihi = 0;
3721        ilo = 1;
3722    }
3723
3724    while (size-- > 0) {
3725        Py_UNICODE ch = *s++;
3726        Py_UNICODE ch2 = 0;
3727#ifdef Py_UNICODE_WIDE
3728        if (ch >= 0x10000) {
3729            ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3730            ch  = 0xD800 | ((ch-0x10000) >> 10);
3731        }
3732#endif
3733        STORECHAR(ch);
3734        if (ch2)
3735            STORECHAR(ch2);
3736    }
3737
3738  done:
3739    return v;
3740#undef STORECHAR
3741}
3742
3743PyObject *
3744PyUnicode_AsUTF16String(PyObject *unicode)
3745{
3746    if (!PyUnicode_Check(unicode)) {
3747        PyErr_BadArgument();
3748        return NULL;
3749    }
3750    return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
3751                                 PyUnicode_GET_SIZE(unicode),
3752                                 NULL,
3753                                 0);
3754}
3755
3756/* --- Unicode Escape Codec ----------------------------------------------- */
3757
3758static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
3759
3760PyObject *
3761PyUnicode_DecodeUnicodeEscape(const char *s,
3762			      Py_ssize_t size,
3763			      const char *errors)
3764{
3765    const char *starts = s;
3766    Py_ssize_t startinpos;
3767    Py_ssize_t endinpos;
3768    Py_ssize_t outpos;
3769    int i;
3770    PyUnicodeObject *v;
3771    Py_UNICODE *p;
3772    const char *end;
3773    char* message;
3774    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
3775    PyObject *errorHandler = NULL;
3776    PyObject *exc = NULL;
3777
3778    /* Escaped strings will always be longer than the resulting
3779       Unicode string, so we start with size here and then reduce the
3780       length after conversion to the true value.
3781       (but if the error callback returns a long replacement string
3782       we'll have to allocate more space) */
3783    v = _PyUnicode_New(size);
3784    if (v == NULL)
3785        goto onError;
3786    if (size == 0)
3787        return (PyObject *)v;
3788
3789    p = PyUnicode_AS_UNICODE(v);
3790    end = s + size;
3791
3792    while (s < end) {
3793        unsigned char c;
3794        Py_UNICODE x;
3795        int digits;
3796
3797        /* Non-escape characters are interpreted as Unicode ordinals */
3798        if (*s != '\\') {
3799            *p++ = (unsigned char) *s++;
3800            continue;
3801        }
3802
3803        startinpos = s-starts;
3804        /* \ - Escapes */
3805        s++;
3806        c = *s++;
3807        if (s > end)
3808            c = '\0'; /* Invalid after \ */
3809        switch (c) {
3810
3811            /* \x escapes */
3812        case '\n': break;
3813        case '\\': *p++ = '\\'; break;
3814        case '\'': *p++ = '\''; break;
3815        case '\"': *p++ = '\"'; break;
3816        case 'b': *p++ = '\b'; break;
3817        case 'f': *p++ = '\014'; break; /* FF */
3818        case 't': *p++ = '\t'; break;
3819        case 'n': *p++ = '\n'; break;
3820        case 'r': *p++ = '\r'; break;
3821        case 'v': *p++ = '\013'; break; /* VT */
3822        case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3823
3824            /* \OOO (octal) escapes */
3825        case '0': case '1': case '2': case '3':
3826        case '4': case '5': case '6': case '7':
3827            x = s[-1] - '0';
3828            if (s < end && '0' <= *s && *s <= '7') {
3829                x = (x<<3) + *s++ - '0';
3830                if (s < end && '0' <= *s && *s <= '7')
3831                    x = (x<<3) + *s++ - '0';
3832            }
3833            *p++ = x;
3834            break;
3835
3836            /* hex escapes */
3837            /* \xXX */
3838        case 'x':
3839            digits = 2;
3840            message = "truncated \\xXX escape";
3841            goto hexescape;
3842
3843            /* \uXXXX */
3844        case 'u':
3845            digits = 4;
3846            message = "truncated \\uXXXX escape";
3847            goto hexescape;
3848
3849            /* \UXXXXXXXX */
3850        case 'U':
3851            digits = 8;
3852            message = "truncated \\UXXXXXXXX escape";
3853        hexescape:
3854            chr = 0;
3855            outpos = p-PyUnicode_AS_UNICODE(v);
3856            if (s+digits>end) {
3857                endinpos = size;
3858                if (unicode_decode_call_errorhandler(
3859                        errors, &errorHandler,
3860                        "unicodeescape", "end of string in escape sequence",
3861                        &starts, &end, &startinpos, &endinpos, &exc, &s,
3862                        &v, &outpos, &p))
3863                    goto onError;
3864                goto nextByte;
3865            }
3866            for (i = 0; i < digits; ++i) {
3867                c = (unsigned char) s[i];
3868                if (!Py_ISXDIGIT(c)) {
3869                    endinpos = (s+i+1)-starts;
3870                    if (unicode_decode_call_errorhandler(
3871                            errors, &errorHandler,
3872                            "unicodeescape", message,
3873                            &starts, &end, &startinpos, &endinpos, &exc, &s,
3874                            &v, &outpos, &p))
3875                        goto onError;
3876                    goto nextByte;
3877                }
3878                chr = (chr<<4) & ~0xF;
3879                if (c >= '0' && c <= '9')
3880                    chr += c - '0';
3881                else if (c >= 'a' && c <= 'f')
3882                    chr += 10 + c - 'a';
3883                else
3884                    chr += 10 + c - 'A';
3885            }
3886            s += i;
3887            if (chr == 0xffffffff && PyErr_Occurred())
3888                /* _decoding_error will have already written into the
3889                   target buffer. */
3890                break;
3891        store:
3892            /* when we get here, chr is a 32-bit unicode character */
3893            if (chr <= 0xffff)
3894                /* UCS-2 character */
3895                *p++ = (Py_UNICODE) chr;
3896            else if (chr <= 0x10ffff) {
3897                /* UCS-4 character. Either store directly, or as
3898                   surrogate pair. */
3899#ifdef Py_UNICODE_WIDE
3900                *p++ = chr;
3901#else
3902                chr -= 0x10000L;
3903                *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
3904                *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
3905#endif
3906            } else {
3907                endinpos = s-starts;
3908                outpos = p-PyUnicode_AS_UNICODE(v);
3909                if (unicode_decode_call_errorhandler(
3910                        errors, &errorHandler,
3911                        "unicodeescape", "illegal Unicode character",
3912                        &starts, &end, &startinpos, &endinpos, &exc, &s,
3913                        &v, &outpos, &p))
3914                    goto onError;
3915            }
3916            break;
3917
3918            /* \N{name} */
3919        case 'N':
3920            message = "malformed \\N character escape";
3921            if (ucnhash_CAPI == NULL) {
3922                /* load the unicode data module */
3923                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
3924                if (ucnhash_CAPI == NULL)
3925                    goto ucnhashError;
3926            }
3927            if (*s == '{') {
3928                const char *start = s+1;
3929                /* look for the closing brace */
3930                while (*s != '}' && s < end)
3931                    s++;
3932                if (s > start && s < end && *s == '}') {
3933                    /* found a name.  look it up in the unicode database */
3934                    message = "unknown Unicode character name";
3935                    s++;
3936                    if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
3937                        goto store;
3938                }
3939            }
3940            endinpos = s-starts;
3941            outpos = p-PyUnicode_AS_UNICODE(v);
3942            if (unicode_decode_call_errorhandler(
3943                    errors, &errorHandler,
3944                    "unicodeescape", message,
3945                    &starts, &end, &startinpos, &endinpos, &exc, &s,
3946                    &v, &outpos, &p))
3947                goto onError;
3948            break;
3949
3950        default:
3951            if (s > end) {
3952                message = "\\ at end of string";
3953                s--;
3954                endinpos = s-starts;
3955                outpos = p-PyUnicode_AS_UNICODE(v);
3956                if (unicode_decode_call_errorhandler(
3957                        errors, &errorHandler,
3958                        "unicodeescape", message,
3959                        &starts, &end, &startinpos, &endinpos, &exc, &s,
3960                        &v, &outpos, &p))
3961                    goto onError;
3962            }
3963            else {
3964                *p++ = '\\';
3965                *p++ = (unsigned char)s[-1];
3966            }
3967            break;
3968        }
3969      nextByte:
3970        ;
3971    }
3972    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3973        goto onError;
3974    Py_XDECREF(errorHandler);
3975    Py_XDECREF(exc);
3976    return (PyObject *)v;
3977
3978  ucnhashError:
3979    PyErr_SetString(
3980        PyExc_UnicodeError,
3981        "\\N escapes not supported (can't load unicodedata module)"
3982        );
3983    Py_XDECREF(v);
3984    Py_XDECREF(errorHandler);
3985    Py_XDECREF(exc);
3986    return NULL;
3987
3988  onError:
3989    Py_XDECREF(v);
3990    Py_XDECREF(errorHandler);
3991    Py_XDECREF(exc);
3992    return NULL;
3993}
3994
3995/* Return a Unicode-Escape string version of the Unicode object.
3996
3997   If quotes is true, the string is enclosed in u"" or u'' quotes as
3998   appropriate.
3999
4000*/
4001
4002Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
4003                                             Py_ssize_t size,
4004                                             Py_UNICODE ch)
4005{
4006    /* like wcschr, but doesn't stop at NULL characters */
4007
4008    while (size-- > 0) {
4009        if (*s == ch)
4010            return s;
4011        s++;
4012    }
4013
4014    return NULL;
4015}
4016
4017static const char *hexdigits = "0123456789abcdef";
4018
4019PyObject *
4020PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
4021			      Py_ssize_t size)
4022{
4023    PyObject *repr;
4024    char *p;
4025
4026#ifdef Py_UNICODE_WIDE
4027    const Py_ssize_t expandsize = 10;
4028#else
4029    const Py_ssize_t expandsize = 6;
4030#endif
4031
4032    /* XXX(nnorwitz): rather than over-allocating, it would be
4033       better to choose a different scheme.  Perhaps scan the
4034       first N-chars of the string and allocate based on that size.
4035    */
4036    /* Initial allocation is based on the longest-possible unichr
4037       escape.
4038
4039       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
4040       unichr, so in this case it's the longest unichr escape. In
4041       narrow (UTF-16) builds this is five chars per source unichr
4042       since there are two unichrs in the surrogate pair, so in narrow
4043       (UTF-16) builds it's not the longest unichr escape.
4044
4045       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
4046       so in the narrow (UTF-16) build case it's the longest unichr
4047       escape.
4048    */
4049
4050    if (size == 0)
4051        return PyBytes_FromStringAndSize(NULL, 0);
4052
4053    if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
4054        return PyErr_NoMemory();
4055
4056    repr = PyBytes_FromStringAndSize(NULL,
4057                                     2
4058                                     + expandsize*size
4059                                     + 1);
4060    if (repr == NULL)
4061        return NULL;
4062
4063    p = PyBytes_AS_STRING(repr);
4064
4065    while (size-- > 0) {
4066        Py_UNICODE ch = *s++;
4067
4068        /* Escape backslashes */
4069        if (ch == '\\') {
4070            *p++ = '\\';
4071            *p++ = (char) ch;
4072            continue;
4073        }
4074
4075#ifdef Py_UNICODE_WIDE
4076        /* Map 21-bit characters to '\U00xxxxxx' */
4077        else if (ch >= 0x10000) {
4078            *p++ = '\\';
4079            *p++ = 'U';
4080            *p++ = hexdigits[(ch >> 28) & 0x0000000F];
4081            *p++ = hexdigits[(ch >> 24) & 0x0000000F];
4082            *p++ = hexdigits[(ch >> 20) & 0x0000000F];
4083            *p++ = hexdigits[(ch >> 16) & 0x0000000F];
4084            *p++ = hexdigits[(ch >> 12) & 0x0000000F];
4085            *p++ = hexdigits[(ch >> 8) & 0x0000000F];
4086            *p++ = hexdigits[(ch >> 4) & 0x0000000F];
4087            *p++ = hexdigits[ch & 0x0000000F];
4088            continue;
4089        }
4090#else
4091        /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4092        else if (ch >= 0xD800 && ch < 0xDC00) {
4093            Py_UNICODE ch2;
4094            Py_UCS4 ucs;
4095
4096            ch2 = *s++;
4097            size--;
4098            if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
4099                ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4100                *p++ = '\\';
4101                *p++ = 'U';
4102                *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
4103                *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
4104                *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
4105                *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
4106                *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
4107                *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
4108                *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
4109                *p++ = hexdigits[ucs & 0x0000000F];
4110                continue;
4111            }
4112            /* Fall through: isolated surrogates are copied as-is */
4113            s--;
4114            size++;
4115        }
4116#endif
4117
4118        /* Map 16-bit characters to '\uxxxx' */
4119        if (ch >= 256) {
4120            *p++ = '\\';
4121            *p++ = 'u';
4122            *p++ = hexdigits[(ch >> 12) & 0x000F];
4123            *p++ = hexdigits[(ch >> 8) & 0x000F];
4124            *p++ = hexdigits[(ch >> 4) & 0x000F];
4125            *p++ = hexdigits[ch & 0x000F];
4126        }
4127
4128        /* Map special whitespace to '\t', \n', '\r' */
4129        else if (ch == '\t') {
4130            *p++ = '\\';
4131            *p++ = 't';
4132        }
4133        else if (ch == '\n') {
4134            *p++ = '\\';
4135            *p++ = 'n';
4136        }
4137        else if (ch == '\r') {
4138            *p++ = '\\';
4139            *p++ = 'r';
4140        }
4141
4142        /* Map non-printable US ASCII to '\xhh' */
4143        else if (ch < ' ' || ch >= 0x7F) {
4144            *p++ = '\\';
4145            *p++ = 'x';
4146            *p++ = hexdigits[(ch >> 4) & 0x000F];
4147            *p++ = hexdigits[ch & 0x000F];
4148        }
4149
4150        /* Copy everything else as-is */
4151        else
4152            *p++ = (char) ch;
4153    }
4154
4155    assert(p - PyBytes_AS_STRING(repr) > 0);
4156    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
4157        return NULL;
4158    return repr;
4159}
4160
4161PyObject *
4162PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
4163{
4164    PyObject *s;
4165    if (!PyUnicode_Check(unicode)) {
4166        PyErr_BadArgument();
4167        return NULL;
4168    }
4169    s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4170                                      PyUnicode_GET_SIZE(unicode));
4171    return s;
4172}
4173
4174/* --- Raw Unicode Escape Codec ------------------------------------------- */
4175
4176PyObject *
4177PyUnicode_DecodeRawUnicodeEscape(const char *s,
4178				 Py_ssize_t size,
4179				 const char *errors)
4180{
4181    const char *starts = s;
4182    Py_ssize_t startinpos;
4183    Py_ssize_t endinpos;
4184    Py_ssize_t outpos;
4185    PyUnicodeObject *v;
4186    Py_UNICODE *p;
4187    const char *end;
4188    const char *bs;
4189    PyObject *errorHandler = NULL;
4190    PyObject *exc = NULL;
4191
4192    /* Escaped strings will always be longer than the resulting
4193       Unicode string, so we start with size here and then reduce the
4194       length after conversion to the true value. (But decoding error
4195       handler might have to resize the string) */
4196    v = _PyUnicode_New(size);
4197    if (v == NULL)
4198        goto onError;
4199    if (size == 0)
4200        return (PyObject *)v;
4201    p = PyUnicode_AS_UNICODE(v);
4202    end = s + size;
4203    while (s < end) {
4204        unsigned char c;
4205        Py_UCS4 x;
4206        int i;
4207        int count;
4208
4209        /* Non-escape characters are interpreted as Unicode ordinals */
4210        if (*s != '\\') {
4211            *p++ = (unsigned char)*s++;
4212            continue;
4213        }
4214        startinpos = s-starts;
4215
4216        /* \u-escapes are only interpreted iff the number of leading
4217           backslashes if odd */
4218        bs = s;
4219        for (;s < end;) {
4220            if (*s != '\\')
4221                break;
4222            *p++ = (unsigned char)*s++;
4223        }
4224        if (((s - bs) & 1) == 0 ||
4225            s >= end ||
4226            (*s != 'u' && *s != 'U')) {
4227            continue;
4228        }
4229        p--;
4230        count = *s=='u' ? 4 : 8;
4231        s++;
4232
4233        /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
4234        outpos = p-PyUnicode_AS_UNICODE(v);
4235        for (x = 0, i = 0; i < count; ++i, ++s) {
4236            c = (unsigned char)*s;
4237            if (!Py_ISXDIGIT(c)) {
4238                endinpos = s-starts;
4239                if (unicode_decode_call_errorhandler(
4240                        errors, &errorHandler,
4241                        "rawunicodeescape", "truncated \\uXXXX",
4242                        &starts, &end, &startinpos, &endinpos, &exc, &s,
4243                        &v, &outpos, &p))
4244                    goto onError;
4245                goto nextByte;
4246            }
4247            x = (x<<4) & ~0xF;
4248            if (c >= '0' && c <= '9')
4249                x += c - '0';
4250            else if (c >= 'a' && c <= 'f')
4251                x += 10 + c - 'a';
4252            else
4253                x += 10 + c - 'A';
4254        }
4255        if (x <= 0xffff)
4256            /* UCS-2 character */
4257            *p++ = (Py_UNICODE) x;
4258        else if (x <= 0x10ffff) {
4259            /* UCS-4 character. Either store directly, or as
4260               surrogate pair. */
4261#ifdef Py_UNICODE_WIDE
4262            *p++ = (Py_UNICODE) x;
4263#else
4264            x -= 0x10000L;
4265            *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
4266            *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
4267#endif
4268        } else {
4269            endinpos = s-starts;
4270            outpos = p-PyUnicode_AS_UNICODE(v);
4271            if (unicode_decode_call_errorhandler(
4272                    errors, &errorHandler,
4273                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
4274                    &starts, &end, &startinpos, &endinpos, &exc, &s,
4275                    &v, &outpos, &p))
4276                goto onError;
4277        }
4278      nextByte:
4279        ;
4280    }
4281    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4282        goto onError;
4283    Py_XDECREF(errorHandler);
4284    Py_XDECREF(exc);
4285    return (PyObject *)v;
4286
4287  onError:
4288    Py_XDECREF(v);
4289    Py_XDECREF(errorHandler);
4290    Py_XDECREF(exc);
4291    return NULL;
4292}
4293
4294PyObject *
4295PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
4296				 Py_ssize_t size)
4297{
4298    PyObject *repr;
4299    char *p;
4300    char *q;
4301
4302#ifdef Py_UNICODE_WIDE
4303    const Py_ssize_t expandsize = 10;
4304#else
4305    const Py_ssize_t expandsize = 6;
4306#endif
4307
4308    if (size > PY_SSIZE_T_MAX / expandsize)
4309        return PyErr_NoMemory();
4310
4311    repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
4312    if (repr == NULL)
4313        return NULL;
4314    if (size == 0)
4315        return repr;
4316
4317    p = q = PyBytes_AS_STRING(repr);
4318    while (size-- > 0) {
4319        Py_UNICODE ch = *s++;
4320#ifdef Py_UNICODE_WIDE
4321        /* Map 32-bit characters to '\Uxxxxxxxx' */
4322        if (ch >= 0x10000) {
4323            *p++ = '\\';
4324            *p++ = 'U';
4325            *p++ = hexdigits[(ch >> 28) & 0xf];
4326            *p++ = hexdigits[(ch >> 24) & 0xf];
4327            *p++ = hexdigits[(ch >> 20) & 0xf];
4328            *p++ = hexdigits[(ch >> 16) & 0xf];
4329            *p++ = hexdigits[(ch >> 12) & 0xf];
4330            *p++ = hexdigits[(ch >> 8) & 0xf];
4331            *p++ = hexdigits[(ch >> 4) & 0xf];
4332            *p++ = hexdigits[ch & 15];
4333        }
4334        else
4335#else
4336            /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4337            if (ch >= 0xD800 && ch < 0xDC00) {
4338                Py_UNICODE ch2;
4339                Py_UCS4 ucs;
4340
4341                ch2 = *s++;
4342                size--;
4343                if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
4344                    ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4345                    *p++ = '\\';
4346                    *p++ = 'U';
4347                    *p++ = hexdigits[(ucs >> 28) & 0xf];
4348                    *p++ = hexdigits[(ucs >> 24) & 0xf];
4349                    *p++ = hexdigits[(ucs >> 20) & 0xf];
4350                    *p++ = hexdigits[(ucs >> 16) & 0xf];
4351                    *p++ = hexdigits[(ucs >> 12) & 0xf];
4352                    *p++ = hexdigits[(ucs >> 8) & 0xf];
4353                    *p++ = hexdigits[(ucs >> 4) & 0xf];
4354                    *p++ = hexdigits[ucs & 0xf];
4355                    continue;
4356                }
4357                /* Fall through: isolated surrogates are copied as-is */
4358                s--;
4359                size++;
4360            }
4361#endif
4362        /* Map 16-bit characters to '\uxxxx' */
4363        if (ch >= 256) {
4364            *p++ = '\\';
4365            *p++ = 'u';
4366            *p++ = hexdigits[(ch >> 12) & 0xf];
4367            *p++ = hexdigits[(ch >> 8) & 0xf];
4368            *p++ = hexdigits[(ch >> 4) & 0xf];
4369            *p++ = hexdigits[ch & 15];
4370        }
4371        /* Copy everything else as-is */
4372        else
4373            *p++ = (char) ch;
4374    }
4375    size = p - q;
4376
4377    assert(size > 0);
4378    if (_PyBytes_Resize(&repr, size) < 0)
4379        return NULL;
4380    return repr;
4381}
4382
4383PyObject *
4384PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
4385{
4386    PyObject *s;
4387    if (!PyUnicode_Check(unicode)) {
4388        PyErr_BadArgument();
4389        return NULL;
4390    }
4391    s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4392                                         PyUnicode_GET_SIZE(unicode));
4393
4394    return s;
4395}
4396
4397/* --- Unicode Internal Codec ------------------------------------------- */
4398
4399PyObject *
4400_PyUnicode_DecodeUnicodeInternal(const char *s,
4401				 Py_ssize_t size,
4402				 const char *errors)
4403{
4404    const char *starts = s;
4405    Py_ssize_t startinpos;
4406    Py_ssize_t endinpos;
4407    Py_ssize_t outpos;
4408    PyUnicodeObject *v;
4409    Py_UNICODE *p;
4410    const char *end;
4411    const char *reason;
4412    PyObject *errorHandler = NULL;
4413    PyObject *exc = NULL;
4414
4415#ifdef Py_UNICODE_WIDE
4416    Py_UNICODE unimax = PyUnicode_GetMax();
4417#endif
4418
4419    /* XXX overflow detection missing */
4420    v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4421    if (v == NULL)
4422        goto onError;
4423    if (PyUnicode_GetSize((PyObject *)v) == 0)
4424        return (PyObject *)v;
4425    p = PyUnicode_AS_UNICODE(v);
4426    end = s + size;
4427
4428    while (s < end) {
4429        memcpy(p, s, sizeof(Py_UNICODE));
4430        /* We have to sanity check the raw data, otherwise doom looms for
4431           some malformed UCS-4 data. */
4432        if (
4433#ifdef Py_UNICODE_WIDE
4434            *p > unimax || *p < 0 ||
4435#endif
4436            end-s < Py_UNICODE_SIZE
4437            )
4438        {
4439            startinpos = s - starts;
4440            if (end-s < Py_UNICODE_SIZE) {
4441                endinpos = end-starts;
4442                reason = "truncated input";
4443            }
4444            else {
4445                endinpos = s - starts + Py_UNICODE_SIZE;
4446                reason = "illegal code point (> 0x10FFFF)";
4447            }
4448            outpos = p - PyUnicode_AS_UNICODE(v);
4449            if (unicode_decode_call_errorhandler(
4450                    errors, &errorHandler,
4451                    "unicode_internal", reason,
4452                    &starts, &end, &startinpos, &endinpos, &exc, &s,
4453                    &v, &outpos, &p)) {
4454                goto onError;
4455            }
4456        }
4457        else {
4458            p++;
4459            s += Py_UNICODE_SIZE;
4460        }
4461    }
4462
4463    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4464        goto onError;
4465    Py_XDECREF(errorHandler);
4466    Py_XDECREF(exc);
4467    return (PyObject *)v;
4468
4469  onError:
4470    Py_XDECREF(v);
4471    Py_XDECREF(errorHandler);
4472    Py_XDECREF(exc);
4473    return NULL;
4474}
4475
4476/* --- Latin-1 Codec ------------------------------------------------------ */
4477
4478PyObject *
4479PyUnicode_DecodeLatin1(const char *s,
4480		       Py_ssize_t size,
4481		       const char *errors)
4482{
4483    PyUnicodeObject *v;
4484    Py_UNICODE *p;
4485    const char *e, *unrolled_end;
4486
4487    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
4488    if (size == 1) {
4489        Py_UNICODE r = *(unsigned char*)s;
4490        return PyUnicode_FromUnicode(&r, 1);
4491    }
4492
4493    v = _PyUnicode_New(size);
4494    if (v == NULL)
4495        goto onError;
4496    if (size == 0)
4497        return (PyObject *)v;
4498    p = PyUnicode_AS_UNICODE(v);
4499    e = s + size;
4500    /* Unrolling the copy makes it much faster by reducing the looping
4501       overhead. This is similar to what many memcpy() implementations do. */
4502    unrolled_end = e - 4;
4503    while (s < unrolled_end) {
4504        p[0] = (unsigned char) s[0];
4505        p[1] = (unsigned char) s[1];
4506        p[2] = (unsigned char) s[2];
4507        p[3] = (unsigned char) s[3];
4508        s += 4;
4509        p += 4;
4510    }
4511    while (s < e)
4512        *p++ = (unsigned char) *s++;
4513    return (PyObject *)v;
4514
4515  onError:
4516    Py_XDECREF(v);
4517    return NULL;
4518}
4519
4520/* create or adjust a UnicodeEncodeError */
4521static void
4522make_encode_exception(PyObject **exceptionObject,
4523		      const char *encoding,
4524		      const Py_UNICODE *unicode, Py_ssize_t size,
4525		      Py_ssize_t startpos, Py_ssize_t endpos,
4526		      const char *reason)
4527{
4528    if (*exceptionObject == NULL) {
4529        *exceptionObject = PyUnicodeEncodeError_Create(
4530            encoding, unicode, size, startpos, endpos, reason);
4531    }
4532    else {
4533        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4534            goto onError;
4535        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4536            goto onError;
4537        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4538            goto onError;
4539        return;
4540      onError:
4541        Py_DECREF(*exceptionObject);
4542        *exceptionObject = NULL;
4543    }
4544}
4545
4546/* raises a UnicodeEncodeError */
4547static void
4548raise_encode_exception(PyObject **exceptionObject,
4549		       const char *encoding,
4550		       const Py_UNICODE *unicode, Py_ssize_t size,
4551		       Py_ssize_t startpos, Py_ssize_t endpos,
4552		       const char *reason)
4553{
4554    make_encode_exception(exceptionObject,
4555                          encoding, unicode, size, startpos, endpos, reason);
4556    if (*exceptionObject != NULL)
4557        PyCodec_StrictErrors(*exceptionObject);
4558}
4559
4560/* error handling callback helper:
4561   build arguments, call the callback and check the arguments,
4562   put the result into newpos and return the replacement string, which
4563   has to be freed by the caller */
4564static PyObject *
4565unicode_encode_call_errorhandler(const char *errors,
4566				 PyObject **errorHandler,
4567				 const char *encoding, const char *reason,
4568				 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4569				 Py_ssize_t startpos, Py_ssize_t endpos,
4570				 Py_ssize_t *newpos)
4571{
4572    static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
4573
4574    PyObject *restuple;
4575    PyObject *resunicode;
4576
4577    if (*errorHandler == NULL) {
4578        *errorHandler = PyCodec_LookupError(errors);
4579        if (*errorHandler == NULL)
4580            return NULL;
4581    }
4582
4583    make_encode_exception(exceptionObject,
4584                          encoding, unicode, size, startpos, endpos, reason);
4585    if (*exceptionObject == NULL)
4586        return NULL;
4587
4588    restuple = PyObject_CallFunctionObjArgs(
4589        *errorHandler, *exceptionObject, NULL);
4590    if (restuple == NULL)
4591        return NULL;
4592    if (!PyTuple_Check(restuple)) {
4593        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4594        Py_DECREF(restuple);
4595        return NULL;
4596    }
4597    if (!PyArg_ParseTuple(restuple, argparse,
4598                          &resunicode, newpos)) {
4599        Py_DECREF(restuple);
4600        return NULL;
4601    }
4602    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4603        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4604        Py_DECREF(restuple);
4605        return NULL;
4606    }
4607    if (*newpos<0)
4608        *newpos = size+*newpos;
4609    if (*newpos<0 || *newpos>size) {
4610        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4611        Py_DECREF(restuple);
4612        return NULL;
4613    }
4614    Py_INCREF(resunicode);
4615    Py_DECREF(restuple);
4616    return resunicode;
4617}
4618
4619static PyObject *
4620unicode_encode_ucs1(const Py_UNICODE *p,
4621		    Py_ssize_t size,
4622		    const char *errors,
4623		    int limit)
4624{
4625    /* output object */
4626    PyObject *res;
4627    /* pointers to the beginning and end+1 of input */
4628    const Py_UNICODE *startp = p;
4629    const Py_UNICODE *endp = p + size;
4630    /* pointer to the beginning of the unencodable characters */
4631    /* const Py_UNICODE *badp = NULL; */
4632    /* pointer into the output */
4633    char *str;
4634    /* current output position */
4635    Py_ssize_t ressize;
4636    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4637    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
4638    PyObject *errorHandler = NULL;
4639    PyObject *exc = NULL;
4640    /* the following variable is used for caching string comparisons
4641     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4642    int known_errorHandler = -1;
4643
4644    /* allocate enough for a simple encoding without
4645       replacements, if we need more, we'll resize */
4646    if (size == 0)
4647        return PyBytes_FromStringAndSize(NULL, 0);
4648    res = PyBytes_FromStringAndSize(NULL, size);
4649    if (res == NULL)
4650        return NULL;
4651    str = PyBytes_AS_STRING(res);
4652    ressize = size;
4653
4654    while (p<endp) {
4655        Py_UNICODE c = *p;
4656
4657        /* can we encode this? */
4658        if (c<limit) {
4659            /* no overflow check, because we know that the space is enough */
4660            *str++ = (char)c;
4661            ++p;
4662        }
4663        else {
4664            Py_ssize_t unicodepos = p-startp;
4665            Py_ssize_t requiredsize;
4666            PyObject *repunicode;
4667            Py_ssize_t repsize;
4668            Py_ssize_t newpos;
4669            Py_ssize_t respos;
4670            Py_UNICODE *uni2;
4671            /* startpos for collecting unencodable chars */
4672            const Py_UNICODE *collstart = p;
4673            const Py_UNICODE *collend = p;
4674            /* find all unecodable characters */
4675            while ((collend < endp) && ((*collend)>=limit))
4676                ++collend;
4677            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4678            if (known_errorHandler==-1) {
4679                if ((errors==NULL) || (!strcmp(errors, "strict")))
4680                    known_errorHandler = 1;
4681                else if (!strcmp(errors, "replace"))
4682                    known_errorHandler = 2;
4683                else if (!strcmp(errors, "ignore"))
4684                    known_errorHandler = 3;
4685                else if (!strcmp(errors, "xmlcharrefreplace"))
4686                    known_errorHandler = 4;
4687                else
4688                    known_errorHandler = 0;
4689            }
4690            switch (known_errorHandler) {
4691            case 1: /* strict */
4692                raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4693                goto onError;
4694            case 2: /* replace */
4695                while (collstart++<collend)
4696                    *str++ = '?'; /* fall through */
4697            case 3: /* ignore */
4698                p = collend;
4699                break;
4700            case 4: /* xmlcharrefreplace */
4701                respos = str - PyBytes_AS_STRING(res);
4702                /* determine replacement size (temporarily (mis)uses p) */
4703                for (p = collstart, repsize = 0; p < collend; ++p) {
4704                    if (*p<10)
4705                        repsize += 2+1+1;
4706                    else if (*p<100)
4707                        repsize += 2+2+1;
4708                    else if (*p<1000)
4709                        repsize += 2+3+1;
4710                    else if (*p<10000)
4711                        repsize += 2+4+1;
4712#ifndef Py_UNICODE_WIDE
4713                    else
4714                        repsize += 2+5+1;
4715#else
4716                    else if (*p<100000)
4717                        repsize += 2+5+1;
4718                    else if (*p<1000000)
4719                        repsize += 2+6+1;
4720                    else
4721                        repsize += 2+7+1;
4722#endif
4723                }
4724                requiredsize = respos+repsize+(endp-collend);
4725                if (requiredsize > ressize) {
4726                    if (requiredsize<2*ressize)
4727                        requiredsize = 2*ressize;
4728                    if (_PyBytes_Resize(&res, requiredsize))
4729                        goto onError;
4730                    str = PyBytes_AS_STRING(res) + respos;
4731                    ressize = requiredsize;
4732                }
4733                /* generate replacement (temporarily (mis)uses p) */
4734                for (p = collstart; p < collend; ++p) {
4735                    str += sprintf(str, "&#%d;", (int)*p);
4736                }
4737                p = collend;
4738                break;
4739            default:
4740                repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4741                                                              encoding, reason, startp, size, &exc,
4742                                                              collstart-startp, collend-startp, &newpos);
4743                if (repunicode == NULL)
4744                    goto onError;
4745                if (PyBytes_Check(repunicode)) {
4746                    /* Directly copy bytes result to output. */
4747                    repsize = PyBytes_Size(repunicode);
4748                    if (repsize > 1) {
4749                        /* Make room for all additional bytes. */
4750                        respos = str - PyBytes_AS_STRING(res);
4751                        if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4752                            Py_DECREF(repunicode);
4753                            goto onError;
4754                        }
4755                        str = PyBytes_AS_STRING(res) + respos;
4756                        ressize += repsize-1;
4757                    }
4758                    memcpy(str, PyBytes_AsString(repunicode), repsize);
4759                    str += repsize;
4760                    p = startp + newpos;
4761                    Py_DECREF(repunicode);
4762                    break;
4763                }
4764                /* need more space? (at least enough for what we
4765                   have+the replacement+the rest of the string, so
4766                   we won't have to check space for encodable characters) */
4767                respos = str - PyBytes_AS_STRING(res);
4768                repsize = PyUnicode_GET_SIZE(repunicode);
4769                requiredsize = respos+repsize+(endp-collend);
4770                if (requiredsize > ressize) {
4771                    if (requiredsize<2*ressize)
4772                        requiredsize = 2*ressize;
4773                    if (_PyBytes_Resize(&res, requiredsize)) {
4774                        Py_DECREF(repunicode);
4775                        goto onError;
4776                    }
4777                    str = PyBytes_AS_STRING(res) + respos;
4778                    ressize = requiredsize;
4779                }
4780                /* check if there is anything unencodable in the replacement
4781                   and copy it to the output */
4782                for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4783                    c = *uni2;
4784                    if (c >= limit) {
4785                        raise_encode_exception(&exc, encoding, startp, size,
4786                                               unicodepos, unicodepos+1, reason);
4787                        Py_DECREF(repunicode);
4788                        goto onError;
4789                    }
4790                    *str = (char)c;
4791                }
4792                p = startp + newpos;
4793                Py_DECREF(repunicode);
4794            }
4795        }
4796    }
4797    /* Resize if we allocated to much */
4798    size = str - PyBytes_AS_STRING(res);
4799    if (size < ressize) { /* If this falls res will be NULL */
4800        assert(size >= 0);
4801        if (_PyBytes_Resize(&res, size) < 0)
4802            goto onError;
4803    }
4804
4805    Py_XDECREF(errorHandler);
4806    Py_XDECREF(exc);
4807    return res;
4808
4809  onError:
4810    Py_XDECREF(res);
4811    Py_XDECREF(errorHandler);
4812    Py_XDECREF(exc);
4813    return NULL;
4814}
4815
4816PyObject *
4817PyUnicode_EncodeLatin1(const Py_UNICODE *p,
4818		       Py_ssize_t size,
4819		       const char *errors)
4820{
4821    return unicode_encode_ucs1(p, size, errors, 256);
4822}
4823
4824PyObject *
4825PyUnicode_AsLatin1String(PyObject *unicode)
4826{
4827    if (!PyUnicode_Check(unicode)) {
4828        PyErr_BadArgument();
4829        return NULL;
4830    }
4831    return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
4832                                  PyUnicode_GET_SIZE(unicode),
4833                                  NULL);
4834}
4835
4836/* --- 7-bit ASCII Codec -------------------------------------------------- */
4837
4838PyObject *
4839PyUnicode_DecodeASCII(const char *s,
4840                      Py_ssize_t size,
4841                      const char *errors)
4842{
4843    const char *starts = s;
4844    PyUnicodeObject *v;
4845    Py_UNICODE *p;
4846    Py_ssize_t startinpos;
4847    Py_ssize_t endinpos;
4848    Py_ssize_t outpos;
4849    const char *e;
4850    PyObject *errorHandler = NULL;
4851    PyObject *exc = NULL;
4852
4853    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4854    if (size == 1 && *(unsigned char*)s < 128) {
4855        Py_UNICODE r = *(unsigned char*)s;
4856        return PyUnicode_FromUnicode(&r, 1);
4857    }
4858
4859    v = _PyUnicode_New(size);
4860    if (v == NULL)
4861        goto onError;
4862    if (size == 0)
4863        return (PyObject *)v;
4864    p = PyUnicode_AS_UNICODE(v);
4865    e = s + size;
4866    while (s < e) {
4867        register unsigned char c = (unsigned char)*s;
4868        if (c < 128) {
4869            *p++ = c;
4870            ++s;
4871        }
4872        else {
4873            startinpos = s-starts;
4874            endinpos = startinpos + 1;
4875            outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4876            if (unicode_decode_call_errorhandler(
4877                    errors, &errorHandler,
4878                    "ascii", "ordinal not in range(128)",
4879                    &starts, &e, &startinpos, &endinpos, &exc, &s,
4880                    &v, &outpos, &p))
4881                goto onError;
4882        }
4883    }
4884    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
4885        if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4886            goto onError;
4887    Py_XDECREF(errorHandler);
4888    Py_XDECREF(exc);
4889    return (PyObject *)v;
4890
4891  onError:
4892    Py_XDECREF(v);
4893    Py_XDECREF(errorHandler);
4894    Py_XDECREF(exc);
4895    return NULL;
4896}
4897
4898PyObject *
4899PyUnicode_EncodeASCII(const Py_UNICODE *p,
4900                      Py_ssize_t size,
4901                      const char *errors)
4902{
4903    return unicode_encode_ucs1(p, size, errors, 128);
4904}
4905
4906PyObject *
4907PyUnicode_AsASCIIString(PyObject *unicode)
4908{
4909    if (!PyUnicode_Check(unicode)) {
4910        PyErr_BadArgument();
4911        return NULL;
4912    }
4913    return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
4914                                 PyUnicode_GET_SIZE(unicode),
4915                                 NULL);
4916}
4917
4918#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
4919
4920/* --- MBCS codecs for Windows -------------------------------------------- */
4921
4922#if SIZEOF_INT < SIZEOF_SIZE_T
4923#define NEED_RETRY
4924#endif
4925
4926/* XXX This code is limited to "true" double-byte encodings, as
4927   a) it assumes an incomplete character consists of a single byte, and
4928   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
4929   encodings, see IsDBCSLeadByteEx documentation. */
4930
/* Return 1 if the byte at S[OFFSET] should be treated as the first byte
   of a double-byte character, 0 otherwise.

   A byte for which IsDBCSLeadByte() is true only counts when one of the
   following also holds for the position PREV reported by CharPrev():
     - PREV is the byte itself,
     - the byte at PREV is not a lead byte, or
     - PREV lies exactly two bytes back. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
4942
4943/*
4944 * Decode MBCS string into unicode object. If 'final' is set, converts
4945 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4946 */
4947static int
4948decode_mbcs(PyUnicodeObject **v,
4949            const char *s, /* MBCS string */
4950            int size, /* sizeof MBCS string */
4951            int final,
4952            const char *errors)
4953{
4954    Py_UNICODE *p;
4955    Py_ssize_t n;
4956    DWORD usize;
4957    DWORD flags;
4958
4959    assert(size >= 0);
4960
4961    /* check and handle 'errors' arg */
4962    if (errors==NULL || strcmp(errors, "strict")==0)
4963        flags = MB_ERR_INVALID_CHARS;
4964    else if (strcmp(errors, "ignore")==0)
4965        flags = 0;
4966    else {
4967        PyErr_Format(PyExc_ValueError,
4968                     "mbcs encoding does not support errors='%s'",
4969                     errors);
4970        return -1;
4971    }
4972
4973    /* Skip trailing lead-byte unless 'final' is set */
4974    if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
4975        --size;
4976
4977    /* First get the size of the result */
4978    if (size > 0) {
4979        usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
4980        if (usize==0)
4981            goto mbcs_decode_error;
4982    } else
4983        usize = 0;
4984
4985    if (*v == NULL) {
4986        /* Create unicode object */
4987        *v = _PyUnicode_New(usize);
4988        if (*v == NULL)
4989            return -1;
4990        n = 0;
4991    }
4992    else {
4993        /* Extend unicode object */
4994        n = PyUnicode_GET_SIZE(*v);
4995        if (_PyUnicode_Resize(v, n + usize) < 0)
4996            return -1;
4997    }
4998
4999    /* Do the conversion */
5000    if (usize > 0) {
5001        p = PyUnicode_AS_UNICODE(*v) + n;
5002        if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
5003            goto mbcs_decode_error;
5004        }
5005    }
5006    return size;
5007
5008mbcs_decode_error:
5009    /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
5010       we raise a UnicodeDecodeError - else it is a 'generic'
5011       windows error
5012     */
5013    if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
5014        /* Ideally, we should get reason from FormatMessage - this
5015           is the Windows 2000 English version of the message
5016        */
5017        PyObject *exc = NULL;
5018        const char *reason = "No mapping for the Unicode character exists "
5019                             "in the target multi-byte code page.";
5020        make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
5021        if (exc != NULL) {
5022            PyCodec_StrictErrors(exc);
5023            Py_DECREF(exc);
5024        }
5025    } else {
5026        PyErr_SetFromWindowsErrWithFilename(0, NULL);
5027    }
5028    return -1;
5029}
5030
5031PyObject *
5032PyUnicode_DecodeMBCSStateful(const char *s,
5033                             Py_ssize_t size,
5034                             const char *errors,
5035                             Py_ssize_t *consumed)
5036{
5037    PyUnicodeObject *v = NULL;
5038    int done;
5039
5040    if (consumed)
5041        *consumed = 0;
5042
5043#ifdef NEED_RETRY
5044  retry:
5045    if (size > INT_MAX)
5046        done = decode_mbcs(&v, s, INT_MAX, 0, errors);
5047    else
5048#endif
5049        done = decode_mbcs(&v, s, (int)size, !consumed, errors);
5050
5051    if (done < 0) {
5052        Py_XDECREF(v);
5053        return NULL;
5054    }
5055
5056    if (consumed)
5057        *consumed += done;
5058
5059#ifdef NEED_RETRY
5060    if (size > INT_MAX) {
5061        s += done;
5062        size -= done;
5063        goto retry;
5064    }
5065#endif
5066
5067    return (PyObject *)v;
5068}
5069
5070PyObject *
5071PyUnicode_DecodeMBCS(const char *s,
5072                     Py_ssize_t size,
5073                     const char *errors)
5074{
5075    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
5076}
5077
5078/*
5079 * Convert unicode into string object (MBCS).
5080 * Returns 0 if succeed, -1 otherwise.
5081 */
5082static int
5083encode_mbcs(PyObject **repr,
5084            const Py_UNICODE *p, /* unicode */
5085            int size, /* size of unicode */
5086            const char* errors)
5087{
5088    BOOL usedDefaultChar = FALSE;
5089    BOOL *pusedDefaultChar;
5090    int mbcssize;
5091    Py_ssize_t n;
5092    PyObject *exc = NULL;
5093    DWORD flags;
5094
5095    assert(size >= 0);
5096
5097    /* check and handle 'errors' arg */
5098    if (errors==NULL || strcmp(errors, "strict")==0) {
5099        flags = WC_NO_BEST_FIT_CHARS;
5100        pusedDefaultChar = &usedDefaultChar;
5101    } else if (strcmp(errors, "replace")==0) {
5102        flags = 0;
5103        pusedDefaultChar = NULL;
5104    } else {
5105         PyErr_Format(PyExc_ValueError,
5106                      "mbcs encoding does not support errors='%s'",
5107                      errors);
5108         return -1;
5109    }
5110
5111    /* First get the size of the result */
5112    if (size > 0) {
5113        mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
5114                                       NULL, pusedDefaultChar);
5115        if (mbcssize == 0) {
5116            PyErr_SetFromWindowsErrWithFilename(0, NULL);
5117            return -1;
5118        }
5119        /* If we used a default char, then we failed! */
5120        if (pusedDefaultChar && *pusedDefaultChar)
5121            goto mbcs_encode_error;
5122    } else {
5123        mbcssize = 0;
5124    }
5125
5126    if (*repr == NULL) {
5127        /* Create string object */
5128        *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
5129        if (*repr == NULL)
5130            return -1;
5131        n = 0;
5132    }
5133    else {
5134        /* Extend string object */
5135        n = PyBytes_Size(*repr);
5136        if (_PyBytes_Resize(repr, n + mbcssize) < 0)
5137            return -1;
5138    }
5139
5140    /* Do the conversion */
5141    if (size > 0) {
5142        char *s = PyBytes_AS_STRING(*repr) + n;
5143        if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
5144                                     NULL, pusedDefaultChar)) {
5145            PyErr_SetFromWindowsErrWithFilename(0, NULL);
5146            return -1;
5147        }
5148        if (pusedDefaultChar && *pusedDefaultChar)
5149            goto mbcs_encode_error;
5150    }
5151    return 0;
5152
5153mbcs_encode_error:
5154    raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
5155    Py_XDECREF(exc);
5156    return -1;
5157}
5158
5159PyObject *
5160PyUnicode_EncodeMBCS(const Py_UNICODE *p,
5161                     Py_ssize_t size,
5162                     const char *errors)
5163{
5164    PyObject *repr = NULL;
5165    int ret;
5166
5167#ifdef NEED_RETRY
5168  retry:
5169    if (size > INT_MAX)
5170        ret = encode_mbcs(&repr, p, INT_MAX, errors);
5171    else
5172#endif
5173        ret = encode_mbcs(&repr, p, (int)size, errors);
5174
5175    if (ret < 0) {
5176        Py_XDECREF(repr);
5177        return NULL;
5178    }
5179
5180#ifdef NEED_RETRY
5181    if (size > INT_MAX) {
5182        p += INT_MAX;
5183        size -= INT_MAX;
5184        goto retry;
5185    }
5186#endif
5187
5188    return repr;
5189}
5190
5191PyObject *
5192PyUnicode_AsMBCSString(PyObject *unicode)
5193{
5194    if (!PyUnicode_Check(unicode)) {
5195        PyErr_BadArgument();
5196        return NULL;
5197    }
5198    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
5199                                PyUnicode_GET_SIZE(unicode),
5200                                NULL);
5201}
5202
5203#undef NEED_RETRY
5204
5205#endif /* MS_WINDOWS */
5206
5207/* --- Character Mapping Codec -------------------------------------------- */
5208
5209PyObject *
5210PyUnicode_DecodeCharmap(const char *s,
5211                        Py_ssize_t size,
5212                        PyObject *mapping,
5213                        const char *errors)
5214{
5215    const char *starts = s;
5216    Py_ssize_t startinpos;
5217    Py_ssize_t endinpos;
5218    Py_ssize_t outpos;
5219    const char *e;
5220    PyUnicodeObject *v;
5221    Py_UNICODE *p;
5222    Py_ssize_t extrachars = 0;
5223    PyObject *errorHandler = NULL;
5224    PyObject *exc = NULL;
5225    Py_UNICODE *mapstring = NULL;
5226    Py_ssize_t maplen = 0;
5227
5228    /* Default to Latin-1 */
5229    if (mapping == NULL)
5230        return PyUnicode_DecodeLatin1(s, size, errors);
5231
5232    v = _PyUnicode_New(size);
5233    if (v == NULL)
5234        goto onError;
5235    if (size == 0)
5236        return (PyObject *)v;
5237    p = PyUnicode_AS_UNICODE(v);
5238    e = s + size;
5239    if (PyUnicode_CheckExact(mapping)) {
5240        mapstring = PyUnicode_AS_UNICODE(mapping);
5241        maplen = PyUnicode_GET_SIZE(mapping);
5242        while (s < e) {
5243            unsigned char ch = *s;
5244            Py_UNICODE x = 0xfffe; /* illegal value */
5245
5246            if (ch < maplen)
5247                x = mapstring[ch];
5248
5249            if (x == 0xfffe) {
5250                /* undefined mapping */
5251                outpos = p-PyUnicode_AS_UNICODE(v);
5252                startinpos = s-starts;
5253                endinpos = startinpos+1;
5254                if (unicode_decode_call_errorhandler(
5255                        errors, &errorHandler,
5256                        "charmap", "character maps to <undefined>",
5257                        &starts, &e, &startinpos, &endinpos, &exc, &s,
5258                        &v, &outpos, &p)) {
5259                    goto onError;
5260                }
5261                continue;
5262            }
5263            *p++ = x;
5264            ++s;
5265        }
5266    }
5267    else {
5268        while (s < e) {
5269            unsigned char ch = *s;
5270            PyObject *w, *x;
5271
5272            /* Get mapping (char ordinal -> integer, Unicode char or None) */
5273            w = PyLong_FromLong((long)ch);
5274            if (w == NULL)
5275                goto onError;
5276            x = PyObject_GetItem(mapping, w);
5277            Py_DECREF(w);
5278            if (x == NULL) {
5279                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5280                    /* No mapping found means: mapping is undefined. */
5281                    PyErr_Clear();
5282                    x = Py_None;
5283                    Py_INCREF(x);
5284                } else
5285                    goto onError;
5286            }
5287
5288            /* Apply mapping */
5289            if (PyLong_Check(x)) {
5290                long value = PyLong_AS_LONG(x);
5291                if (value < 0 || value > 65535) {
5292                    PyErr_SetString(PyExc_TypeError,
5293                                    "character mapping must be in range(65536)");
5294                    Py_DECREF(x);
5295                    goto onError;
5296                }
5297                *p++ = (Py_UNICODE)value;
5298            }
5299            else if (x == Py_None) {
5300                /* undefined mapping */
5301                outpos = p-PyUnicode_AS_UNICODE(v);
5302                startinpos = s-starts;
5303                endinpos = startinpos+1;
5304                if (unicode_decode_call_errorhandler(
5305                        errors, &errorHandler,
5306                        "charmap", "character maps to <undefined>",
5307                        &starts, &e, &startinpos, &endinpos, &exc, &s,
5308                        &v, &outpos, &p)) {
5309                    Py_DECREF(x);
5310                    goto onError;
5311                }
5312                Py_DECREF(x);
5313                continue;
5314            }
5315            else if (PyUnicode_Check(x)) {
5316                Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
5317
5318                if (targetsize == 1)
5319                    /* 1-1 mapping */
5320                    *p++ = *PyUnicode_AS_UNICODE(x);
5321
5322                else if (targetsize > 1) {
5323                    /* 1-n mapping */
5324                    if (targetsize > extrachars) {
5325                        /* resize first */
5326                        Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
5327                        Py_ssize_t needed = (targetsize - extrachars) + \
5328                            (targetsize << 2);
5329                        extrachars += needed;
5330                        /* XXX overflow detection missing */
5331                        if (_PyUnicode_Resize(&v,
5332                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
5333                            Py_DECREF(x);
5334                            goto onError;
5335                        }
5336                        p = PyUnicode_AS_UNICODE(v) + oldpos;
5337                    }
5338                    Py_UNICODE_COPY(p,
5339                                    PyUnicode_AS_UNICODE(x),
5340                                    targetsize);
5341                    p += targetsize;
5342                    extrachars -= targetsize;
5343                }
5344                /* 1-0 mapping: skip the character */
5345            }
5346            else {
5347                /* wrong return value */
5348                PyErr_SetString(PyExc_TypeError,
5349                                "character mapping must return integer, None or str");
5350                Py_DECREF(x);
5351                goto onError;
5352            }
5353            Py_DECREF(x);
5354            ++s;
5355        }
5356    }
5357    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
5358        if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
5359            goto onError;
5360    Py_XDECREF(errorHandler);
5361    Py_XDECREF(exc);
5362    return (PyObject *)v;
5363
5364  onError:
5365    Py_XDECREF(errorHandler);
5366    Py_XDECREF(exc);
5367    Py_XDECREF(v);
5368    return NULL;
5369}
5370
5371/* Charmap encoding: the lookup table */
5372
5373struct encoding_map {
5374    PyObject_HEAD
5375    unsigned char level1[32];
5376    int count2, count3;
5377    unsigned char level23[1];
5378};
5379
5380static PyObject*
5381encoding_map_size(PyObject *obj, PyObject* args)
5382{
5383    struct encoding_map *map = (struct encoding_map*)obj;
5384    return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
5385                           128*map->count3);
5386}
5387
5388static PyMethodDef encoding_map_methods[] = {
5389    {"size", encoding_map_size, METH_NOARGS,
5390     PyDoc_STR("Return the size (in bytes) of this object") },
5391    { 0 }
5392};
5393
5394static void
5395encoding_map_dealloc(PyObject* o)
5396{
5397    PyObject_FREE(o);
5398}
5399
/* Type object for the lookup table returned by
   PyUnicode_BuildEncodingMap().  The only slots filled in are the name,
   the basic size, the deallocator, the default flags and the method
   table (which provides "size"); every other slot is left zero. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_reserved*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
5443
5444PyObject*
5445PyUnicode_BuildEncodingMap(PyObject* string)
5446{
5447    Py_UNICODE *decode;
5448    PyObject *result;
5449    struct encoding_map *mresult;
5450    int i;
5451    int need_dict = 0;
5452    unsigned char level1[32];
5453    unsigned char level2[512];
5454    unsigned char *mlevel1, *mlevel2, *mlevel3;
5455    int count2 = 0, count3 = 0;
5456
5457    if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5458        PyErr_BadArgument();
5459        return NULL;
5460    }
5461    decode = PyUnicode_AS_UNICODE(string);
5462    memset(level1, 0xFF, sizeof level1);
5463    memset(level2, 0xFF, sizeof level2);
5464
5465    /* If there isn't a one-to-one mapping of NULL to \0,
5466       or if there are non-BMP characters, we need to use
5467       a mapping dictionary. */
5468    if (decode[0] != 0)
5469        need_dict = 1;
5470    for (i = 1; i < 256; i++) {
5471        int l1, l2;
5472        if (decode[i] == 0
5473#ifdef Py_UNICODE_WIDE
5474            || decode[i] > 0xFFFF
5475#endif
5476            ) {
5477            need_dict = 1;
5478            break;
5479        }
5480        if (decode[i] == 0xFFFE)
5481            /* unmapped character */
5482            continue;
5483        l1 = decode[i] >> 11;
5484        l2 = decode[i] >> 7;
5485        if (level1[l1] == 0xFF)
5486            level1[l1] = count2++;
5487        if (level2[l2] == 0xFF)
5488            level2[l2] = count3++;
5489    }
5490
5491    if (count2 >= 0xFF || count3 >= 0xFF)
5492        need_dict = 1;
5493
5494    if (need_dict) {
5495        PyObject *result = PyDict_New();
5496        PyObject *key, *value;
5497        if (!result)
5498            return NULL;
5499        for (i = 0; i < 256; i++) {
5500            key = PyLong_FromLong(decode[i]);
5501            value = PyLong_FromLong(i);
5502            if (!key || !value)
5503                goto failed1;
5504            if (PyDict_SetItem(result, key, value) == -1)
5505                goto failed1;
5506            Py_DECREF(key);
5507            Py_DECREF(value);
5508        }
5509        return result;
5510      failed1:
5511        Py_XDECREF(key);
5512        Py_XDECREF(value);
5513        Py_DECREF(result);
5514        return NULL;
5515    }
5516
5517    /* Create a three-level trie */
5518    result = PyObject_MALLOC(sizeof(struct encoding_map) +
5519                             16*count2 + 128*count3 - 1);
5520    if (!result)
5521        return PyErr_NoMemory();
5522    PyObject_Init(result, &EncodingMapType);
5523    mresult = (struct encoding_map*)result;
5524    mresult->count2 = count2;
5525    mresult->count3 = count3;
5526    mlevel1 = mresult->level1;
5527    mlevel2 = mresult->level23;
5528    mlevel3 = mresult->level23 + 16*count2;
5529    memcpy(mlevel1, level1, 32);
5530    memset(mlevel2, 0xFF, 16*count2);
5531    memset(mlevel3, 0, 128*count3);
5532    count3 = 0;
5533    for (i = 1; i < 256; i++) {
5534        int o1, o2, o3, i2, i3;
5535        if (decode[i] == 0xFFFE)
5536            /* unmapped character */
5537            continue;
5538        o1 = decode[i]>>11;
5539        o2 = (decode[i]>>7) & 0xF;
5540        i2 = 16*mlevel1[o1] + o2;
5541        if (mlevel2[i2] == 0xFF)
5542            mlevel2[i2] = count3++;
5543        o3 = decode[i] & 0x7F;
5544        i3 = 128*mlevel2[i2] + o3;
5545        mlevel3[i3] = i;
5546    }
5547    return result;
5548}
5549
5550static int
5551encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5552{
5553    struct encoding_map *map = (struct encoding_map*)mapping;
5554    int l1 = c>>11;
5555    int l2 = (c>>7) & 0xF;
5556    int l3 = c & 0x7F;
5557    int i;
5558
5559#ifdef Py_UNICODE_WIDE
5560    if (c > 0xFFFF) {
5561        return -1;
5562    }
5563#endif
5564    if (c == 0)
5565        return 0;
5566    /* level 1*/
5567    i = map->level1[l1];
5568    if (i == 0xFF) {
5569        return -1;
5570    }
5571    /* level 2*/
5572    i = map->level23[16*i+l2];
5573    if (i == 0xFF) {
5574        return -1;
5575    }
5576    /* level 3 */
5577    i = map->level23[16*map->count2 + 128*i + l3];
5578    if (i == 0) {
5579        return -1;
5580    }
5581    return i;
5582}
5583
5584/* Lookup the character ch in the mapping. If the character
5585   can't be found, Py_None is returned (or NULL, if another
5586   error occurred). */
5587static PyObject *
5588charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
5589{
5590    PyObject *w = PyLong_FromLong((long)c);
5591    PyObject *x;
5592
5593    if (w == NULL)
5594        return NULL;
5595    x = PyObject_GetItem(mapping, w);
5596    Py_DECREF(w);
5597    if (x == NULL) {
5598        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5599            /* No mapping found means: mapping is undefined. */
5600            PyErr_Clear();
5601            x = Py_None;
5602            Py_INCREF(x);
5603            return x;
5604        } else
5605            return NULL;
5606    }
5607    else if (x == Py_None)
5608        return x;
5609    else if (PyLong_Check(x)) {
5610        long value = PyLong_AS_LONG(x);
5611        if (value < 0 || value > 255) {
5612            PyErr_SetString(PyExc_TypeError,
5613                            "character mapping must be in range(256)");
5614            Py_DECREF(x);
5615            return NULL;
5616        }
5617        return x;
5618    }
5619    else if (PyBytes_Check(x))
5620        return x;
5621    else {
5622        /* wrong return value */
5623        PyErr_Format(PyExc_TypeError,
5624                     "character mapping must return integer, bytes or None, not %.400s",
5625                     x->ob_type->tp_name);
5626        Py_DECREF(x);
5627        return NULL;
5628    }
5629}
5630
5631static int
5632charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
5633{
5634    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5635    /* exponentially overallocate to minimize reallocations */
5636    if (requiredsize < 2*outsize)
5637        requiredsize = 2*outsize;
5638    if (_PyBytes_Resize(outobj, requiredsize))
5639        return -1;
5640    return 0;
5641}
5642
/* Outcome of charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* the lookup or a buffer resize failed */
} charmapencode_result;
5646/* lookup the character, put the result in the output string and adjust
5647   various state variables. Resize the output bytes object if not enough
5648   space is available. Return a new reference to the object that
5649   was put in the output buffer, or Py_None, if the mapping was undefined
5650   (in which case no character was written) or NULL, if a
5651   reallocation error occurred. The caller must decref the result */
5652static charmapencode_result
5653charmapencode_output(Py_UNICODE c, PyObject *mapping,
5654                     PyObject **outobj, Py_ssize_t *outpos)
5655{
5656    PyObject *rep;
5657    char *outstart;
5658    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5659
5660    if (Py_TYPE(mapping) == &EncodingMapType) {
5661        int res = encoding_map_lookup(c, mapping);
5662        Py_ssize_t requiredsize = *outpos+1;
5663        if (res == -1)
5664            return enc_FAILED;
5665        if (outsize<requiredsize)
5666            if (charmapencode_resize(outobj, outpos, requiredsize))
5667                return enc_EXCEPTION;
5668        outstart = PyBytes_AS_STRING(*outobj);
5669        outstart[(*outpos)++] = (char)res;
5670        return enc_SUCCESS;
5671    }
5672
5673    rep = charmapencode_lookup(c, mapping);
5674    if (rep==NULL)
5675        return enc_EXCEPTION;
5676    else if (rep==Py_None) {
5677        Py_DECREF(rep);
5678        return enc_FAILED;
5679    } else {
5680        if (PyLong_Check(rep)) {
5681            Py_ssize_t requiredsize = *outpos+1;
5682            if (outsize<requiredsize)
5683                if (charmapencode_resize(outobj, outpos, requiredsize)) {
5684                    Py_DECREF(rep);
5685                    return enc_EXCEPTION;
5686                }
5687            outstart = PyBytes_AS_STRING(*outobj);
5688            outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
5689        }
5690        else {
5691            const char *repchars = PyBytes_AS_STRING(rep);
5692            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5693            Py_ssize_t requiredsize = *outpos+repsize;
5694            if (outsize<requiredsize)
5695                if (charmapencode_resize(outobj, outpos, requiredsize)) {
5696                    Py_DECREF(rep);
5697                    return enc_EXCEPTION;
5698                }
5699            outstart = PyBytes_AS_STRING(*outobj);
5700            memcpy(outstart + *outpos, repchars, repsize);
5701            *outpos += repsize;
5702        }
5703    }
5704    Py_DECREF(rep);
5705    return enc_SUCCESS;
5706}
5707
5708/* handle an error in PyUnicode_EncodeCharmap
5709   Return 0 on success, -1 on error */
5710static int
5711charmap_encoding_error(
5712    const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
5713    PyObject **exceptionObject,
5714    int *known_errorHandler, PyObject **errorHandler, const char *errors,
5715    PyObject **res, Py_ssize_t *respos)
5716{
5717    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5718    Py_ssize_t repsize;
5719    Py_ssize_t newpos;
5720    Py_UNICODE *uni2;
5721    /* startpos for collecting unencodable chars */
5722    Py_ssize_t collstartpos = *inpos;
5723    Py_ssize_t collendpos = *inpos+1;
5724    Py_ssize_t collpos;
5725    char *encoding = "charmap";
5726    char *reason = "character maps to <undefined>";
5727    charmapencode_result x;
5728
5729    /* find all unencodable characters */
5730    while (collendpos < size) {
5731        PyObject *rep;
5732        if (Py_TYPE(mapping) == &EncodingMapType) {
5733            int res = encoding_map_lookup(p[collendpos], mapping);
5734            if (res != -1)
5735                break;
5736            ++collendpos;
5737            continue;
5738        }
5739
5740        rep = charmapencode_lookup(p[collendpos], mapping);
5741        if (rep==NULL)
5742            return -1;
5743        else if (rep!=Py_None) {
5744            Py_DECREF(rep);
5745            break;
5746        }
5747        Py_DECREF(rep);
5748        ++collendpos;
5749    }
5750    /* cache callback name lookup
5751     * (if not done yet, i.e. it's the first error) */
5752    if (*known_errorHandler==-1) {
5753        if ((errors==NULL) || (!strcmp(errors, "strict")))
5754            *known_errorHandler = 1;
5755        else if (!strcmp(errors, "replace"))
5756            *known_errorHandler = 2;
5757        else if (!strcmp(errors, "ignore"))
5758            *known_errorHandler = 3;
5759        else if (!strcmp(errors, "xmlcharrefreplace"))
5760            *known_errorHandler = 4;
5761        else
5762            *known_errorHandler = 0;
5763    }
5764    switch (*known_errorHandler) {
5765    case 1: /* strict */
5766        raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5767        return -1;
5768    case 2: /* replace */
5769        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
5770            x = charmapencode_output('?', mapping, res, respos);
5771            if (x==enc_EXCEPTION) {
5772                return -1;
5773            }
5774            else if (x==enc_FAILED) {
5775                raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5776                return -1;
5777            }
5778        }
5779        /* fall through */
5780    case 3: /* ignore */
5781        *inpos = collendpos;
5782        break;
5783    case 4: /* xmlcharrefreplace */
5784        /* generate replacement (temporarily (mis)uses p) */
5785        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
5786            char buffer[2+29+1+1];
5787            char *cp;
5788            sprintf(buffer, "&#%d;", (int)p[collpos]);
5789            for (cp = buffer; *cp; ++cp) {
5790                x = charmapencode_output(*cp, mapping, res, respos);
5791                if (x==enc_EXCEPTION)
5792                    return -1;
5793                else if (x==enc_FAILED) {
5794                    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5795                    return -1;
5796                }
5797            }
5798        }
5799        *inpos = collendpos;
5800        break;
5801    default:
5802        repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
5803                                                      encoding, reason, p, size, exceptionObject,
5804                                                      collstartpos, collendpos, &newpos);
5805        if (repunicode == NULL)
5806            return -1;
5807        if (PyBytes_Check(repunicode)) {
5808            /* Directly copy bytes result to output. */
5809            Py_ssize_t outsize = PyBytes_Size(*res);
5810            Py_ssize_t requiredsize;
5811            repsize = PyBytes_Size(repunicode);
5812            requiredsize = *respos + repsize;
5813            if (requiredsize > outsize)
5814                /* Make room for all additional bytes. */
5815                if (charmapencode_resize(res, respos, requiredsize)) {
5816                    Py_DECREF(repunicode);
5817                    return -1;
5818                }
5819            memcpy(PyBytes_AsString(*res) + *respos,
5820                   PyBytes_AsString(repunicode),  repsize);
5821            *respos += repsize;
5822            *inpos = newpos;
5823            Py_DECREF(repunicode);
5824            break;
5825        }
5826        /* generate replacement  */
5827        repsize = PyUnicode_GET_SIZE(repunicode);
5828        for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5829            x = charmapencode_output(*uni2, mapping, res, respos);
5830            if (x==enc_EXCEPTION) {
5831                return -1;
5832            }
5833            else if (x==enc_FAILED) {
5834                Py_DECREF(repunicode);
5835                raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5836                return -1;
5837            }
5838        }
5839        *inpos = newpos;
5840        Py_DECREF(repunicode);
5841    }
5842    return 0;
5843}
5844
5845PyObject *
5846PyUnicode_EncodeCharmap(const Py_UNICODE *p,
5847                        Py_ssize_t size,
5848                        PyObject *mapping,
5849                        const char *errors)
5850{
5851    /* output object */
5852    PyObject *res = NULL;
5853    /* current input position */
5854    Py_ssize_t inpos = 0;
5855    /* current output position */
5856    Py_ssize_t respos = 0;
5857    PyObject *errorHandler = NULL;
5858    PyObject *exc = NULL;
5859    /* the following variable is used for caching string comparisons
5860     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5861     * 3=ignore, 4=xmlcharrefreplace */
5862    int known_errorHandler = -1;
5863
5864    /* Default to Latin-1 */
5865    if (mapping == NULL)
5866        return PyUnicode_EncodeLatin1(p, size, errors);
5867
5868    /* allocate enough for a simple encoding without
5869       replacements, if we need more, we'll resize */
5870    res = PyBytes_FromStringAndSize(NULL, size);
5871    if (res == NULL)
5872        goto onError;
5873    if (size == 0)
5874        return res;
5875
5876    while (inpos<size) {
5877        /* try to encode it */
5878        charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5879        if (x==enc_EXCEPTION) /* error */
5880            goto onError;
5881        if (x==enc_FAILED) { /* unencodable character */
5882            if (charmap_encoding_error(p, size, &inpos, mapping,
5883                                       &exc,
5884                                       &known_errorHandler, &errorHandler, errors,
5885                                       &res, &respos)) {
5886                goto onError;
5887            }
5888        }
5889        else
5890            /* done with this character => adjust input position */
5891            ++inpos;
5892    }
5893
5894    /* Resize if we allocated to much */
5895    if (respos<PyBytes_GET_SIZE(res))
5896        if (_PyBytes_Resize(&res, respos) < 0)
5897            goto onError;
5898
5899    Py_XDECREF(exc);
5900    Py_XDECREF(errorHandler);
5901    return res;
5902
5903  onError:
5904    Py_XDECREF(res);
5905    Py_XDECREF(exc);
5906    Py_XDECREF(errorHandler);
5907    return NULL;
5908}
5909
5910PyObject *
5911PyUnicode_AsCharmapString(PyObject *unicode,
5912                          PyObject *mapping)
5913{
5914    if (!PyUnicode_Check(unicode) || mapping == NULL) {
5915        PyErr_BadArgument();
5916        return NULL;
5917    }
5918    return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
5919                                   PyUnicode_GET_SIZE(unicode),
5920                                   mapping,
5921                                   NULL);
5922}
5923
5924/* create or adjust a UnicodeTranslateError */
5925static void
5926make_translate_exception(PyObject **exceptionObject,
5927                         const Py_UNICODE *unicode, Py_ssize_t size,
5928                         Py_ssize_t startpos, Py_ssize_t endpos,
5929                         const char *reason)
5930{
5931    if (*exceptionObject == NULL) {
5932        *exceptionObject = PyUnicodeTranslateError_Create(
5933            unicode, size, startpos, endpos, reason);
5934    }
5935    else {
5936        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5937            goto onError;
5938        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5939            goto onError;
5940        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5941            goto onError;
5942        return;
5943      onError:
5944        Py_DECREF(*exceptionObject);
5945        *exceptionObject = NULL;
5946    }
5947}
5948
5949/* raises a UnicodeTranslateError */
5950static void
5951raise_translate_exception(PyObject **exceptionObject,
5952                          const Py_UNICODE *unicode, Py_ssize_t size,
5953                          Py_ssize_t startpos, Py_ssize_t endpos,
5954                          const char *reason)
5955{
5956    make_translate_exception(exceptionObject,
5957                             unicode, size, startpos, endpos, reason);
5958    if (*exceptionObject != NULL)
5959        PyCodec_StrictErrors(*exceptionObject);
5960}
5961
5962/* error handling callback helper:
5963   build arguments, call the callback and check the arguments,
5964   put the result into newpos and return the replacement string, which
5965   has to be freed by the caller */
5966static PyObject *
5967unicode_translate_call_errorhandler(const char *errors,
5968                                    PyObject **errorHandler,
5969                                    const char *reason,
5970                                    const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5971                                    Py_ssize_t startpos, Py_ssize_t endpos,
5972                                    Py_ssize_t *newpos)
5973{
5974    static char *argparse = "O!n;translating error handler must return (str, int) tuple";
5975
5976    Py_ssize_t i_newpos;
5977    PyObject *restuple;
5978    PyObject *resunicode;
5979
5980    if (*errorHandler == NULL) {
5981        *errorHandler = PyCodec_LookupError(errors);
5982        if (*errorHandler == NULL)
5983            return NULL;
5984    }
5985
5986    make_translate_exception(exceptionObject,
5987                             unicode, size, startpos, endpos, reason);
5988    if (*exceptionObject == NULL)
5989        return NULL;
5990
5991    restuple = PyObject_CallFunctionObjArgs(
5992        *errorHandler, *exceptionObject, NULL);
5993    if (restuple == NULL)
5994        return NULL;
5995    if (!PyTuple_Check(restuple)) {
5996        PyErr_SetString(PyExc_TypeError, &argparse[4]);
5997        Py_DECREF(restuple);
5998        return NULL;
5999    }
6000    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
6001                          &resunicode, &i_newpos)) {
6002        Py_DECREF(restuple);
6003        return NULL;
6004    }
6005    if (i_newpos<0)
6006        *newpos = size+i_newpos;
6007    else
6008        *newpos = i_newpos;
6009    if (*newpos<0 || *newpos>size) {
6010        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6011        Py_DECREF(restuple);
6012        return NULL;
6013    }
6014    Py_INCREF(resunicode);
6015    Py_DECREF(restuple);
6016    return resunicode;
6017}
6018
6019/* Lookup the character ch in the mapping and put the result in result,
6020   which must be decrefed by the caller.
6021   Return 0 on success, -1 on error */
6022static int
6023charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
6024{
6025    PyObject *w = PyLong_FromLong((long)c);
6026    PyObject *x;
6027
6028    if (w == NULL)
6029        return -1;
6030    x = PyObject_GetItem(mapping, w);
6031    Py_DECREF(w);
6032    if (x == NULL) {
6033        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6034            /* No mapping found means: use 1:1 mapping. */
6035            PyErr_Clear();
6036            *result = NULL;
6037            return 0;
6038        } else
6039            return -1;
6040    }
6041    else if (x == Py_None) {
6042        *result = x;
6043        return 0;
6044    }
6045    else if (PyLong_Check(x)) {
6046        long value = PyLong_AS_LONG(x);
6047        long max = PyUnicode_GetMax();
6048        if (value < 0 || value > max) {
6049            PyErr_Format(PyExc_TypeError,
6050                         "character mapping must be in range(0x%x)", max+1);
6051            Py_DECREF(x);
6052            return -1;
6053        }
6054        *result = x;
6055        return 0;
6056    }
6057    else if (PyUnicode_Check(x)) {
6058        *result = x;
6059        return 0;
6060    }
6061    else {
6062        /* wrong return value */
6063        PyErr_SetString(PyExc_TypeError,
6064                        "character mapping must return integer, None or str");
6065        Py_DECREF(x);
6066        return -1;
6067    }
6068}
6069/* ensure that *outobj is at least requiredsize characters long,
6070   if not reallocate and adjust various state variables.
6071   Return 0 on success, -1 on error */
6072static int
6073charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
6074                               Py_ssize_t requiredsize)
6075{
6076    Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
6077    if (requiredsize > oldsize) {
6078        /* remember old output position */
6079        Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
6080        /* exponentially overallocate to minimize reallocations */
6081        if (requiredsize < 2 * oldsize)
6082            requiredsize = 2 * oldsize;
6083        if (PyUnicode_Resize(outobj, requiredsize) < 0)
6084            return -1;
6085        *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
6086    }
6087    return 0;
6088}
6089/* lookup the character, put the result in the output string and adjust
6090   various state variables. Return a new reference to the object that
6091   was put in the output buffer in *result, or Py_None, if the mapping was
6092   undefined (in which case no character was written).
6093   The called must decref result.
6094   Return 0 on success, -1 on error. */
6095static int
6096charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
6097                        Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
6098                        PyObject **res)
6099{
6100    if (charmaptranslate_lookup(*curinp, mapping, res))
6101        return -1;
6102    if (*res==NULL) {
6103        /* not found => default to 1:1 mapping */
6104        *(*outp)++ = *curinp;
6105    }
6106    else if (*res==Py_None)
6107        ;
6108    else if (PyLong_Check(*res)) {
6109        /* no overflow check, because we know that the space is enough */
6110        *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
6111    }
6112    else if (PyUnicode_Check(*res)) {
6113        Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
6114        if (repsize==1) {
6115            /* no overflow check, because we know that the space is enough */
6116            *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
6117        }
6118        else if (repsize!=0) {
6119            /* more than one character */
6120            Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
6121                (insize - (curinp-startinp)) +
6122                repsize - 1;
6123            if (charmaptranslate_makespace(outobj, outp, requiredsize))
6124                return -1;
6125            memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
6126            *outp += repsize;
6127        }
6128    }
6129    else
6130        return -1;
6131    return 0;
6132}
6133
6134PyObject *
6135PyUnicode_TranslateCharmap(const Py_UNICODE *p,
6136                           Py_ssize_t size,
6137                           PyObject *mapping,
6138                           const char *errors)
6139{
6140    /* output object */
6141    PyObject *res = NULL;
6142    /* pointers to the beginning and end+1 of input */
6143    const Py_UNICODE *startp = p;
6144    const Py_UNICODE *endp = p + size;
6145    /* pointer into the output */
6146    Py_UNICODE *str;
6147    /* current output position */
6148    Py_ssize_t respos = 0;
6149    char *reason = "character maps to <undefined>";
6150    PyObject *errorHandler = NULL;
6151    PyObject *exc = NULL;
6152    /* the following variable is used for caching string comparisons
6153     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
6154     * 3=ignore, 4=xmlcharrefreplace */
6155    int known_errorHandler = -1;
6156
6157    if (mapping == NULL) {
6158        PyErr_BadArgument();
6159        return NULL;
6160    }
6161
6162    /* allocate enough for a simple 1:1 translation without
6163       replacements, if we need more, we'll resize */
6164    res = PyUnicode_FromUnicode(NULL, size);
6165    if (res == NULL)
6166        goto onError;
6167    if (size == 0)
6168        return res;
6169    str = PyUnicode_AS_UNICODE(res);
6170
6171    while (p<endp) {
6172        /* try to encode it */
6173        PyObject *x = NULL;
6174        if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
6175            Py_XDECREF(x);
6176            goto onError;
6177        }
6178        Py_XDECREF(x);
6179        if (x!=Py_None) /* it worked => adjust input pointer */
6180            ++p;
6181        else { /* untranslatable character */
6182            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
6183            Py_ssize_t repsize;
6184            Py_ssize_t newpos;
6185            Py_UNICODE *uni2;
6186            /* startpos for collecting untranslatable chars */
6187            const Py_UNICODE *collstart = p;
6188            const Py_UNICODE *collend = p+1;
6189            const Py_UNICODE *coll;
6190
6191            /* find all untranslatable characters */
6192            while (collend < endp) {
6193                if (charmaptranslate_lookup(*collend, mapping, &x))
6194                    goto onError;
6195                Py_XDECREF(x);
6196                if (x!=Py_None)
6197                    break;
6198                ++collend;
6199            }
6200            /* cache callback name lookup
6201             * (if not done yet, i.e. it's the first error) */
6202            if (known_errorHandler==-1) {
6203                if ((errors==NULL) || (!strcmp(errors, "strict")))
6204                    known_errorHandler = 1;
6205                else if (!strcmp(errors, "replace"))
6206                    known_errorHandler = 2;
6207                else if (!strcmp(errors, "ignore"))
6208                    known_errorHandler = 3;
6209                else if (!strcmp(errors, "xmlcharrefreplace"))
6210                    known_errorHandler = 4;
6211                else
6212                    known_errorHandler = 0;
6213            }
6214            switch (known_errorHandler) {
6215            case 1: /* strict */
6216                raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
6217                goto onError;
6218            case 2: /* replace */
6219                /* No need to check for space, this is a 1:1 replacement */
6220                for (coll = collstart; coll<collend; ++coll)
6221                    *str++ = '?';
6222                /* fall through */
6223            case 3: /* ignore */
6224                p = collend;
6225                break;
6226            case 4: /* xmlcharrefreplace */
6227                /* generate replacement (temporarily (mis)uses p) */
6228                for (p = collstart; p < collend; ++p) {
6229                    char buffer[2+29+1+1];
6230                    char *cp;
6231                    sprintf(buffer, "&#%d;", (int)*p);
6232                    if (charmaptranslate_makespace(&res, &str,
6233                                                   (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
6234                        goto onError;
6235                    for (cp = buffer; *cp; ++cp)
6236                        *str++ = *cp;
6237                }
6238                p = collend;
6239                break;
6240            default:
6241                repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
6242                                                                 reason, startp, size, &exc,
6243                                                                 collstart-startp, collend-startp, &newpos);
6244                if (repunicode == NULL)
6245                    goto onError;
6246                /* generate replacement  */
6247                repsize = PyUnicode_GET_SIZE(repunicode);
6248                if (charmaptranslate_makespace(&res, &str,
6249                                               (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
6250                    Py_DECREF(repunicode);
6251                    goto onError;
6252                }
6253                for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
6254                    *str++ = *uni2;
6255                p = startp + newpos;
6256                Py_DECREF(repunicode);
6257            }
6258        }
6259    }
6260    /* Resize if we allocated to much */
6261    respos = str-PyUnicode_AS_UNICODE(res);
6262    if (respos<PyUnicode_GET_SIZE(res)) {
6263        if (PyUnicode_Resize(&res, respos) < 0)
6264            goto onError;
6265    }
6266    Py_XDECREF(exc);
6267    Py_XDECREF(errorHandler);
6268    return res;
6269
6270  onError:
6271    Py_XDECREF(res);
6272    Py_XDECREF(exc);
6273    Py_XDECREF(errorHandler);
6274    return NULL;
6275}
6276
6277PyObject *
6278PyUnicode_Translate(PyObject *str,
6279                    PyObject *mapping,
6280                    const char *errors)
6281{
6282    PyObject *result;
6283
6284    str = PyUnicode_FromObject(str);
6285    if (str == NULL)
6286        goto onError;
6287    result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
6288                                        PyUnicode_GET_SIZE(str),
6289                                        mapping,
6290                                        errors);
6291    Py_DECREF(str);
6292    return result;
6293
6294  onError:
6295    Py_XDECREF(str);
6296    return NULL;
6297}
6298
6299PyObject *
6300PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
6301                                  Py_ssize_t length)
6302{
6303    PyObject *result;
6304    Py_UNICODE *p; /* write pointer into result */
6305    Py_ssize_t i;
6306    /* Copy to a new string */
6307    result = (PyObject *)_PyUnicode_New(length);
6308    Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
6309    if (result == NULL)
6310        return result;
6311    p = PyUnicode_AS_UNICODE(result);
6312    /* Iterate over code points */
6313    for (i = 0; i < length; i++) {
6314        Py_UNICODE ch =s[i];
6315        if (ch > 127) {
6316            int decimal = Py_UNICODE_TODECIMAL(ch);
6317            if (decimal >= 0)
6318                p[i] = '0' + decimal;
6319        }
6320    }
6321    return result;
6322}
6323/* --- Decimal Encoder ---------------------------------------------------- */
6324
6325int
6326PyUnicode_EncodeDecimal(Py_UNICODE *s,
6327                        Py_ssize_t length,
6328                        char *output,
6329                        const char *errors)
6330{
6331    Py_UNICODE *p, *end;
6332    PyObject *errorHandler = NULL;
6333    PyObject *exc = NULL;
6334    const char *encoding = "decimal";
6335    const char *reason = "invalid decimal Unicode string";
6336    /* the following variable is used for caching string comparisons
6337     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6338    int known_errorHandler = -1;
6339
6340    if (output == NULL) {
6341        PyErr_BadArgument();
6342        return -1;
6343    }
6344
6345    p = s;
6346    end = s + length;
6347    while (p < end) {
6348        register Py_UNICODE ch = *p;
6349        int decimal;
6350        PyObject *repunicode;
6351        Py_ssize_t repsize;
6352        Py_ssize_t newpos;
6353        Py_UNICODE *uni2;
6354        Py_UNICODE *collstart;
6355        Py_UNICODE *collend;
6356
6357        if (Py_UNICODE_ISSPACE(ch)) {
6358            *output++ = ' ';
6359            ++p;
6360            continue;
6361        }
6362        decimal = Py_UNICODE_TODECIMAL(ch);
6363        if (decimal >= 0) {
6364            *output++ = '0' + decimal;
6365            ++p;
6366            continue;
6367        }
6368        if (0 < ch && ch < 256) {
6369            *output++ = (char)ch;
6370            ++p;
6371            continue;
6372        }
6373        /* All other characters are considered unencodable */
6374        collstart = p;
6375        collend = p+1;
6376        while (collend < end) {
6377            if ((0 < *collend && *collend < 256) ||
6378                !Py_UNICODE_ISSPACE(*collend) ||
6379                Py_UNICODE_TODECIMAL(*collend))
6380                break;
6381        }
6382        /* cache callback name lookup
6383         * (if not done yet, i.e. it's the first error) */
6384        if (known_errorHandler==-1) {
6385            if ((errors==NULL) || (!strcmp(errors, "strict")))
6386                known_errorHandler = 1;
6387            else if (!strcmp(errors, "replace"))
6388                known_errorHandler = 2;
6389            else if (!strcmp(errors, "ignore"))
6390                known_errorHandler = 3;
6391            else if (!strcmp(errors, "xmlcharrefreplace"))
6392                known_errorHandler = 4;
6393            else
6394                known_errorHandler = 0;
6395        }
6396        switch (known_errorHandler) {
6397        case 1: /* strict */
6398            raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
6399            goto onError;
6400        case 2: /* replace */
6401            for (p = collstart; p < collend; ++p)
6402                *output++ = '?';
6403            /* fall through */
6404        case 3: /* ignore */
6405            p = collend;
6406            break;
6407        case 4: /* xmlcharrefreplace */
6408            /* generate replacement (temporarily (mis)uses p) */
6409            for (p = collstart; p < collend; ++p)
6410                output += sprintf(output, "&#%d;", (int)*p);
6411            p = collend;
6412            break;
6413        default:
6414            repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6415                                                          encoding, reason, s, length, &exc,
6416                                                          collstart-s, collend-s, &newpos);
6417            if (repunicode == NULL)
6418                goto onError;
6419            if (!PyUnicode_Check(repunicode)) {
6420                /* Byte results not supported, since they have no decimal property. */
6421                PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
6422                Py_DECREF(repunicode);
6423                goto onError;
6424            }
6425            /* generate replacement  */
6426            repsize = PyUnicode_GET_SIZE(repunicode);
6427            for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6428                Py_UNICODE ch = *uni2;
6429                if (Py_UNICODE_ISSPACE(ch))
6430                    *output++ = ' ';
6431                else {
6432                    decimal = Py_UNICODE_TODECIMAL(ch);
6433                    if (decimal >= 0)
6434                        *output++ = '0' + decimal;
6435                    else if (0 < ch && ch < 256)
6436                        *output++ = (char)ch;
6437                    else {
6438                        Py_DECREF(repunicode);
6439                        raise_encode_exception(&exc, encoding,
6440                                               s, length, collstart-s, collend-s, reason);
6441                        goto onError;
6442                    }
6443                }
6444            }
6445            p = s + newpos;
6446            Py_DECREF(repunicode);
6447        }
6448    }
6449    /* 0-terminate the output string */
6450    *output++ = '\0';
6451    Py_XDECREF(exc);
6452    Py_XDECREF(errorHandler);
6453    return 0;
6454
6455  onError:
6456    Py_XDECREF(exc);
6457    Py_XDECREF(errorHandler);
6458    return -1;
6459}
6460
6461/* --- Helpers ------------------------------------------------------------ */
6462
6463#include "stringlib/unicodedefs.h"
6464#include "stringlib/fastsearch.h"
6465
6466#include "stringlib/count.h"
6467#include "stringlib/find.h"
6468#include "stringlib/partition.h"
6469#include "stringlib/split.h"
6470
6471#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
6472#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
6473#include "stringlib/localeutil.h"
6474
/* Helper macro: clamp the slice bounds `start` and `end` in place against a
   sequence of length `len`.  `end` is capped at `len`; a negative bound has
   `len` added to it once and is then floored at 0.  `start` is NOT capped at
   `len`, so callers must cope with start > end afterwards.

   `start` and `end` must be modifiable lvalues and every argument may be
   evaluated more than once.  The do/while(0) wrapper turns the expansion
   into a single statement, so the macro is safe in an unbraced if/else arm;
   it must be followed by a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
6489
6490Py_ssize_t
6491PyUnicode_Count(PyObject *str,
6492                PyObject *substr,
6493                Py_ssize_t start,
6494                Py_ssize_t end)
6495{
6496    Py_ssize_t result;
6497    PyUnicodeObject* str_obj;
6498    PyUnicodeObject* sub_obj;
6499
6500    str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6501    if (!str_obj)
6502        return -1;
6503    sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6504    if (!sub_obj) {
6505        Py_DECREF(str_obj);
6506        return -1;
6507    }
6508
6509    ADJUST_INDICES(start, end, str_obj->length);
6510    result = stringlib_count(
6511        str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6512        PY_SSIZE_T_MAX
6513        );
6514
6515    Py_DECREF(sub_obj);
6516    Py_DECREF(str_obj);
6517
6518    return result;
6519}
6520
6521Py_ssize_t
6522PyUnicode_Find(PyObject *str,
6523               PyObject *sub,
6524               Py_ssize_t start,
6525               Py_ssize_t end,
6526               int direction)
6527{
6528    Py_ssize_t result;
6529
6530    str = PyUnicode_FromObject(str);
6531    if (!str)
6532        return -2;
6533    sub = PyUnicode_FromObject(sub);
6534    if (!sub) {
6535        Py_DECREF(str);
6536        return -2;
6537    }
6538
6539    if (direction > 0)
6540        result = stringlib_find_slice(
6541            PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6542            PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6543            start, end
6544            );
6545    else
6546        result = stringlib_rfind_slice(
6547            PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6548            PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6549            start, end
6550            );
6551
6552    Py_DECREF(str);
6553    Py_DECREF(sub);
6554
6555    return result;
6556}
6557
6558static int
6559tailmatch(PyUnicodeObject *self,
6560          PyUnicodeObject *substring,
6561          Py_ssize_t start,
6562          Py_ssize_t end,
6563          int direction)
6564{
6565    if (substring->length == 0)
6566        return 1;
6567
6568    ADJUST_INDICES(start, end, self->length);
6569    end -= substring->length;
6570    if (end < start)
6571        return 0;
6572
6573    if (direction > 0) {
6574        if (Py_UNICODE_MATCH(self, end, substring))
6575            return 1;
6576    } else {
6577        if (Py_UNICODE_MATCH(self, start, substring))
6578            return 1;
6579    }
6580
6581    return 0;
6582}
6583
6584Py_ssize_t
6585PyUnicode_Tailmatch(PyObject *str,
6586                    PyObject *substr,
6587                    Py_ssize_t start,
6588                    Py_ssize_t end,
6589                    int direction)
6590{
6591    Py_ssize_t result;
6592
6593    str = PyUnicode_FromObject(str);
6594    if (str == NULL)
6595        return -1;
6596    substr = PyUnicode_FromObject(substr);
6597    if (substr == NULL) {
6598        Py_DECREF(str);
6599        return -1;
6600    }
6601
6602    result = tailmatch((PyUnicodeObject *)str,
6603                       (PyUnicodeObject *)substr,
6604                       start, end, direction);
6605    Py_DECREF(str);
6606    Py_DECREF(substr);
6607    return result;
6608}
6609
6610/* Apply fixfct filter to the Unicode object self and return a
6611   reference to the modified object */
6612
6613static PyObject *
6614fixup(PyUnicodeObject *self,
6615      int (*fixfct)(PyUnicodeObject *s))
6616{
6617
6618    PyUnicodeObject *u;
6619
6620    u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6621    if (u == NULL)
6622        return NULL;
6623
6624    Py_UNICODE_COPY(u->str, self->str, self->length);
6625
6626    if (!fixfct(u) && PyUnicode_CheckExact(self)) {
6627        /* fixfct should return TRUE if it modified the buffer. If
6628           FALSE, return a reference to the original buffer instead
6629           (to save space, not time) */
6630        Py_INCREF(self);
6631        Py_DECREF(u);
6632        return (PyObject*) self;
6633    }
6634    return (PyObject*) u;
6635}
6636
6637static int
6638fixupper(PyUnicodeObject *self)
6639{
6640    Py_ssize_t len = self->length;
6641    Py_UNICODE *s = self->str;
6642    int status = 0;
6643
6644    while (len-- > 0) {
6645        register Py_UNICODE ch;
6646
6647        ch = Py_UNICODE_TOUPPER(*s);
6648        if (ch != *s) {
6649            status = 1;
6650            *s = ch;
6651        }
6652        s++;
6653    }
6654
6655    return status;
6656}
6657
6658static int
6659fixlower(PyUnicodeObject *self)
6660{
6661    Py_ssize_t len = self->length;
6662    Py_UNICODE *s = self->str;
6663    int status = 0;
6664
6665    while (len-- > 0) {
6666        register Py_UNICODE ch;
6667
6668        ch = Py_UNICODE_TOLOWER(*s);
6669        if (ch != *s) {
6670            status = 1;
6671            *s = ch;
6672        }
6673        s++;
6674    }
6675
6676    return status;
6677}
6678
6679static int
6680fixswapcase(PyUnicodeObject *self)
6681{
6682    Py_ssize_t len = self->length;
6683    Py_UNICODE *s = self->str;
6684    int status = 0;
6685
6686    while (len-- > 0) {
6687        if (Py_UNICODE_ISUPPER(*s)) {
6688            *s = Py_UNICODE_TOLOWER(*s);
6689            status = 1;
6690        } else if (Py_UNICODE_ISLOWER(*s)) {
6691            *s = Py_UNICODE_TOUPPER(*s);
6692            status = 1;
6693        }
6694        s++;
6695    }
6696
6697    return status;
6698}
6699
6700static int
6701fixcapitalize(PyUnicodeObject *self)
6702{
6703    Py_ssize_t len = self->length;
6704    Py_UNICODE *s = self->str;
6705    int status = 0;
6706
6707    if (len == 0)
6708        return 0;
6709    if (Py_UNICODE_ISLOWER(*s)) {
6710        *s = Py_UNICODE_TOUPPER(*s);
6711        status = 1;
6712    }
6713    s++;
6714    while (--len > 0) {
6715        if (Py_UNICODE_ISUPPER(*s)) {
6716            *s = Py_UNICODE_TOLOWER(*s);
6717            status = 1;
6718        }
6719        s++;
6720    }
6721    return status;
6722}
6723
6724static int
6725fixtitle(PyUnicodeObject *self)
6726{
6727    register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6728    register Py_UNICODE *e;
6729    int previous_is_cased;
6730
6731    /* Shortcut for single character strings */
6732    if (PyUnicode_GET_SIZE(self) == 1) {
6733        Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6734        if (*p != ch) {
6735            *p = ch;
6736            return 1;
6737        }
6738        else
6739            return 0;
6740    }
6741
6742    e = p + PyUnicode_GET_SIZE(self);
6743    previous_is_cased = 0;
6744    for (; p < e; p++) {
6745        register const Py_UNICODE ch = *p;
6746
6747        if (previous_is_cased)
6748            *p = Py_UNICODE_TOLOWER(ch);
6749        else
6750            *p = Py_UNICODE_TOTITLE(ch);
6751
6752        if (Py_UNICODE_ISLOWER(ch) ||
6753            Py_UNICODE_ISUPPER(ch) ||
6754            Py_UNICODE_ISTITLE(ch))
6755            previous_is_cased = 1;
6756        else
6757            previous_is_cased = 0;
6758    }
6759    return 1;
6760}
6761
6762PyObject *
6763PyUnicode_Join(PyObject *separator, PyObject *seq)
6764{
6765    const Py_UNICODE blank = ' ';
6766    const Py_UNICODE *sep = &blank;
6767    Py_ssize_t seplen = 1;
6768    PyUnicodeObject *res = NULL; /* the result */
6769    Py_UNICODE *res_p;       /* pointer to free byte in res's string area */
6770    PyObject *fseq;          /* PySequence_Fast(seq) */
6771    Py_ssize_t seqlen;       /* len(fseq) -- number of items in sequence */
6772    PyObject **items;
6773    PyObject *item;
6774    Py_ssize_t sz, i;
6775
6776    fseq = PySequence_Fast(seq, "");
6777    if (fseq == NULL) {
6778        return NULL;
6779    }
6780
6781    /* NOTE: the following code can't call back into Python code,
6782     * so we are sure that fseq won't be mutated.
6783     */
6784
6785    seqlen = PySequence_Fast_GET_SIZE(fseq);
6786    /* If empty sequence, return u"". */
6787    if (seqlen == 0) {
6788        res = _PyUnicode_New(0);  /* empty sequence; return u"" */
6789        goto Done;
6790    }
6791    items = PySequence_Fast_ITEMS(fseq);
6792    /* If singleton sequence with an exact Unicode, return that. */
6793    if (seqlen == 1) {
6794        item = items[0];
6795        if (PyUnicode_CheckExact(item)) {
6796            Py_INCREF(item);
6797            res = (PyUnicodeObject *)item;
6798            goto Done;
6799        }
6800    }
6801    else {
6802        /* Set up sep and seplen */
6803        if (separator == NULL) {
6804            sep = &blank;
6805            seplen = 1;
6806        }
6807        else {
6808            if (!PyUnicode_Check(separator)) {
6809                PyErr_Format(PyExc_TypeError,
6810                             "separator: expected str instance,"
6811                             " %.80s found",
6812                             Py_TYPE(separator)->tp_name);
6813                goto onError;
6814            }
6815            sep = PyUnicode_AS_UNICODE(separator);
6816            seplen = PyUnicode_GET_SIZE(separator);
6817        }
6818    }
6819
6820    /* There are at least two things to join, or else we have a subclass
6821     * of str in the sequence.
6822     * Do a pre-pass to figure out the total amount of space we'll
6823     * need (sz), and see whether all argument are strings.
6824     */
6825    sz = 0;
6826    for (i = 0; i < seqlen; i++) {
6827        const Py_ssize_t old_sz = sz;
6828        item = items[i];
6829        if (!PyUnicode_Check(item)) {
6830            PyErr_Format(PyExc_TypeError,
6831                         "sequence item %zd: expected str instance,"
6832                         " %.80s found",
6833                         i, Py_TYPE(item)->tp_name);
6834            goto onError;
6835        }
6836        sz += PyUnicode_GET_SIZE(item);
6837        if (i != 0)
6838            sz += seplen;
6839        if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6840            PyErr_SetString(PyExc_OverflowError,
6841                            "join() result is too long for a Python string");
6842            goto onError;
6843        }
6844    }
6845
6846    res = _PyUnicode_New(sz);
6847    if (res == NULL)
6848        goto onError;
6849
6850    /* Catenate everything. */
6851    res_p = PyUnicode_AS_UNICODE(res);
6852    for (i = 0; i < seqlen; ++i) {
6853        Py_ssize_t itemlen;
6854        item = items[i];
6855        itemlen = PyUnicode_GET_SIZE(item);
6856        /* Copy item, and maybe the separator. */
6857        if (i) {
6858            Py_UNICODE_COPY(res_p, sep, seplen);
6859            res_p += seplen;
6860        }
6861        Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6862        res_p += itemlen;
6863    }
6864
6865  Done:
6866    Py_DECREF(fseq);
6867    return (PyObject *)res;
6868
6869  onError:
6870    Py_DECREF(fseq);
6871    Py_XDECREF(res);
6872    return NULL;
6873}
6874
6875static PyUnicodeObject *
6876pad(PyUnicodeObject *self,
6877    Py_ssize_t left,
6878    Py_ssize_t right,
6879    Py_UNICODE fill)
6880{
6881    PyUnicodeObject *u;
6882
6883    if (left < 0)
6884        left = 0;
6885    if (right < 0)
6886        right = 0;
6887
6888    if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
6889        Py_INCREF(self);
6890        return self;
6891    }
6892
6893    if (left > PY_SSIZE_T_MAX - self->length ||
6894        right > PY_SSIZE_T_MAX - (left + self->length)) {
6895        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6896        return NULL;
6897    }
6898    u = _PyUnicode_New(left + self->length + right);
6899    if (u) {
6900        if (left)
6901            Py_UNICODE_FILL(u->str, fill, left);
6902        Py_UNICODE_COPY(u->str + left, self->str, self->length);
6903        if (right)
6904            Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6905    }
6906
6907    return u;
6908}
6909
6910PyObject *
6911PyUnicode_Splitlines(PyObject *string, int keepends)
6912{
6913    PyObject *list;
6914
6915    string = PyUnicode_FromObject(string);
6916    if (string == NULL)
6917        return NULL;
6918
6919    list = stringlib_splitlines(
6920        (PyObject*) string, PyUnicode_AS_UNICODE(string),
6921        PyUnicode_GET_SIZE(string), keepends);
6922
6923    Py_DECREF(string);
6924    return list;
6925}
6926
6927static PyObject *
6928split(PyUnicodeObject *self,
6929      PyUnicodeObject *substring,
6930      Py_ssize_t maxcount)
6931{
6932    if (maxcount < 0)
6933        maxcount = PY_SSIZE_T_MAX;
6934
6935    if (substring == NULL)
6936        return stringlib_split_whitespace(
6937            (PyObject*) self,  self->str, self->length, maxcount
6938            );
6939
6940    return stringlib_split(
6941        (PyObject*) self,  self->str, self->length,
6942        substring->str, substring->length,
6943        maxcount
6944        );
6945}
6946
6947static PyObject *
6948rsplit(PyUnicodeObject *self,
6949       PyUnicodeObject *substring,
6950       Py_ssize_t maxcount)
6951{
6952    if (maxcount < 0)
6953        maxcount = PY_SSIZE_T_MAX;
6954
6955    if (substring == NULL)
6956        return stringlib_rsplit_whitespace(
6957            (PyObject*) self,  self->str, self->length, maxcount
6958            );
6959
6960    return stringlib_rsplit(
6961        (PyObject*) self,  self->str, self->length,
6962        substring->str, substring->length,
6963        maxcount
6964        );
6965}
6966
6967static PyObject *
6968replace(PyUnicodeObject *self,
6969        PyUnicodeObject *str1,
6970        PyUnicodeObject *str2,
6971        Py_ssize_t maxcount)
6972{
6973    PyUnicodeObject *u;
6974
6975    if (maxcount < 0)
6976        maxcount = PY_SSIZE_T_MAX;
6977    else if (maxcount == 0 || self->length == 0)
6978        goto nothing;
6979
6980    if (str1->length == str2->length) {
6981        Py_ssize_t i;
6982        /* same length */
6983        if (str1->length == 0)
6984            goto nothing;
6985        if (str1->length == 1) {
6986            /* replace characters */
6987            Py_UNICODE u1, u2;
6988            if (!findchar(self->str, self->length, str1->str[0]))
6989                goto nothing;
6990            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6991            if (!u)
6992                return NULL;
6993            Py_UNICODE_COPY(u->str, self->str, self->length);
6994            u1 = str1->str[0];
6995            u2 = str2->str[0];
6996            for (i = 0; i < u->length; i++)
6997                if (u->str[i] == u1) {
6998                    if (--maxcount < 0)
6999                        break;
7000                    u->str[i] = u2;
7001                }
7002        } else {
7003            i = stringlib_find(
7004                self->str, self->length, str1->str, str1->length, 0
7005                );
7006            if (i < 0)
7007                goto nothing;
7008            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
7009            if (!u)
7010                return NULL;
7011            Py_UNICODE_COPY(u->str, self->str, self->length);
7012
7013            /* change everything in-place, starting with this one */
7014            Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
7015            i += str1->length;
7016
7017            while ( --maxcount > 0) {
7018                i = stringlib_find(self->str+i, self->length-i,
7019                                   str1->str, str1->length,
7020                                   i);
7021                if (i == -1)
7022                    break;
7023                Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
7024                i += str1->length;
7025            }
7026        }
7027    } else {
7028
7029        Py_ssize_t n, i, j;
7030        Py_ssize_t product, new_size, delta;
7031        Py_UNICODE *p;
7032
7033        /* replace strings */
7034        n = stringlib_count(self->str, self->length, str1->str, str1->length,
7035                            maxcount);
7036        if (n == 0)
7037            goto nothing;
7038        /* new_size = self->length + n * (str2->length - str1->length)); */
7039        delta = (str2->length - str1->length);
7040        if (delta == 0) {
7041            new_size = self->length;
7042        } else {
7043            product = n * (str2->length - str1->length);
7044            if ((product / (str2->length - str1->length)) != n) {
7045                PyErr_SetString(PyExc_OverflowError,
7046                                "replace string is too long");
7047                return NULL;
7048            }
7049            new_size = self->length + product;
7050            if (new_size < 0) {
7051                PyErr_SetString(PyExc_OverflowError,
7052                                "replace string is too long");
7053                return NULL;
7054            }
7055        }
7056        u = _PyUnicode_New(new_size);
7057        if (!u)
7058            return NULL;
7059        i = 0;
7060        p = u->str;
7061        if (str1->length > 0) {
7062            while (n-- > 0) {
7063                /* look for next match */
7064                j = stringlib_find(self->str+i, self->length-i,
7065                                   str1->str, str1->length,
7066                                   i);
7067                if (j == -1)
7068                    break;
7069                else if (j > i) {
7070                    /* copy unchanged part [i:j] */
7071                    Py_UNICODE_COPY(p, self->str+i, j-i);
7072                    p += j - i;
7073                }
7074                /* copy substitution string */
7075                if (str2->length > 0) {
7076                    Py_UNICODE_COPY(p, str2->str, str2->length);
7077                    p += str2->length;
7078                }
7079                i = j + str1->length;
7080            }
7081            if (i < self->length)
7082                /* copy tail [i:] */
7083                Py_UNICODE_COPY(p, self->str+i, self->length-i);
7084        } else {
7085            /* interleave */
7086            while (n > 0) {
7087                Py_UNICODE_COPY(p, str2->str, str2->length);
7088                p += str2->length;
7089                if (--n <= 0)
7090                    break;
7091                *p++ = self->str[i++];
7092            }
7093            Py_UNICODE_COPY(p, self->str+i, self->length-i);
7094        }
7095    }
7096    return (PyObject *) u;
7097
7098  nothing:
7099    /* nothing to replace; return original string (when possible) */
7100    if (PyUnicode_CheckExact(self)) {
7101        Py_INCREF(self);
7102        return (PyObject *) self;
7103    }
7104    return PyUnicode_FromUnicode(self->str, self->length);
7105}
7106
7107/* --- Unicode Object Methods --------------------------------------------- */
7108
7109PyDoc_STRVAR(title__doc__,
7110             "S.title() -> str\n\
7111\n\
7112Return a titlecased version of S, i.e. words start with title case\n\
7113characters, all remaining cased characters have lower case.");
7114
7115static PyObject*
7116unicode_title(PyUnicodeObject *self)
7117{
7118    return fixup(self, fixtitle);
7119}
7120
7121PyDoc_STRVAR(capitalize__doc__,
7122             "S.capitalize() -> str\n\
7123\n\
7124Return a capitalized version of S, i.e. make the first character\n\
7125have upper case and the rest lower case.");
7126
7127static PyObject*
7128unicode_capitalize(PyUnicodeObject *self)
7129{
7130    return fixup(self, fixcapitalize);
7131}
7132
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).
   Splits self on whitespace, capitalizes every word through fixup() and
   joins the words again with PyUnicode_Join(NULL, ...), i.e. with single
   blanks.  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list slot in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *word = PyList_GET_ITEM(words, idx);
        PyObject *capitalized = fixup((PyUnicodeObject *)word,
                                      fixcapitalize);
        if (capitalized == NULL)
            goto done;
        Py_DECREF(word);
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return result;
}
#endif
7170
7171/* Argument converter.  Coerces to a single unicode character */
7172
7173static int
7174convert_uc(PyObject *obj, void *addr)
7175{
7176    Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
7177    PyObject *uniobj;
7178    Py_UNICODE *unistr;
7179
7180    uniobj = PyUnicode_FromObject(obj);
7181    if (uniobj == NULL) {
7182        PyErr_SetString(PyExc_TypeError,
7183                        "The fill character cannot be converted to Unicode");
7184        return 0;
7185    }
7186    if (PyUnicode_GET_SIZE(uniobj) != 1) {
7187        PyErr_SetString(PyExc_TypeError,
7188                        "The fill character must be exactly one character long");
7189        Py_DECREF(uniobj);
7190        return 0;
7191    }
7192    unistr = PyUnicode_AS_UNICODE(uniobj);
7193    *fillcharloc = unistr[0];
7194    Py_DECREF(uniobj);
7195    return 1;
7196}
7197
7198PyDoc_STRVAR(center__doc__,
7199             "S.center(width[, fillchar]) -> str\n\
7200\n\
7201Return S centered in a string of length width. Padding is\n\
7202done using the specified fill character (default is a space)");
7203
7204static PyObject *
7205unicode_center(PyUnicodeObject *self, PyObject *args)
7206{
7207    Py_ssize_t marg, left;
7208    Py_ssize_t width;
7209    Py_UNICODE fillchar = ' ';
7210
7211    if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
7212        return NULL;
7213
7214    if (self->length >= width && PyUnicode_CheckExact(self)) {
7215        Py_INCREF(self);
7216        return (PyObject*) self;
7217    }
7218
7219    marg = width - self->length;
7220    left = marg / 2 + (marg & width & 1);
7221
7222    return (PyObject*) pad(self, left, marg - left, fillchar);
7223}
7224
7225#if 0
7226
7227/* This code should go into some future Unicode collation support
7228   module. The basic comparison should compare ordinals on a naive
7229   basis (this is what Java does and thus Jython too). */
7230
7231/* speedy UTF-16 code point order comparison */
7232/* gleaned from: */
7233/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
7234
7235static short utf16Fixup[32] =
7236{
7237    0, 0, 0, 0, 0, 0, 0, 0,
7238    0, 0, 0, 0, 0, 0, 0, 0,
7239    0, 0, 0, 0, 0, 0, 0, 0,
7240    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
7241};
7242
7243static int
7244unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7245{
7246    Py_ssize_t len1, len2;
7247
7248    Py_UNICODE *s1 = str1->str;
7249    Py_UNICODE *s2 = str2->str;
7250
7251    len1 = str1->length;
7252    len2 = str2->length;
7253
7254    while (len1 > 0 && len2 > 0) {
7255        Py_UNICODE c1, c2;
7256
7257        c1 = *s1++;
7258        c2 = *s2++;
7259
7260        if (c1 > (1<<11) * 26)
7261            c1 += utf16Fixup[c1>>11];
7262        if (c2 > (1<<11) * 26)
7263            c2 += utf16Fixup[c2>>11];
7264        /* now c1 and c2 are in UTF-32-compatible order */
7265
7266        if (c1 != c2)
7267            return (c1 < c2) ? -1 : 1;
7268
7269        len1--; len2--;
7270    }
7271
7272    return (len1 < len2) ? -1 : (len1 != len2);
7273}
7274
7275#else
7276
7277static int
7278unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7279{
7280    register Py_ssize_t len1, len2;
7281
7282    Py_UNICODE *s1 = str1->str;
7283    Py_UNICODE *s2 = str2->str;
7284
7285    len1 = str1->length;
7286    len2 = str2->length;
7287
7288    while (len1 > 0 && len2 > 0) {
7289        Py_UNICODE c1, c2;
7290
7291        c1 = *s1++;
7292        c2 = *s2++;
7293
7294        if (c1 != c2)
7295            return (c1 < c2) ? -1 : 1;
7296
7297        len1--; len2--;
7298    }
7299
7300    return (len1 < len2) ? -1 : (len1 != len2);
7301}
7302
7303#endif
7304
7305int
7306PyUnicode_Compare(PyObject *left, PyObject *right)
7307{
7308    if (PyUnicode_Check(left) && PyUnicode_Check(right))
7309        return unicode_compare((PyUnicodeObject *)left,
7310                               (PyUnicodeObject *)right);
7311    PyErr_Format(PyExc_TypeError,
7312                 "Can't compare %.100s and %.100s",
7313                 left->ob_type->tp_name,
7314                 right->ob_type->tp_name);
7315    return -1;
7316}
7317
7318int
7319PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
7320{
7321    int i;
7322    Py_UNICODE *id;
7323    assert(PyUnicode_Check(uni));
7324    id = PyUnicode_AS_UNICODE(uni);
7325    /* Compare Unicode string and source character set string */
7326    for (i = 0; id[i] && str[i]; i++)
7327        if (id[i] != str[i])
7328            return ((int)id[i] < (int)str[i]) ? -1 : 1;
7329    /* This check keeps Python strings that end in '\0' from comparing equal
7330     to C strings identical up to that point. */
7331    if (PyUnicode_GET_SIZE(uni) != i || id[i])
7332        return 1; /* uni is longer */
7333    if (str[i])
7334        return -1; /* str is longer */
7335    return 0;
7336}
7337
7338
7339#define TEST_COND(cond)                         \
7340    ((cond) ? Py_True : Py_False)
7341
7342PyObject *
7343PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
7344{
7345    int result;
7346
7347    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
7348        PyObject *v;
7349        if (((PyUnicodeObject *) left)->length !=
7350            ((PyUnicodeObject *) right)->length) {
7351            if (op == Py_EQ) {
7352                Py_INCREF(Py_False);
7353                return Py_False;
7354            }
7355            if (op == Py_NE) {
7356                Py_INCREF(Py_True);
7357                return Py_True;
7358            }
7359        }
7360        if (left == right)
7361            result = 0;
7362        else
7363            result = unicode_compare((PyUnicodeObject *)left,
7364                                     (PyUnicodeObject *)right);
7365
7366        /* Convert the return value to a Boolean */
7367        switch (op) {
7368        case Py_EQ:
7369            v = TEST_COND(result == 0);
7370            break;
7371        case Py_NE:
7372            v = TEST_COND(result != 0);
7373            break;
7374        case Py_LE:
7375            v = TEST_COND(result <= 0);
7376            break;
7377        case Py_GE:
7378            v = TEST_COND(result >= 0);
7379            break;
7380        case Py_LT:
7381            v = TEST_COND(result == -1);
7382            break;
7383        case Py_GT:
7384            v = TEST_COND(result == 1);
7385            break;
7386        default:
7387            PyErr_BadArgument();
7388            return NULL;
7389        }
7390        Py_INCREF(v);
7391        return v;
7392    }
7393
7394    Py_INCREF(Py_NotImplemented);
7395    return Py_NotImplemented;
7396}
7397
7398int
7399PyUnicode_Contains(PyObject *container, PyObject *element)
7400{
7401    PyObject *str, *sub;
7402    int result;
7403
7404    /* Coerce the two arguments */
7405    sub = PyUnicode_FromObject(element);
7406    if (!sub) {
7407        PyErr_Format(PyExc_TypeError,
7408                     "'in <string>' requires string as left operand, not %s",
7409                     element->ob_type->tp_name);
7410        return -1;
7411    }
7412
7413    str = PyUnicode_FromObject(container);
7414    if (!str) {
7415        Py_DECREF(sub);
7416        return -1;
7417    }
7418
7419    result = stringlib_contains_obj(str, sub);
7420
7421    Py_DECREF(str);
7422    Py_DECREF(sub);
7423
7424    return result;
7425}
7426
7427/* Concat to string or Unicode object giving a new Unicode object. */
7428
7429PyObject *
7430PyUnicode_Concat(PyObject *left, PyObject *right)
7431{
7432    PyUnicodeObject *u = NULL, *v = NULL, *w;
7433
7434    /* Coerce the two arguments */
7435    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7436    if (u == NULL)
7437        goto onError;
7438    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7439    if (v == NULL)
7440        goto onError;
7441
7442    /* Shortcuts */
7443    if (v == unicode_empty) {
7444        Py_DECREF(v);
7445        return (PyObject *)u;
7446    }
7447    if (u == unicode_empty) {
7448        Py_DECREF(u);
7449        return (PyObject *)v;
7450    }
7451
7452    /* Concat the two Unicode strings */
7453    w = _PyUnicode_New(u->length + v->length);
7454    if (w == NULL)
7455        goto onError;
7456    Py_UNICODE_COPY(w->str, u->str, u->length);
7457    Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7458
7459    Py_DECREF(u);
7460    Py_DECREF(v);
7461    return (PyObject *)w;
7462
7463  onError:
7464    Py_XDECREF(u);
7465    Py_XDECREF(v);
7466    return NULL;
7467}
7468
7469void
7470PyUnicode_Append(PyObject **pleft, PyObject *right)
7471{
7472    PyObject *new;
7473    if (*pleft == NULL)
7474        return;
7475    if (right == NULL || !PyUnicode_Check(*pleft)) {
7476        Py_DECREF(*pleft);
7477        *pleft = NULL;
7478        return;
7479    }
7480    new = PyUnicode_Concat(*pleft, right);
7481    Py_DECREF(*pleft);
7482    *pleft = new;
7483}
7484
/* Same as PyUnicode_Append(), but additionally releases the caller's
   reference to right afterwards.  right may be NULL (Py_XDECREF). */
void
PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
{
    PyUnicode_Append(pleft, right);
    Py_XDECREF(right);
}
7491
7492PyDoc_STRVAR(count__doc__,
7493             "S.count(sub[, start[, end]]) -> int\n\
7494\n\
7495Return the number of non-overlapping occurrences of substring sub in\n\
7496string S[start:end].  Optional arguments start and end are\n\
7497interpreted as in slice notation.");
7498
7499static PyObject *
7500unicode_count(PyUnicodeObject *self, PyObject *args)
7501{
7502    PyUnicodeObject *substring;
7503    Py_ssize_t start = 0;
7504    Py_ssize_t end = PY_SSIZE_T_MAX;
7505    PyObject *result;
7506
7507    if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
7508                          _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7509        return NULL;
7510
7511    substring = (PyUnicodeObject *)PyUnicode_FromObject(
7512        (PyObject *)substring);
7513    if (substring == NULL)
7514        return NULL;
7515
7516    ADJUST_INDICES(start, end, self->length);
7517    result = PyLong_FromSsize_t(
7518        stringlib_count(self->str + start, end - start,
7519                        substring->str, substring->length,
7520                        PY_SSIZE_T_MAX)
7521        );
7522
7523    Py_DECREF(substring);
7524
7525    return result;
7526}
7527
7528PyDoc_STRVAR(encode__doc__,
7529             "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
7530\n\
7531Encode S using the codec registered for encoding. Default encoding\n\
7532is 'utf-8'. errors may be given to set a different error\n\
7533handling scheme. Default is 'strict' meaning that encoding errors raise\n\
7534a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7535'xmlcharrefreplace' as well as any other name registered with\n\
7536codecs.register_error that can handle UnicodeEncodeErrors.");
7537
7538static PyObject *
7539unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
7540{
7541    static char *kwlist[] = {"encoding", "errors", 0};
7542    char *encoding = NULL;
7543    char *errors = NULL;
7544
7545    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7546                                     kwlist, &encoding, &errors))
7547        return NULL;
7548    return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
7549}
7550
7551PyDoc_STRVAR(expandtabs__doc__,
7552             "S.expandtabs([tabsize]) -> str\n\
7553\n\
7554Return a copy of S where all tab characters are expanded using spaces.\n\
7555If tabsize is not given, a tab size of 8 characters is assumed.");
7556
7557static PyObject*
7558unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7559{
7560    Py_UNICODE *e;
7561    Py_UNICODE *p;
7562    Py_UNICODE *q;
7563    Py_UNICODE *qe;
7564    Py_ssize_t i, j, incr;
7565    PyUnicodeObject *u;
7566    int tabsize = 8;
7567
7568    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
7569        return NULL;
7570
7571    /* First pass: determine size of output string */
7572    i = 0; /* chars up to and including most recent \n or \r */
7573    j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7574    e = self->str + self->length; /* end of input */
7575    for (p = self->str; p < e; p++)
7576        if (*p == '\t') {
7577            if (tabsize > 0) {
7578                incr = tabsize - (j % tabsize); /* cannot overflow */
7579                if (j > PY_SSIZE_T_MAX - incr)
7580                    goto overflow1;
7581                j += incr;
7582            }
7583        }
7584        else {
7585            if (j > PY_SSIZE_T_MAX - 1)
7586                goto overflow1;
7587            j++;
7588            if (*p == '\n' || *p == '\r') {
7589                if (i > PY_SSIZE_T_MAX - j)
7590                    goto overflow1;
7591                i += j;
7592                j = 0;
7593            }
7594        }
7595
7596    if (i > PY_SSIZE_T_MAX - j)
7597        goto overflow1;
7598
7599    /* Second pass: create output string and fill it */
7600    u = _PyUnicode_New(i + j);
7601    if (!u)
7602        return NULL;
7603
7604    j = 0; /* same as in first pass */
7605    q = u->str; /* next output char */
7606    qe = u->str + u->length; /* end of output */
7607
7608    for (p = self->str; p < e; p++)
7609        if (*p == '\t') {
7610            if (tabsize > 0) {
7611                i = tabsize - (j % tabsize);
7612                j += i;
7613                while (i--) {
7614                    if (q >= qe)
7615                        goto overflow2;
7616                    *q++ = ' ';
7617                }
7618            }
7619        }
7620        else {
7621            if (q >= qe)
7622                goto overflow2;
7623            *q++ = *p;
7624            j++;
7625            if (*p == '\n' || *p == '\r')
7626                j = 0;
7627        }
7628
7629    return (PyObject*) u;
7630
7631  overflow2:
7632    Py_DECREF(u);
7633  overflow1:
7634    PyErr_SetString(PyExc_OverflowError, "new string is too long");
7635    return NULL;
7636}
7637
7638PyDoc_STRVAR(find__doc__,
7639             "S.find(sub[, start[, end]]) -> int\n\
7640\n\
7641Return the lowest index in S where substring sub is found,\n\
7642such that sub is contained within s[start:end].  Optional\n\
7643arguments start and end are interpreted as in slice notation.\n\
7644\n\
7645Return -1 on failure.");
7646
7647static PyObject *
7648unicode_find(PyUnicodeObject *self, PyObject *args)
7649{
7650    PyObject *substring;
7651    Py_ssize_t start;
7652    Py_ssize_t end;
7653    Py_ssize_t result;
7654
7655    if (!_ParseTupleFinds(args, &substring, &start, &end))
7656        return NULL;
7657
7658    result = stringlib_find_slice(
7659        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7660        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7661        start, end
7662        );
7663
7664    Py_DECREF(substring);
7665
7666    return PyLong_FromSsize_t(result);
7667}
7668
7669static PyObject *
7670unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
7671{
7672    if (index < 0 || index >= self->length) {
7673        PyErr_SetString(PyExc_IndexError, "string index out of range");
7674        return NULL;
7675    }
7676
7677    return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7678}
7679
7680/* Believe it or not, this produces the same value for ASCII strings
7681   as string_hash(). */
7682static Py_hash_t
7683unicode_hash(PyUnicodeObject *self)
7684{
7685    Py_ssize_t len;
7686    Py_UNICODE *p;
7687    Py_hash_t x;
7688
7689    if (self->hash != -1)
7690        return self->hash;
7691    len = Py_SIZE(self);
7692    p = self->str;
7693    x = *p << 7;
7694    while (--len >= 0)
7695        x = (1000003*x) ^ *p++;
7696    x ^= Py_SIZE(self);
7697    if (x == -1)
7698        x = -2;
7699    self->hash = x;
7700    return x;
7701}
7702
7703PyDoc_STRVAR(index__doc__,
7704             "S.index(sub[, start[, end]]) -> int\n\
7705\n\
7706Like S.find() but raise ValueError when the substring is not found.");
7707
7708static PyObject *
7709unicode_index(PyUnicodeObject *self, PyObject *args)
7710{
7711    Py_ssize_t result;
7712    PyObject *substring;
7713    Py_ssize_t start;
7714    Py_ssize_t end;
7715
7716    if (!_ParseTupleFinds(args, &substring, &start, &end))
7717        return NULL;
7718
7719    result = stringlib_find_slice(
7720        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7721        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7722        start, end
7723        );
7724
7725    Py_DECREF(substring);
7726
7727    if (result < 0) {
7728        PyErr_SetString(PyExc_ValueError, "substring not found");
7729        return NULL;
7730    }
7731
7732    return PyLong_FromSsize_t(result);
7733}
7734
7735PyDoc_STRVAR(islower__doc__,
7736             "S.islower() -> bool\n\
7737\n\
7738Return True if all cased characters in S are lowercase and there is\n\
7739at least one cased character in S, False otherwise.");
7740
7741static PyObject*
7742unicode_islower(PyUnicodeObject *self)
7743{
7744    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7745    register const Py_UNICODE *e;
7746    int cased;
7747
7748    /* Shortcut for single character strings */
7749    if (PyUnicode_GET_SIZE(self) == 1)
7750        return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
7751
7752    /* Special case for empty strings */
7753    if (PyUnicode_GET_SIZE(self) == 0)
7754        return PyBool_FromLong(0);
7755
7756    e = p + PyUnicode_GET_SIZE(self);
7757    cased = 0;
7758    for (; p < e; p++) {
7759        register const Py_UNICODE ch = *p;
7760
7761        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7762            return PyBool_FromLong(0);
7763        else if (!cased && Py_UNICODE_ISLOWER(ch))
7764            cased = 1;
7765    }
7766    return PyBool_FromLong(cased);
7767}
7768
7769PyDoc_STRVAR(isupper__doc__,
7770             "S.isupper() -> bool\n\
7771\n\
7772Return True if all cased characters in S are uppercase and there is\n\
7773at least one cased character in S, False otherwise.");
7774
7775static PyObject*
7776unicode_isupper(PyUnicodeObject *self)
7777{
7778    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7779    register const Py_UNICODE *e;
7780    int cased;
7781
7782    /* Shortcut for single character strings */
7783    if (PyUnicode_GET_SIZE(self) == 1)
7784        return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
7785
7786    /* Special case for empty strings */
7787    if (PyUnicode_GET_SIZE(self) == 0)
7788        return PyBool_FromLong(0);
7789
7790    e = p + PyUnicode_GET_SIZE(self);
7791    cased = 0;
7792    for (; p < e; p++) {
7793        register const Py_UNICODE ch = *p;
7794
7795        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7796            return PyBool_FromLong(0);
7797        else if (!cased && Py_UNICODE_ISUPPER(ch))
7798            cased = 1;
7799    }
7800    return PyBool_FromLong(cased);
7801}
7802
7803PyDoc_STRVAR(istitle__doc__,
7804             "S.istitle() -> bool\n\
7805\n\
7806Return True if S is a titlecased string and there is at least one\n\
7807character in S, i.e. upper- and titlecase characters may only\n\
7808follow uncased characters and lowercase characters only cased ones.\n\
7809Return False otherwise.");
7810
7811static PyObject*
7812unicode_istitle(PyUnicodeObject *self)
7813{
7814    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7815    register const Py_UNICODE *e;
7816    int cased, previous_is_cased;
7817
7818    /* Shortcut for single character strings */
7819    if (PyUnicode_GET_SIZE(self) == 1)
7820        return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7821                               (Py_UNICODE_ISUPPER(*p) != 0));
7822
7823    /* Special case for empty strings */
7824    if (PyUnicode_GET_SIZE(self) == 0)
7825        return PyBool_FromLong(0);
7826
7827    e = p + PyUnicode_GET_SIZE(self);
7828    cased = 0;
7829    previous_is_cased = 0;
7830    for (; p < e; p++) {
7831        register const Py_UNICODE ch = *p;
7832
7833        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7834            if (previous_is_cased)
7835                return PyBool_FromLong(0);
7836            previous_is_cased = 1;
7837            cased = 1;
7838        }
7839        else if (Py_UNICODE_ISLOWER(ch)) {
7840            if (!previous_is_cased)
7841                return PyBool_FromLong(0);
7842            previous_is_cased = 1;
7843            cased = 1;
7844        }
7845        else
7846            previous_is_cased = 0;
7847    }
7848    return PyBool_FromLong(cased);
7849}
7850
7851PyDoc_STRVAR(isspace__doc__,
7852             "S.isspace() -> bool\n\
7853\n\
7854Return True if all characters in S are whitespace\n\
7855and there is at least one character in S, False otherwise.");
7856
7857static PyObject*
7858unicode_isspace(PyUnicodeObject *self)
7859{
7860    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7861    register const Py_UNICODE *e;
7862
7863    /* Shortcut for single character strings */
7864    if (PyUnicode_GET_SIZE(self) == 1 &&
7865        Py_UNICODE_ISSPACE(*p))
7866        return PyBool_FromLong(1);
7867
7868    /* Special case for empty strings */
7869    if (PyUnicode_GET_SIZE(self) == 0)
7870        return PyBool_FromLong(0);
7871
7872    e = p + PyUnicode_GET_SIZE(self);
7873    for (; p < e; p++) {
7874        if (!Py_UNICODE_ISSPACE(*p))
7875            return PyBool_FromLong(0);
7876    }
7877    return PyBool_FromLong(1);
7878}
7879
7880PyDoc_STRVAR(isalpha__doc__,
7881             "S.isalpha() -> bool\n\
7882\n\
7883Return True if all characters in S are alphabetic\n\
7884and there is at least one character in S, False otherwise.");
7885
7886static PyObject*
7887unicode_isalpha(PyUnicodeObject *self)
7888{
7889    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7890    register const Py_UNICODE *e;
7891
7892    /* Shortcut for single character strings */
7893    if (PyUnicode_GET_SIZE(self) == 1 &&
7894        Py_UNICODE_ISALPHA(*p))
7895        return PyBool_FromLong(1);
7896
7897    /* Special case for empty strings */
7898    if (PyUnicode_GET_SIZE(self) == 0)
7899        return PyBool_FromLong(0);
7900
7901    e = p + PyUnicode_GET_SIZE(self);
7902    for (; p < e; p++) {
7903        if (!Py_UNICODE_ISALPHA(*p))
7904            return PyBool_FromLong(0);
7905    }
7906    return PyBool_FromLong(1);
7907}
7908
7909PyDoc_STRVAR(isalnum__doc__,
7910             "S.isalnum() -> bool\n\
7911\n\
7912Return True if all characters in S are alphanumeric\n\
7913and there is at least one character in S, False otherwise.");
7914
7915static PyObject*
7916unicode_isalnum(PyUnicodeObject *self)
7917{
7918    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7919    register const Py_UNICODE *e;
7920
7921    /* Shortcut for single character strings */
7922    if (PyUnicode_GET_SIZE(self) == 1 &&
7923        Py_UNICODE_ISALNUM(*p))
7924        return PyBool_FromLong(1);
7925
7926    /* Special case for empty strings */
7927    if (PyUnicode_GET_SIZE(self) == 0)
7928        return PyBool_FromLong(0);
7929
7930    e = p + PyUnicode_GET_SIZE(self);
7931    for (; p < e; p++) {
7932        if (!Py_UNICODE_ISALNUM(*p))
7933            return PyBool_FromLong(0);
7934    }
7935    return PyBool_FromLong(1);
7936}
7937
7938PyDoc_STRVAR(isdecimal__doc__,
7939             "S.isdecimal() -> bool\n\
7940\n\
7941Return True if there are only decimal characters in S,\n\
7942False otherwise.");
7943
7944static PyObject*
7945unicode_isdecimal(PyUnicodeObject *self)
7946{
7947    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7948    register const Py_UNICODE *e;
7949
7950    /* Shortcut for single character strings */
7951    if (PyUnicode_GET_SIZE(self) == 1 &&
7952        Py_UNICODE_ISDECIMAL(*p))
7953        return PyBool_FromLong(1);
7954
7955    /* Special case for empty strings */
7956    if (PyUnicode_GET_SIZE(self) == 0)
7957        return PyBool_FromLong(0);
7958
7959    e = p + PyUnicode_GET_SIZE(self);
7960    for (; p < e; p++) {
7961        if (!Py_UNICODE_ISDECIMAL(*p))
7962            return PyBool_FromLong(0);
7963    }
7964    return PyBool_FromLong(1);
7965}
7966
7967PyDoc_STRVAR(isdigit__doc__,
7968             "S.isdigit() -> bool\n\
7969\n\
7970Return True if all characters in S are digits\n\
7971and there is at least one character in S, False otherwise.");
7972
7973static PyObject*
7974unicode_isdigit(PyUnicodeObject *self)
7975{
7976    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7977    register const Py_UNICODE *e;
7978
7979    /* Shortcut for single character strings */
7980    if (PyUnicode_GET_SIZE(self) == 1 &&
7981        Py_UNICODE_ISDIGIT(*p))
7982        return PyBool_FromLong(1);
7983
7984    /* Special case for empty strings */
7985    if (PyUnicode_GET_SIZE(self) == 0)
7986        return PyBool_FromLong(0);
7987
7988    e = p + PyUnicode_GET_SIZE(self);
7989    for (; p < e; p++) {
7990        if (!Py_UNICODE_ISDIGIT(*p))
7991            return PyBool_FromLong(0);
7992    }
7993    return PyBool_FromLong(1);
7994}
7995
7996PyDoc_STRVAR(isnumeric__doc__,
7997             "S.isnumeric() -> bool\n\
7998\n\
7999Return True if there are only numeric characters in S,\n\
8000False otherwise.");
8001
8002static PyObject*
8003unicode_isnumeric(PyUnicodeObject *self)
8004{
8005    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
8006    register const Py_UNICODE *e;
8007
8008    /* Shortcut for single character strings */
8009    if (PyUnicode_GET_SIZE(self) == 1 &&
8010        Py_UNICODE_ISNUMERIC(*p))
8011        return PyBool_FromLong(1);
8012
8013    /* Special case for empty strings */
8014    if (PyUnicode_GET_SIZE(self) == 0)
8015        return PyBool_FromLong(0);
8016
8017    e = p + PyUnicode_GET_SIZE(self);
8018    for (; p < e; p++) {
8019        if (!Py_UNICODE_ISNUMERIC(*p))
8020            return PyBool_FromLong(0);
8021    }
8022    return PyBool_FromLong(1);
8023}
8024
8025int
8026PyUnicode_IsIdentifier(PyObject *self)
8027{
8028    register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
8029    register const Py_UNICODE *e;
8030
8031    /* Special case for empty strings */
8032    if (PyUnicode_GET_SIZE(self) == 0)
8033        return 0;
8034
8035    /* PEP 3131 says that the first character must be in
8036       XID_Start and subsequent characters in XID_Continue,
8037       and for the ASCII range, the 2.x rules apply (i.e
8038       start with letters and underscore, continue with
8039       letters, digits, underscore). However, given the current
8040       definition of XID_Start and XID_Continue, it is sufficient
8041       to check just for these, except that _ must be allowed
8042       as starting an identifier.  */
8043    if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
8044        return 0;
8045
8046    e = p + PyUnicode_GET_SIZE(self);
8047    for (p++; p < e; p++) {
8048        if (!_PyUnicode_IsXidContinue(*p))
8049            return 0;
8050    }
8051    return 1;
8052}
8053
8054PyDoc_STRVAR(isidentifier__doc__,
8055             "S.isidentifier() -> bool\n\
8056\n\
8057Return True if S is a valid identifier according\n\
8058to the language definition.");
8059
8060static PyObject*
8061unicode_isidentifier(PyObject *self)
8062{
8063    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
8064}
8065
8066PyDoc_STRVAR(isprintable__doc__,
8067             "S.isprintable() -> bool\n\
8068\n\
8069Return True if all characters in S are considered\n\
8070printable in repr() or S is empty, False otherwise.");
8071
8072static PyObject*
8073unicode_isprintable(PyObject *self)
8074{
8075    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
8076    register const Py_UNICODE *e;
8077
8078    /* Shortcut for single character strings */
8079    if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
8080        Py_RETURN_TRUE;
8081    }
8082
8083    e = p + PyUnicode_GET_SIZE(self);
8084    for (; p < e; p++) {
8085        if (!Py_UNICODE_ISPRINTABLE(*p)) {
8086            Py_RETURN_FALSE;
8087        }
8088    }
8089    Py_RETURN_TRUE;
8090}
8091
8092PyDoc_STRVAR(join__doc__,
8093             "S.join(iterable) -> str\n\
8094\n\
8095Return a string which is the concatenation of the strings in the\n\
8096iterable.  The separator between elements is S.");
8097
8098static PyObject*
8099unicode_join(PyObject *self, PyObject *data)
8100{
8101    return PyUnicode_Join(self, data);
8102}
8103
8104static Py_ssize_t
8105unicode_length(PyUnicodeObject *self)
8106{
8107    return self->length;
8108}
8109
8110PyDoc_STRVAR(ljust__doc__,
8111             "S.ljust(width[, fillchar]) -> str\n\
8112\n\
8113Return S left-justified in a Unicode string of length width. Padding is\n\
8114done using the specified fill character (default is a space).");
8115
8116static PyObject *
8117unicode_ljust(PyUnicodeObject *self, PyObject *args)
8118{
8119    Py_ssize_t width;
8120    Py_UNICODE fillchar = ' ';
8121
8122    if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
8123        return NULL;
8124
8125    if (self->length >= width && PyUnicode_CheckExact(self)) {
8126        Py_INCREF(self);
8127        return (PyObject*) self;
8128    }
8129
8130    return (PyObject*) pad(self, 0, width - self->length, fillchar);
8131}
8132
8133PyDoc_STRVAR(lower__doc__,
8134             "S.lower() -> str\n\
8135\n\
8136Return a copy of the string S converted to lowercase.");
8137
8138static PyObject*
8139unicode_lower(PyUnicodeObject *self)
8140{
8141    return fixup(self, fixlower);
8142}
8143
/* Selectors telling the strip helpers which end(s) of a string to trim. */
#define LEFTSTRIP  0
#define RIGHTSTRIP 1
#define BOTHSTRIP  2

/* Argument format strings, indexed by the selectors above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip",
};

/* Method name for selector i: the format string without its leading
   three characters ("|O:"). */
#define STRIPNAME(i) (stripformat[i] + 3)
8152
8153/* externally visible for str.strip(unicode) */
8154PyObject *
8155_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
8156{
8157    Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8158    Py_ssize_t len = PyUnicode_GET_SIZE(self);
8159    Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
8160    Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
8161    Py_ssize_t i, j;
8162
8163    BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
8164
8165    i = 0;
8166    if (striptype != RIGHTSTRIP) {
8167        while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
8168            i++;
8169        }
8170    }
8171
8172    j = len;
8173    if (striptype != LEFTSTRIP) {
8174        do {
8175            j--;
8176        } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
8177        j++;
8178    }
8179
8180    if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
8181        Py_INCREF(self);
8182        return (PyObject*)self;
8183    }
8184    else
8185        return PyUnicode_FromUnicode(s+i, j-i);
8186}
8187
8188
8189static PyObject *
8190do_strip(PyUnicodeObject *self, int striptype)
8191{
8192    Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8193    Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
8194
8195    i = 0;
8196    if (striptype != RIGHTSTRIP) {
8197        while (i < len && Py_UNICODE_ISSPACE(s[i])) {
8198            i++;
8199        }
8200    }
8201
8202    j = len;
8203    if (striptype != LEFTSTRIP) {
8204        do {
8205            j--;
8206        } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
8207        j++;
8208    }
8209
8210    if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
8211        Py_INCREF(self);
8212        return (PyObject*)self;
8213    }
8214    else
8215        return PyUnicode_FromUnicode(s+i, j-i);
8216}
8217
8218
8219static PyObject *
8220do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
8221{
8222    PyObject *sep = NULL;
8223
8224    if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
8225        return NULL;
8226
8227    if (sep != NULL && sep != Py_None) {
8228        if (PyUnicode_Check(sep))
8229            return _PyUnicode_XStrip(self, striptype, sep);
8230        else {
8231            PyErr_Format(PyExc_TypeError,
8232                         "%s arg must be None or str",
8233                         STRIPNAME(striptype));
8234            return NULL;
8235        }
8236    }
8237
8238    return do_strip(self, striptype);
8239}
8240
8241
8242PyDoc_STRVAR(strip__doc__,
8243             "S.strip([chars]) -> str\n\
8244\n\
8245Return a copy of the string S with leading and trailing\n\
8246whitespace removed.\n\
8247If chars is given and not None, remove characters in chars instead.");
8248
8249static PyObject *
8250unicode_strip(PyUnicodeObject *self, PyObject *args)
8251{
8252    if (PyTuple_GET_SIZE(args) == 0)
8253        return do_strip(self, BOTHSTRIP); /* Common case */
8254    else
8255        return do_argstrip(self, BOTHSTRIP, args);
8256}
8257
8258
8259PyDoc_STRVAR(lstrip__doc__,
8260             "S.lstrip([chars]) -> str\n\
8261\n\
8262Return a copy of the string S with leading whitespace removed.\n\
8263If chars is given and not None, remove characters in chars instead.");
8264
8265static PyObject *
8266unicode_lstrip(PyUnicodeObject *self, PyObject *args)
8267{
8268    if (PyTuple_GET_SIZE(args) == 0)
8269        return do_strip(self, LEFTSTRIP); /* Common case */
8270    else
8271        return do_argstrip(self, LEFTSTRIP, args);
8272}
8273
8274
8275PyDoc_STRVAR(rstrip__doc__,
8276             "S.rstrip([chars]) -> str\n\
8277\n\
8278Return a copy of the string S with trailing whitespace removed.\n\
8279If chars is given and not None, remove characters in chars instead.");
8280
8281static PyObject *
8282unicode_rstrip(PyUnicodeObject *self, PyObject *args)
8283{
8284    if (PyTuple_GET_SIZE(args) == 0)
8285        return do_strip(self, RIGHTSTRIP); /* Common case */
8286    else
8287        return do_argstrip(self, RIGHTSTRIP, args);
8288}
8289
8290
8291static PyObject*
8292unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
8293{
8294    PyUnicodeObject *u;
8295    Py_UNICODE *p;
8296    Py_ssize_t nchars;
8297    size_t nbytes;
8298
8299    if (len < 1) {
8300        Py_INCREF(unicode_empty);
8301        return (PyObject *)unicode_empty;
8302    }
8303
8304    if (len == 1 && PyUnicode_CheckExact(str)) {
8305        /* no repeat, return original string */
8306        Py_INCREF(str);
8307        return (PyObject*) str;
8308    }
8309
8310    /* ensure # of chars needed doesn't overflow int and # of bytes
8311     * needed doesn't overflow size_t
8312     */
8313    nchars = len * str->length;
8314    if (nchars / len != str->length) {
8315        PyErr_SetString(PyExc_OverflowError,
8316                        "repeated string is too long");
8317        return NULL;
8318    }
8319    nbytes = (nchars + 1) * sizeof(Py_UNICODE);
8320    if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
8321        PyErr_SetString(PyExc_OverflowError,
8322                        "repeated string is too long");
8323        return NULL;
8324    }
8325    u = _PyUnicode_New(nchars);
8326    if (!u)
8327        return NULL;
8328
8329    p = u->str;
8330
8331    if (str->length == 1) {
8332        Py_UNICODE_FILL(p, str->str[0], len);
8333    } else {
8334        Py_ssize_t done = str->length; /* number of characters copied this far */
8335        Py_UNICODE_COPY(p, str->str, str->length);
8336        while (done < nchars) {
8337            Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
8338            Py_UNICODE_COPY(p+done, p, n);
8339            done += n;
8340        }
8341    }
8342
8343    return (PyObject*) u;
8344}
8345
8346PyObject *
8347PyUnicode_Replace(PyObject *obj,
8348                  PyObject *subobj,
8349                  PyObject *replobj,
8350                  Py_ssize_t maxcount)
8351{
8352    PyObject *self;
8353    PyObject *str1;
8354    PyObject *str2;
8355    PyObject *result;
8356
8357    self = PyUnicode_FromObject(obj);
8358    if (self == NULL)
8359        return NULL;
8360    str1 = PyUnicode_FromObject(subobj);
8361    if (str1 == NULL) {
8362        Py_DECREF(self);
8363        return NULL;
8364    }
8365    str2 = PyUnicode_FromObject(replobj);
8366    if (str2 == NULL) {
8367        Py_DECREF(self);
8368        Py_DECREF(str1);
8369        return NULL;
8370    }
8371    result = replace((PyUnicodeObject *)self,
8372                     (PyUnicodeObject *)str1,
8373                     (PyUnicodeObject *)str2,
8374                     maxcount);
8375    Py_DECREF(self);
8376    Py_DECREF(str1);
8377    Py_DECREF(str2);
8378    return result;
8379}
8380
8381PyDoc_STRVAR(replace__doc__,
8382             "S.replace(old, new[, count]) -> str\n\
8383\n\
8384Return a copy of S with all occurrences of substring\n\
8385old replaced by new.  If the optional argument count is\n\
8386given, only the first count occurrences are replaced.");
8387
8388static PyObject*
8389unicode_replace(PyUnicodeObject *self, PyObject *args)
8390{
8391    PyUnicodeObject *str1;
8392    PyUnicodeObject *str2;
8393    Py_ssize_t maxcount = -1;
8394    PyObject *result;
8395
8396    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
8397        return NULL;
8398    str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8399    if (str1 == NULL)
8400        return NULL;
8401    str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
8402    if (str2 == NULL) {
8403        Py_DECREF(str1);
8404        return NULL;
8405    }
8406
8407    result = replace(self, str1, str2, maxcount);
8408
8409    Py_DECREF(str1);
8410    Py_DECREF(str2);
8411    return result;
8412}
8413
8414static PyObject *
8415unicode_repr(PyObject *unicode)
8416{
8417    PyObject *repr;
8418    Py_UNICODE *p;
8419    Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8420    Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8421
8422    /* XXX(nnorwitz): rather than over-allocating, it would be
8423       better to choose a different scheme.  Perhaps scan the
8424       first N-chars of the string and allocate based on that size.
8425    */
8426    /* Initial allocation is based on the longest-possible unichr
8427       escape.
8428
8429       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8430       unichr, so in this case it's the longest unichr escape. In
8431       narrow (UTF-16) builds this is five chars per source unichr
8432       since there are two unichrs in the surrogate pair, so in narrow
8433       (UTF-16) builds it's not the longest unichr escape.
8434
8435       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8436       so in the narrow (UTF-16) build case it's the longest unichr
8437       escape.
8438    */
8439
8440    repr = PyUnicode_FromUnicode(NULL,
8441                                 2 /* quotes */
8442#ifdef Py_UNICODE_WIDE
8443                                 + 10*size
8444#else
8445                                 + 6*size
8446#endif
8447                                 + 1);
8448    if (repr == NULL)
8449        return NULL;
8450
8451    p = PyUnicode_AS_UNICODE(repr);
8452
8453    /* Add quote */
8454    *p++ = (findchar(s, size, '\'') &&
8455            !findchar(s, size, '"')) ? '"' : '\'';
8456    while (size-- > 0) {
8457        Py_UNICODE ch = *s++;
8458
8459        /* Escape quotes and backslashes */
8460        if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
8461            *p++ = '\\';
8462            *p++ = ch;
8463            continue;
8464        }
8465
8466        /* Map special whitespace to '\t', \n', '\r' */
8467        if (ch == '\t') {
8468            *p++ = '\\';
8469            *p++ = 't';
8470        }
8471        else if (ch == '\n') {
8472            *p++ = '\\';
8473            *p++ = 'n';
8474        }
8475        else if (ch == '\r') {
8476            *p++ = '\\';
8477            *p++ = 'r';
8478        }
8479
8480        /* Map non-printable US ASCII to '\xhh' */
8481        else if (ch < ' ' || ch == 0x7F) {
8482            *p++ = '\\';
8483            *p++ = 'x';
8484            *p++ = hexdigits[(ch >> 4) & 0x000F];
8485            *p++ = hexdigits[ch & 0x000F];
8486        }
8487
8488        /* Copy ASCII characters as-is */
8489        else if (ch < 0x7F) {
8490            *p++ = ch;
8491        }
8492
8493        /* Non-ASCII characters */
8494        else {
8495            Py_UCS4 ucs = ch;
8496
8497#ifndef Py_UNICODE_WIDE
8498            Py_UNICODE ch2 = 0;
8499            /* Get code point from surrogate pair */
8500            if (size > 0) {
8501                ch2 = *s;
8502                if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
8503                    && ch2 <= 0xDFFF) {
8504                    ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
8505                        + 0x00010000;
8506                    s++;
8507                    size--;
8508                }
8509            }
8510#endif
8511            /* Map Unicode whitespace and control characters
8512               (categories Z* and C* except ASCII space)
8513            */
8514            if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8515                /* Map 8-bit characters to '\xhh' */
8516                if (ucs <= 0xff) {
8517                    *p++ = '\\';
8518                    *p++ = 'x';
8519                    *p++ = hexdigits[(ch >> 4) & 0x000F];
8520                    *p++ = hexdigits[ch & 0x000F];
8521                }
8522                /* Map 21-bit characters to '\U00xxxxxx' */
8523                else if (ucs >= 0x10000) {
8524                    *p++ = '\\';
8525                    *p++ = 'U';
8526                    *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8527                    *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8528                    *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8529                    *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8530                    *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8531                    *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8532                    *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8533                    *p++ = hexdigits[ucs & 0x0000000F];
8534                }
8535                /* Map 16-bit characters to '\uxxxx' */
8536                else {
8537                    *p++ = '\\';
8538                    *p++ = 'u';
8539                    *p++ = hexdigits[(ucs >> 12) & 0x000F];
8540                    *p++ = hexdigits[(ucs >> 8) & 0x000F];
8541                    *p++ = hexdigits[(ucs >> 4) & 0x000F];
8542                    *p++ = hexdigits[ucs & 0x000F];
8543                }
8544            }
8545            /* Copy characters as-is */
8546            else {
8547                *p++ = ch;
8548#ifndef Py_UNICODE_WIDE
8549                if (ucs >= 0x10000)
8550                    *p++ = ch2;
8551#endif
8552            }
8553        }
8554    }
8555    /* Add quote */
8556    *p++ = PyUnicode_AS_UNICODE(repr)[0];
8557
8558    *p = '\0';
8559    PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
8560    return repr;
8561}
8562
8563PyDoc_STRVAR(rfind__doc__,
8564             "S.rfind(sub[, start[, end]]) -> int\n\
8565\n\
8566Return the highest index in S where substring sub is found,\n\
8567such that sub is contained within s[start:end].  Optional\n\
8568arguments start and end are interpreted as in slice notation.\n\
8569\n\
8570Return -1 on failure.");
8571
8572static PyObject *
8573unicode_rfind(PyUnicodeObject *self, PyObject *args)
8574{
8575    PyObject *substring;
8576    Py_ssize_t start;
8577    Py_ssize_t end;
8578    Py_ssize_t result;
8579
8580    if (!_ParseTupleFinds(args, &substring, &start, &end))
8581        return NULL;
8582
8583    result = stringlib_rfind_slice(
8584        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8585        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8586        start, end
8587        );
8588
8589    Py_DECREF(substring);
8590
8591    return PyLong_FromSsize_t(result);
8592}
8593
8594PyDoc_STRVAR(rindex__doc__,
8595             "S.rindex(sub[, start[, end]]) -> int\n\
8596\n\
8597Like S.rfind() but raise ValueError when the substring is not found.");
8598
8599static PyObject *
8600unicode_rindex(PyUnicodeObject *self, PyObject *args)
8601{
8602    PyObject *substring;
8603    Py_ssize_t start;
8604    Py_ssize_t end;
8605    Py_ssize_t result;
8606
8607    if (!_ParseTupleFinds(args, &substring, &start, &end))
8608        return NULL;
8609
8610    result = stringlib_rfind_slice(
8611        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8612        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8613        start, end
8614        );
8615
8616    Py_DECREF(substring);
8617
8618    if (result < 0) {
8619        PyErr_SetString(PyExc_ValueError, "substring not found");
8620        return NULL;
8621    }
8622    return PyLong_FromSsize_t(result);
8623}
8624
8625PyDoc_STRVAR(rjust__doc__,
8626             "S.rjust(width[, fillchar]) -> str\n\
8627\n\
8628Return S right-justified in a string of length width. Padding is\n\
8629done using the specified fill character (default is a space).");
8630
8631static PyObject *
8632unicode_rjust(PyUnicodeObject *self, PyObject *args)
8633{
8634    Py_ssize_t width;
8635    Py_UNICODE fillchar = ' ';
8636
8637    if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
8638        return NULL;
8639
8640    if (self->length >= width && PyUnicode_CheckExact(self)) {
8641        Py_INCREF(self);
8642        return (PyObject*) self;
8643    }
8644
8645    return (PyObject*) pad(self, width - self->length, 0, fillchar);
8646}
8647
8648PyObject *
8649PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
8650{
8651    PyObject *result;
8652
8653    s = PyUnicode_FromObject(s);
8654    if (s == NULL)
8655        return NULL;
8656    if (sep != NULL) {
8657        sep = PyUnicode_FromObject(sep);
8658        if (sep == NULL) {
8659            Py_DECREF(s);
8660            return NULL;
8661        }
8662    }
8663
8664    result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8665
8666    Py_DECREF(s);
8667    Py_XDECREF(sep);
8668    return result;
8669}
8670
8671PyDoc_STRVAR(split__doc__,
8672             "S.split([sep[, maxsplit]]) -> list of strings\n\
8673\n\
8674Return a list of the words in S, using sep as the\n\
8675delimiter string.  If maxsplit is given, at most maxsplit\n\
8676splits are done. If sep is not specified or is None, any\n\
8677whitespace string is a separator and empty strings are\n\
8678removed from the result.");
8679
8680static PyObject*
8681unicode_split(PyUnicodeObject *self, PyObject *args)
8682{
8683    PyObject *substring = Py_None;
8684    Py_ssize_t maxcount = -1;
8685
8686    if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
8687        return NULL;
8688
8689    if (substring == Py_None)
8690        return split(self, NULL, maxcount);
8691    else if (PyUnicode_Check(substring))
8692        return split(self, (PyUnicodeObject *)substring, maxcount);
8693    else
8694        return PyUnicode_Split((PyObject *)self, substring, maxcount);
8695}
8696
8697PyObject *
8698PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8699{
8700    PyObject* str_obj;
8701    PyObject* sep_obj;
8702    PyObject* out;
8703
8704    str_obj = PyUnicode_FromObject(str_in);
8705    if (!str_obj)
8706        return NULL;
8707    sep_obj = PyUnicode_FromObject(sep_in);
8708    if (!sep_obj) {
8709        Py_DECREF(str_obj);
8710        return NULL;
8711    }
8712
8713    out = stringlib_partition(
8714        str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8715        sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8716        );
8717
8718    Py_DECREF(sep_obj);
8719    Py_DECREF(str_obj);
8720
8721    return out;
8722}
8723
8724
8725PyObject *
8726PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8727{
8728    PyObject* str_obj;
8729    PyObject* sep_obj;
8730    PyObject* out;
8731
8732    str_obj = PyUnicode_FromObject(str_in);
8733    if (!str_obj)
8734        return NULL;
8735    sep_obj = PyUnicode_FromObject(sep_in);
8736    if (!sep_obj) {
8737        Py_DECREF(str_obj);
8738        return NULL;
8739    }
8740
8741    out = stringlib_rpartition(
8742        str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8743        sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8744        );
8745
8746    Py_DECREF(sep_obj);
8747    Py_DECREF(str_obj);
8748
8749    return out;
8750}
8751
8752PyDoc_STRVAR(partition__doc__,
8753             "S.partition(sep) -> (head, sep, tail)\n\
8754\n\
8755Search for the separator sep in S, and return the part before it,\n\
8756the separator itself, and the part after it.  If the separator is not\n\
8757found, return S and two empty strings.");
8758
8759static PyObject*
8760unicode_partition(PyUnicodeObject *self, PyObject *separator)
8761{
8762    return PyUnicode_Partition((PyObject *)self, separator);
8763}
8764
8765PyDoc_STRVAR(rpartition__doc__,
8766             "S.rpartition(sep) -> (head, sep, tail)\n\
8767\n\
8768Search for the separator sep in S, starting at the end of S, and return\n\
8769the part before it, the separator itself, and the part after it.  If the\n\
8770separator is not found, return two empty strings and S.");
8771
8772static PyObject*
8773unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8774{
8775    return PyUnicode_RPartition((PyObject *)self, separator);
8776}
8777
8778PyObject *
8779PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
8780{
8781    PyObject *result;
8782
8783    s = PyUnicode_FromObject(s);
8784    if (s == NULL)
8785        return NULL;
8786    if (sep != NULL) {
8787        sep = PyUnicode_FromObject(sep);
8788        if (sep == NULL) {
8789            Py_DECREF(s);
8790            return NULL;
8791        }
8792    }
8793
8794    result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8795
8796    Py_DECREF(s);
8797    Py_XDECREF(sep);
8798    return result;
8799}
8800
8801PyDoc_STRVAR(rsplit__doc__,
8802             "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
8803\n\
8804Return a list of the words in S, using sep as the\n\
8805delimiter string, starting at the end of the string and\n\
8806working to the front.  If maxsplit is given, at most maxsplit\n\
8807splits are done. If sep is not specified, any whitespace string\n\
8808is a separator.");
8809
8810static PyObject*
8811unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8812{
8813    PyObject *substring = Py_None;
8814    Py_ssize_t maxcount = -1;
8815
8816    if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
8817        return NULL;
8818
8819    if (substring == Py_None)
8820        return rsplit(self, NULL, maxcount);
8821    else if (PyUnicode_Check(substring))
8822        return rsplit(self, (PyUnicodeObject *)substring, maxcount);
8823    else
8824        return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
8825}
8826
8827PyDoc_STRVAR(splitlines__doc__,
8828             "S.splitlines([keepends]) -> list of strings\n\
8829\n\
8830Return a list of the lines in S, breaking at line boundaries.\n\
8831Line breaks are not included in the resulting list unless keepends\n\
8832is given and true.");
8833
8834static PyObject*
8835unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8836{
8837    int keepends = 0;
8838
8839    if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
8840        return NULL;
8841
8842    return PyUnicode_Splitlines((PyObject *)self, keepends);
8843}
8844
8845static
8846PyObject *unicode_str(PyObject *self)
8847{
8848    if (PyUnicode_CheckExact(self)) {
8849        Py_INCREF(self);
8850        return self;
8851    } else
8852        /* Subtype -- return genuine unicode string with the same value. */
8853        return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8854                                     PyUnicode_GET_SIZE(self));
8855}
8856
8857PyDoc_STRVAR(swapcase__doc__,
8858             "S.swapcase() -> str\n\
8859\n\
8860Return a copy of S with uppercase characters converted to lowercase\n\
8861and vice versa.");
8862
8863static PyObject*
8864unicode_swapcase(PyUnicodeObject *self)
8865{
8866    return fixup(self, fixswapcase);
8867}
8868
8869PyDoc_STRVAR(maketrans__doc__,
8870             "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
8871\n\
8872Return a translation table usable for str.translate().\n\
8873If there is only one argument, it must be a dictionary mapping Unicode\n\
8874ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
8875Character keys will be then converted to ordinals.\n\
8876If there are two arguments, they must be strings of equal length, and\n\
8877in the resulting dictionary, each character in x will be mapped to the\n\
8878character at the same position in y. If there is a third argument, it\n\
8879must be a string, whose characters will be mapped to None in the result.");
8880
8881static PyObject*
8882unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8883{
8884    PyObject *x, *y = NULL, *z = NULL;
8885    PyObject *new = NULL, *key, *value;
8886    Py_ssize_t i = 0;
8887    int res;
8888
8889    if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8890        return NULL;
8891    new = PyDict_New();
8892    if (!new)
8893        return NULL;
8894    if (y != NULL) {
8895        /* x must be a string too, of equal length */
8896        Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8897        if (!PyUnicode_Check(x)) {
8898            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8899                            "be a string if there is a second argument");
8900            goto err;
8901        }
8902        if (PyUnicode_GET_SIZE(x) != ylen) {
8903            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8904                            "arguments must have equal length");
8905            goto err;
8906        }
8907        /* create entries for translating chars in x to those in y */
8908        for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
8909            key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8910            value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
8911            if (!key || !value)
8912                goto err;
8913            res = PyDict_SetItem(new, key, value);
8914            Py_DECREF(key);
8915            Py_DECREF(value);
8916            if (res < 0)
8917                goto err;
8918        }
8919        /* create entries for deleting chars in z */
8920        if (z != NULL) {
8921            for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
8922                key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
8923                if (!key)
8924                    goto err;
8925                res = PyDict_SetItem(new, key, Py_None);
8926                Py_DECREF(key);
8927                if (res < 0)
8928                    goto err;
8929            }
8930        }
8931    } else {
8932        /* x must be a dict */
8933        if (!PyDict_CheckExact(x)) {
8934            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8935                            "to maketrans it must be a dict");
8936            goto err;
8937        }
8938        /* copy entries into the new dict, converting string keys to int keys */
8939        while (PyDict_Next(x, &i, &key, &value)) {
8940            if (PyUnicode_Check(key)) {
8941                /* convert string keys to integer keys */
8942                PyObject *newkey;
8943                if (PyUnicode_GET_SIZE(key) != 1) {
8944                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
8945                                    "table must be of length 1");
8946                    goto err;
8947                }
8948                newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
8949                if (!newkey)
8950                    goto err;
8951                res = PyDict_SetItem(new, newkey, value);
8952                Py_DECREF(newkey);
8953                if (res < 0)
8954                    goto err;
8955            } else if (PyLong_Check(key)) {
8956                /* just keep integer keys */
8957                if (PyDict_SetItem(new, key, value) < 0)
8958                    goto err;
8959            } else {
8960                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8961                                "be strings or integers");
8962                goto err;
8963            }
8964        }
8965    }
8966    return new;
8967  err:
8968    Py_DECREF(new);
8969    return NULL;
8970}
8971
8972PyDoc_STRVAR(translate__doc__,
8973             "S.translate(table) -> str\n\
8974\n\
8975Return a copy of the string S, where all characters have been mapped\n\
8976through the given translation table, which must be a mapping of\n\
8977Unicode ordinals to Unicode ordinals, strings, or None.\n\
8978Unmapped characters are left untouched. Characters mapped to None\n\
8979are deleted.");
8980
8981static PyObject*
8982unicode_translate(PyUnicodeObject *self, PyObject *table)
8983{
8984    return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
8985}
8986
8987PyDoc_STRVAR(upper__doc__,
8988             "S.upper() -> str\n\
8989\n\
8990Return a copy of S converted to uppercase.");
8991
8992static PyObject*
8993unicode_upper(PyUnicodeObject *self)
8994{
8995    return fixup(self, fixupper);
8996}
8997
8998PyDoc_STRVAR(zfill__doc__,
8999             "S.zfill(width) -> str\n\
9000\n\
9001Pad a numeric string S with zeros on the left, to fill a field\n\
9002of the specified width. The string S is never truncated.");
9003
9004static PyObject *
9005unicode_zfill(PyUnicodeObject *self, PyObject *args)
9006{
9007    Py_ssize_t fill;
9008    PyUnicodeObject *u;
9009
9010    Py_ssize_t width;
9011    if (!PyArg_ParseTuple(args, "n:zfill", &width))
9012        return NULL;
9013
9014    if (self->length >= width) {
9015        if (PyUnicode_CheckExact(self)) {
9016            Py_INCREF(self);
9017            return (PyObject*) self;
9018        }
9019        else
9020            return PyUnicode_FromUnicode(
9021                PyUnicode_AS_UNICODE(self),
9022                PyUnicode_GET_SIZE(self)
9023                );
9024    }
9025
9026    fill = width - self->length;
9027
9028    u = pad(self, fill, 0, '0');
9029
9030    if (u == NULL)
9031        return NULL;
9032
9033    if (u->str[fill] == '+' || u->str[fill] == '-') {
9034        /* move sign to beginning of string */
9035        u->str[0] = u->str[fill];
9036        u->str[fill] = '0';
9037    }
9038
9039    return (PyObject*) u;
9040}
9041
#if 0
/* Disabled helpers; the method table lists them (also under #if 0) as
   being used only for debugging the implementation. */

/* Return the value of numfree as an int object. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyLong_FromLong(count);
}

/* Run PyUnicode_TransformDecimalToASCII() over the string's characters. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    Py_UNICODE *chars = PyUnicode_AS_UNICODE(self);
    Py_ssize_t nchars = PyUnicode_GET_SIZE(self);

    return PyUnicode_TransformDecimalToASCII(chars, nchars);
}
#endif
9056
9057PyDoc_STRVAR(startswith__doc__,
9058             "S.startswith(prefix[, start[, end]]) -> bool\n\
9059\n\
9060Return True if S starts with the specified prefix, False otherwise.\n\
9061With optional start, test S beginning at that position.\n\
9062With optional end, stop comparing S at that position.\n\
9063prefix can also be a tuple of strings to try.");
9064
9065static PyObject *
9066unicode_startswith(PyUnicodeObject *self,
9067                   PyObject *args)
9068{
9069    PyObject *subobj;
9070    PyUnicodeObject *substring;
9071    Py_ssize_t start = 0;
9072    Py_ssize_t end = PY_SSIZE_T_MAX;
9073    int result;
9074
9075    if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
9076                          _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
9077        return NULL;
9078    if (PyTuple_Check(subobj)) {
9079        Py_ssize_t i;
9080        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9081            substring = (PyUnicodeObject *)PyUnicode_FromObject(
9082                PyTuple_GET_ITEM(subobj, i));
9083            if (substring == NULL)
9084                return NULL;
9085            result = tailmatch(self, substring, start, end, -1);
9086            Py_DECREF(substring);
9087            if (result) {
9088                Py_RETURN_TRUE;
9089            }
9090        }
9091        /* nothing matched */
9092        Py_RETURN_FALSE;
9093    }
9094    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
9095    if (substring == NULL)
9096        return NULL;
9097    result = tailmatch(self, substring, start, end, -1);
9098    Py_DECREF(substring);
9099    return PyBool_FromLong(result);
9100}
9101
9102
9103PyDoc_STRVAR(endswith__doc__,
9104             "S.endswith(suffix[, start[, end]]) -> bool\n\
9105\n\
9106Return True if S ends with the specified suffix, False otherwise.\n\
9107With optional start, test S beginning at that position.\n\
9108With optional end, stop comparing S at that position.\n\
9109suffix can also be a tuple of strings to try.");
9110
9111static PyObject *
9112unicode_endswith(PyUnicodeObject *self,
9113                 PyObject *args)
9114{
9115    PyObject *subobj;
9116    PyUnicodeObject *substring;
9117    Py_ssize_t start = 0;
9118    Py_ssize_t end = PY_SSIZE_T_MAX;
9119    int result;
9120
9121    if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
9122                          _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
9123        return NULL;
9124    if (PyTuple_Check(subobj)) {
9125        Py_ssize_t i;
9126        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9127            substring = (PyUnicodeObject *)PyUnicode_FromObject(
9128                PyTuple_GET_ITEM(subobj, i));
9129            if (substring == NULL)
9130                return NULL;
9131            result = tailmatch(self, substring, start, end, +1);
9132            Py_DECREF(substring);
9133            if (result) {
9134                Py_RETURN_TRUE;
9135            }
9136        }
9137        Py_RETURN_FALSE;
9138    }
9139    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
9140    if (substring == NULL)
9141        return NULL;
9142
9143    result = tailmatch(self, substring, start, end, +1);
9144    Py_DECREF(substring);
9145    return PyBool_FromLong(result);
9146}
9147
9148#include "stringlib/string_format.h"
9149
9150PyDoc_STRVAR(format__doc__,
9151             "S.format(*args, **kwargs) -> str\n\
9152\n\
9153Return a formatted version of S, using substitutions from args and kwargs.\n\
9154The substitutions are identified by braces ('{' and '}').");
9155
9156PyDoc_STRVAR(format_map__doc__,
9157             "S.format_map(mapping) -> str\n\
9158\n\
9159Return a formatted version of S, using substitutions from mapping.\n\
9160The substitutions are identified by braces ('{' and '}').");
9161
9162static PyObject *
9163unicode__format__(PyObject* self, PyObject* args)
9164{
9165    PyObject *format_spec;
9166
9167    if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
9168        return NULL;
9169
9170    return _PyUnicode_FormatAdvanced(self,
9171                                     PyUnicode_AS_UNICODE(format_spec),
9172                                     PyUnicode_GET_SIZE(format_spec));
9173}
9174
9175PyDoc_STRVAR(p_format__doc__,
9176             "S.__format__(format_spec) -> str\n\
9177\n\
9178Return a formatted version of S as described by format_spec.");
9179
9180static PyObject *
9181unicode__sizeof__(PyUnicodeObject *v)
9182{
9183    return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
9184                              sizeof(Py_UNICODE) * (v->length + 1));
9185}
9186
9187PyDoc_STRVAR(sizeof__doc__,
9188             "S.__sizeof__() -> size of S in memory, in bytes");
9189
9190static PyObject *
9191unicode_getnewargs(PyUnicodeObject *v)
9192{
9193    return Py_BuildValue("(u#)", v->str, v->length);
9194}
9195
9196static PyMethodDef unicode_methods[] = {
9197
9198    /* Order is according to common usage: often used methods should
9199       appear first, since lookup is done sequentially. */
9200
9201    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
9202    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
9203    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
9204    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
9205    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
9206    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
9207    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
9208    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
9209    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
9210    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
9211    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
9212    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
9213    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
9214    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
9215    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
9216    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
9217    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
9218    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
9219    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
9220    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
9221    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
9222    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
9223    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
9224    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
9225    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
9226    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
9227    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
9228    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
9229    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
9230    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
9231    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
9232    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
9233    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
9234    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
9235    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
9236    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
9237    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
9238    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
9239    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
9240    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
9241    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
9242    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
9243    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
9244    {"maketrans", (PyCFunction) unicode_maketrans,
9245     METH_VARARGS | METH_STATIC, maketrans__doc__},
9246    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
9247#if 0
9248    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
9249#endif
9250
9251#if 0
9252    /* These methods are just used for debugging the implementation. */
9253    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
9254    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
9255#endif
9256
9257    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
9258    {NULL, NULL}
9259};
9260
9261static PyObject *
9262unicode_mod(PyObject *v, PyObject *w)
9263{
9264    if (!PyUnicode_Check(v)) {
9265        Py_INCREF(Py_NotImplemented);
9266        return Py_NotImplemented;
9267    }
9268    return PyUnicode_Format(v, w);
9269}
9270
9271static PyNumberMethods unicode_as_number = {
9272    0,              /*nb_add*/
9273    0,              /*nb_subtract*/
9274    0,              /*nb_multiply*/
9275    unicode_mod,            /*nb_remainder*/
9276};
9277
9278static PySequenceMethods unicode_as_sequence = {
9279    (lenfunc) unicode_length,       /* sq_length */
9280    PyUnicode_Concat,           /* sq_concat */
9281    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
9282    (ssizeargfunc) unicode_getitem,     /* sq_item */
9283    0,                  /* sq_slice */
9284    0,                  /* sq_ass_item */
9285    0,                  /* sq_ass_slice */
9286    PyUnicode_Contains,         /* sq_contains */
9287};
9288
9289static PyObject*
9290unicode_subscript(PyUnicodeObject* self, PyObject* item)
9291{
9292    if (PyIndex_Check(item)) {
9293        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
9294        if (i == -1 && PyErr_Occurred())
9295            return NULL;
9296        if (i < 0)
9297            i += PyUnicode_GET_SIZE(self);
9298        return unicode_getitem(self, i);
9299    } else if (PySlice_Check(item)) {
9300        Py_ssize_t start, stop, step, slicelength, cur, i;
9301        Py_UNICODE* source_buf;
9302        Py_UNICODE* result_buf;
9303        PyObject* result;
9304
9305        if (PySlice_GetIndicesEx(item, PyUnicode_GET_SIZE(self),
9306                                 &start, &stop, &step, &slicelength) < 0) {
9307            return NULL;
9308        }
9309
9310        if (slicelength <= 0) {
9311            return PyUnicode_FromUnicode(NULL, 0);
9312        } else if (start == 0 && step == 1 && slicelength == self->length &&
9313                   PyUnicode_CheckExact(self)) {
9314            Py_INCREF(self);
9315            return (PyObject *)self;
9316        } else if (step == 1) {
9317            return PyUnicode_FromUnicode(self->str + start, slicelength);
9318        } else {
9319            source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
9320            result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
9321                                                       sizeof(Py_UNICODE));
9322
9323            if (result_buf == NULL)
9324                return PyErr_NoMemory();
9325
9326            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
9327                result_buf[i] = source_buf[cur];
9328            }
9329
9330            result = PyUnicode_FromUnicode(result_buf, slicelength);
9331            PyObject_FREE(result_buf);
9332            return result;
9333        }
9334    } else {
9335        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
9336        return NULL;
9337    }
9338}
9339
9340static PyMappingMethods unicode_as_mapping = {
9341    (lenfunc)unicode_length,        /* mp_length */
9342    (binaryfunc)unicode_subscript,  /* mp_subscript */
9343    (objobjargproc)0,           /* mp_ass_subscript */
9344};
9345
9346
9347/* Helpers for PyUnicode_Format() */
9348
9349static PyObject *
9350getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
9351{
9352    Py_ssize_t argidx = *p_argidx;
9353    if (argidx < arglen) {
9354        (*p_argidx)++;
9355        if (arglen < 0)
9356            return args;
9357        else
9358            return PyTuple_GetItem(args, argidx);
9359    }
9360    PyErr_SetString(PyExc_TypeError,
9361                    "not enough arguments for format string");
9362    return NULL;
9363}
9364
9365/* Returns a new reference to a PyUnicode object, or NULL on failure. */
9366
9367static PyObject *
9368formatfloat(PyObject *v, int flags, int prec, int type)
9369{
9370    char *p;
9371    PyObject *result;
9372    double x;
9373
9374    x = PyFloat_AsDouble(v);
9375    if (x == -1.0 && PyErr_Occurred())
9376        return NULL;
9377
9378    if (prec < 0)
9379        prec = 6;
9380
9381    p = PyOS_double_to_string(x, type, prec,
9382                              (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
9383    if (p == NULL)
9384        return NULL;
9385    result = PyUnicode_FromStringAndSize(p, strlen(p));
9386    PyMem_Free(p);
9387    return result;
9388}
9389
9390static PyObject*
9391formatlong(PyObject *val, int flags, int prec, int type)
9392{
9393    char *buf;
9394    int len;
9395    PyObject *str; /* temporary string object. */
9396    PyObject *result;
9397
9398    str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9399    if (!str)
9400        return NULL;
9401    result = PyUnicode_FromStringAndSize(buf, len);
9402    Py_DECREF(str);
9403    return result;
9404}
9405
9406static int
9407formatchar(Py_UNICODE *buf,
9408           size_t buflen,
9409           PyObject *v)
9410{
9411    /* presume that the buffer is at least 3 characters long */
9412    if (PyUnicode_Check(v)) {
9413        if (PyUnicode_GET_SIZE(v) == 1) {
9414            buf[0] = PyUnicode_AS_UNICODE(v)[0];
9415            buf[1] = '\0';
9416            return 1;
9417        }
9418#ifndef Py_UNICODE_WIDE
9419        if (PyUnicode_GET_SIZE(v) == 2) {
9420            /* Decode a valid surrogate pair */
9421            int c0 = PyUnicode_AS_UNICODE(v)[0];
9422            int c1 = PyUnicode_AS_UNICODE(v)[1];
9423            if (0xD800 <= c0 && c0 <= 0xDBFF &&
9424                0xDC00 <= c1 && c1 <= 0xDFFF) {
9425                buf[0] = c0;
9426                buf[1] = c1;
9427                buf[2] = '\0';
9428                return 2;
9429            }
9430        }
9431#endif
9432        goto onError;
9433    }
9434    else {
9435        /* Integer input truncated to a character */
9436        long x;
9437        x = PyLong_AsLong(v);
9438        if (x == -1 && PyErr_Occurred())
9439            goto onError;
9440
9441        if (x < 0 || x > 0x10ffff) {
9442            PyErr_SetString(PyExc_OverflowError,
9443                            "%c arg not in range(0x110000)");
9444            return -1;
9445        }
9446
9447#ifndef Py_UNICODE_WIDE
9448        if (x > 0xffff) {
9449            x -= 0x10000;
9450            buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9451            buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9452            return 2;
9453        }
9454#endif
9455        buf[0] = (Py_UNICODE) x;
9456        buf[1] = '\0';
9457        return 1;
9458    }
9459
9460  onError:
9461    PyErr_SetString(PyExc_TypeError,
9462                    "%c requires int or char");
9463    return -1;
9464}
9465
9466/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
9467   FORMATBUFLEN is the length of the buffer in which chars are formatted.
9468*/
9469#define FORMATBUFLEN (size_t)10
9470
9471PyObject *
9472PyUnicode_Format(PyObject *format, PyObject *args)
9473{
9474    Py_UNICODE *fmt, *res;
9475    Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
9476    int args_owned = 0;
9477    PyUnicodeObject *result = NULL;
9478    PyObject *dict = NULL;
9479    PyObject *uformat;
9480
9481    if (format == NULL || args == NULL) {
9482        PyErr_BadInternalCall();
9483        return NULL;
9484    }
9485    uformat = PyUnicode_FromObject(format);
9486    if (uformat == NULL)
9487        return NULL;
9488    fmt = PyUnicode_AS_UNICODE(uformat);
9489    fmtcnt = PyUnicode_GET_SIZE(uformat);
9490
9491    reslen = rescnt = fmtcnt + 100;
9492    result = _PyUnicode_New(reslen);
9493    if (result == NULL)
9494        goto onError;
9495    res = PyUnicode_AS_UNICODE(result);
9496
9497    if (PyTuple_Check(args)) {
9498        arglen = PyTuple_Size(args);
9499        argidx = 0;
9500    }
9501    else {
9502        arglen = -1;
9503        argidx = -2;
9504    }
9505    if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
9506        !PyUnicode_Check(args))
9507        dict = args;
9508
9509    while (--fmtcnt >= 0) {
9510        if (*fmt != '%') {
9511            if (--rescnt < 0) {
9512                rescnt = fmtcnt + 100;
9513                reslen += rescnt;
9514                if (_PyUnicode_Resize(&result, reslen) < 0)
9515                    goto onError;
9516                res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9517                --rescnt;
9518            }
9519            *res++ = *fmt++;
9520        }
9521        else {
9522            /* Got a format specifier */
9523            int flags = 0;
9524            Py_ssize_t width = -1;
9525            int prec = -1;
9526            Py_UNICODE c = '\0';
9527            Py_UNICODE fill;
9528            int isnumok;
9529            PyObject *v = NULL;
9530            PyObject *temp = NULL;
9531            Py_UNICODE *pbuf;
9532            Py_UNICODE sign;
9533            Py_ssize_t len;
9534            Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
9535
9536            fmt++;
9537            if (*fmt == '(') {
9538                Py_UNICODE *keystart;
9539                Py_ssize_t keylen;
9540                PyObject *key;
9541                int pcount = 1;
9542
9543                if (dict == NULL) {
9544                    PyErr_SetString(PyExc_TypeError,
9545                                    "format requires a mapping");
9546                    goto onError;
9547                }
9548                ++fmt;
9549                --fmtcnt;
9550                keystart = fmt;
9551                /* Skip over balanced parentheses */
9552                while (pcount > 0 && --fmtcnt >= 0) {
9553                    if (*fmt == ')')
9554                        --pcount;
9555                    else if (*fmt == '(')
9556                        ++pcount;
9557                    fmt++;
9558                }
9559                keylen = fmt - keystart - 1;
9560                if (fmtcnt < 0 || pcount > 0) {
9561                    PyErr_SetString(PyExc_ValueError,
9562                                    "incomplete format key");
9563                    goto onError;
9564                }
9565#if 0
9566                /* keys are converted to strings using UTF-8 and
9567                   then looked up since Python uses strings to hold
9568                   variables names etc. in its namespaces and we
9569                   wouldn't want to break common idioms. */
9570                key = PyUnicode_EncodeUTF8(keystart,
9571                                           keylen,
9572                                           NULL);
9573#else
9574                key = PyUnicode_FromUnicode(keystart, keylen);
9575#endif
9576                if (key == NULL)
9577                    goto onError;
9578                if (args_owned) {
9579                    Py_DECREF(args);
9580                    args_owned = 0;
9581                }
9582                args = PyObject_GetItem(dict, key);
9583                Py_DECREF(key);
9584                if (args == NULL) {
9585                    goto onError;
9586                }
9587                args_owned = 1;
9588                arglen = -1;
9589                argidx = -2;
9590            }
9591            while (--fmtcnt >= 0) {
9592                switch (c = *fmt++) {
9593                case '-': flags |= F_LJUST; continue;
9594                case '+': flags |= F_SIGN; continue;
9595                case ' ': flags |= F_BLANK; continue;
9596                case '#': flags |= F_ALT; continue;
9597                case '0': flags |= F_ZERO; continue;
9598                }
9599                break;
9600            }
9601            if (c == '*') {
9602                v = getnextarg(args, arglen, &argidx);
9603                if (v == NULL)
9604                    goto onError;
9605                if (!PyLong_Check(v)) {
9606                    PyErr_SetString(PyExc_TypeError,
9607                                    "* wants int");
9608                    goto onError;
9609                }
9610                width = PyLong_AsLong(v);
9611                if (width == -1 && PyErr_Occurred())
9612                    goto onError;
9613                if (width < 0) {
9614                    flags |= F_LJUST;
9615                    width = -width;
9616                }
9617                if (--fmtcnt >= 0)
9618                    c = *fmt++;
9619            }
9620            else if (c >= '0' && c <= '9') {
9621                width = c - '0';
9622                while (--fmtcnt >= 0) {
9623                    c = *fmt++;
9624                    if (c < '0' || c > '9')
9625                        break;
9626                    if ((width*10) / 10 != width) {
9627                        PyErr_SetString(PyExc_ValueError,
9628                                        "width too big");
9629                        goto onError;
9630                    }
9631                    width = width*10 + (c - '0');
9632                }
9633            }
9634            if (c == '.') {
9635                prec = 0;
9636                if (--fmtcnt >= 0)
9637                    c = *fmt++;
9638                if (c == '*') {
9639                    v = getnextarg(args, arglen, &argidx);
9640                    if (v == NULL)
9641                        goto onError;
9642                    if (!PyLong_Check(v)) {
9643                        PyErr_SetString(PyExc_TypeError,
9644                                        "* wants int");
9645                        goto onError;
9646                    }
9647                    prec = PyLong_AsLong(v);
9648                    if (prec == -1 && PyErr_Occurred())
9649                        goto onError;
9650                    if (prec < 0)
9651                        prec = 0;
9652                    if (--fmtcnt >= 0)
9653                        c = *fmt++;
9654                }
9655                else if (c >= '0' && c <= '9') {
9656                    prec = c - '0';
9657                    while (--fmtcnt >= 0) {
9658                        c = *fmt++;
9659                        if (c < '0' || c > '9')
9660                            break;
9661                        if ((prec*10) / 10 != prec) {
9662                            PyErr_SetString(PyExc_ValueError,
9663                                            "prec too big");
9664                            goto onError;
9665                        }
9666                        prec = prec*10 + (c - '0');
9667                    }
9668                }
9669            } /* prec */
9670            if (fmtcnt >= 0) {
9671                if (c == 'h' || c == 'l' || c == 'L') {
9672                    if (--fmtcnt >= 0)
9673                        c = *fmt++;
9674                }
9675            }
9676            if (fmtcnt < 0) {
9677                PyErr_SetString(PyExc_ValueError,
9678                                "incomplete format");
9679                goto onError;
9680            }
9681            if (c != '%') {
9682                v = getnextarg(args, arglen, &argidx);
9683                if (v == NULL)
9684                    goto onError;
9685            }
9686            sign = 0;
9687            fill = ' ';
9688            switch (c) {
9689
9690            case '%':
9691                pbuf = formatbuf;
9692                /* presume that buffer length is at least 1 */
9693                pbuf[0] = '%';
9694                len = 1;
9695                break;
9696
9697            case 's':
9698            case 'r':
9699            case 'a':
9700                if (PyUnicode_CheckExact(v) && c == 's') {
9701                    temp = v;
9702                    Py_INCREF(temp);
9703                }
9704                else {
9705                    if (c == 's')
9706                        temp = PyObject_Str(v);
9707                    else if (c == 'r')
9708                        temp = PyObject_Repr(v);
9709                    else
9710                        temp = PyObject_ASCII(v);
9711                    if (temp == NULL)
9712                        goto onError;
9713                    if (PyUnicode_Check(temp))
9714                        /* nothing to do */;
9715                    else {
9716                        Py_DECREF(temp);
9717                        PyErr_SetString(PyExc_TypeError,
9718                                        "%s argument has non-string str()");
9719                        goto onError;
9720                    }
9721                }
9722                pbuf = PyUnicode_AS_UNICODE(temp);
9723                len = PyUnicode_GET_SIZE(temp);
9724                if (prec >= 0 && len > prec)
9725                    len = prec;
9726                break;
9727
9728            case 'i':
9729            case 'd':
9730            case 'u':
9731            case 'o':
9732            case 'x':
9733            case 'X':
9734                if (c == 'i')
9735                    c = 'd';
9736                isnumok = 0;
9737                if (PyNumber_Check(v)) {
9738                    PyObject *iobj=NULL;
9739
9740                    if (PyLong_Check(v)) {
9741                        iobj = v;
9742                        Py_INCREF(iobj);
9743                    }
9744                    else {
9745                        iobj = PyNumber_Long(v);
9746                    }
9747                    if (iobj!=NULL) {
9748                        if (PyLong_Check(iobj)) {
9749                            isnumok = 1;
9750                            temp = formatlong(iobj, flags, prec, c);
9751                            Py_DECREF(iobj);
9752                            if (!temp)
9753                                goto onError;
9754                            pbuf = PyUnicode_AS_UNICODE(temp);
9755                            len = PyUnicode_GET_SIZE(temp);
9756                            sign = 1;
9757                        }
9758                        else {
9759                            Py_DECREF(iobj);
9760                        }
9761                    }
9762                }
9763                if (!isnumok) {
9764                    PyErr_Format(PyExc_TypeError,
9765                                 "%%%c format: a number is required, "
9766                                 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9767                    goto onError;
9768                }
9769                if (flags & F_ZERO)
9770                    fill = '0';
9771                break;
9772
9773            case 'e':
9774            case 'E':
9775            case 'f':
9776            case 'F':
9777            case 'g':
9778            case 'G':
9779                temp = formatfloat(v, flags, prec, c);
9780                if (!temp)
9781                    goto onError;
9782                pbuf = PyUnicode_AS_UNICODE(temp);
9783                len = PyUnicode_GET_SIZE(temp);
9784                sign = 1;
9785                if (flags & F_ZERO)
9786                    fill = '0';
9787                break;
9788
9789            case 'c':
9790                pbuf = formatbuf;
9791                len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9792                if (len < 0)
9793                    goto onError;
9794                break;
9795
9796            default:
9797                PyErr_Format(PyExc_ValueError,
9798                             "unsupported format character '%c' (0x%x) "
9799                             "at index %zd",
9800                             (31<=c && c<=126) ? (char)c : '?',
9801                             (int)c,
9802                             (Py_ssize_t)(fmt - 1 -
9803                                          PyUnicode_AS_UNICODE(uformat)));
9804                goto onError;
9805            }
9806            if (sign) {
9807                if (*pbuf == '-' || *pbuf == '+') {
9808                    sign = *pbuf++;
9809                    len--;
9810                }
9811                else if (flags & F_SIGN)
9812                    sign = '+';
9813                else if (flags & F_BLANK)
9814                    sign = ' ';
9815                else
9816                    sign = 0;
9817            }
9818            if (width < len)
9819                width = len;
9820            if (rescnt - (sign != 0) < width) {
9821                reslen -= rescnt;
9822                rescnt = width + fmtcnt + 100;
9823                reslen += rescnt;
9824                if (reslen < 0) {
9825                    Py_XDECREF(temp);
9826                    PyErr_NoMemory();
9827                    goto onError;
9828                }
9829                if (_PyUnicode_Resize(&result, reslen) < 0) {
9830                    Py_XDECREF(temp);
9831                    goto onError;
9832                }
9833                res = PyUnicode_AS_UNICODE(result)
9834                    + reslen - rescnt;
9835            }
9836            if (sign) {
9837                if (fill != ' ')
9838                    *res++ = sign;
9839                rescnt--;
9840                if (width > len)
9841                    width--;
9842            }
9843            if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9844                assert(pbuf[0] == '0');
9845                assert(pbuf[1] == c);
9846                if (fill != ' ') {
9847                    *res++ = *pbuf++;
9848                    *res++ = *pbuf++;
9849                }
9850                rescnt -= 2;
9851                width -= 2;
9852                if (width < 0)
9853                    width = 0;
9854                len -= 2;
9855            }
9856            if (width > len && !(flags & F_LJUST)) {
9857                do {
9858                    --rescnt;
9859                    *res++ = fill;
9860                } while (--width > len);
9861            }
9862            if (fill == ' ') {
9863                if (sign)
9864                    *res++ = sign;
9865                if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9866                    assert(pbuf[0] == '0');
9867                    assert(pbuf[1] == c);
9868                    *res++ = *pbuf++;
9869                    *res++ = *pbuf++;
9870                }
9871            }
9872            Py_UNICODE_COPY(res, pbuf, len);
9873            res += len;
9874            rescnt -= len;
9875            while (--width >= len) {
9876                --rescnt;
9877                *res++ = ' ';
9878            }
9879            if (dict && (argidx < arglen) && c != '%') {
9880                PyErr_SetString(PyExc_TypeError,
9881                                "not all arguments converted during string formatting");
9882                Py_XDECREF(temp);
9883                goto onError;
9884            }
9885            Py_XDECREF(temp);
9886        } /* '%' */
9887    } /* until end */
9888    if (argidx < arglen && !dict) {
9889        PyErr_SetString(PyExc_TypeError,
9890                        "not all arguments converted during string formatting");
9891        goto onError;
9892    }
9893
9894    if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
9895        goto onError;
9896    if (args_owned) {
9897        Py_DECREF(args);
9898    }
9899    Py_DECREF(uformat);
9900    return (PyObject *)result;
9901
9902  onError:
9903    Py_XDECREF(result);
9904    Py_DECREF(uformat);
9905    if (args_owned) {
9906        Py_DECREF(args);
9907    }
9908    return NULL;
9909}
9910
9911static PyObject *
9912unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9913
9914static PyObject *
9915unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9916{
9917    PyObject *x = NULL;
9918    static char *kwlist[] = {"object", "encoding", "errors", 0};
9919    char *encoding = NULL;
9920    char *errors = NULL;
9921
9922    if (type != &PyUnicode_Type)
9923        return unicode_subtype_new(type, args, kwds);
9924    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
9925                                     kwlist, &x, &encoding, &errors))
9926        return NULL;
9927    if (x == NULL)
9928        return (PyObject *)_PyUnicode_New(0);
9929    if (encoding == NULL && errors == NULL)
9930        return PyObject_Str(x);
9931    else
9932        return PyUnicode_FromEncodedObject(x, encoding, errors);
9933}
9934
9935static PyObject *
9936unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9937{
9938    PyUnicodeObject *tmp, *pnew;
9939    Py_ssize_t n;
9940
9941    assert(PyType_IsSubtype(type, &PyUnicode_Type));
9942    tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9943    if (tmp == NULL)
9944        return NULL;
9945    assert(PyUnicode_Check(tmp));
9946    pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9947    if (pnew == NULL) {
9948        Py_DECREF(tmp);
9949        return NULL;
9950    }
9951    pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9952    if (pnew->str == NULL) {
9953        _Py_ForgetReference((PyObject *)pnew);
9954        PyObject_Del(pnew);
9955        Py_DECREF(tmp);
9956        return PyErr_NoMemory();
9957    }
9958    Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9959    pnew->length = n;
9960    pnew->hash = tmp->hash;
9961    Py_DECREF(tmp);
9962    return (PyObject *)pnew;
9963}
9964
9965PyDoc_STRVAR(unicode_doc,
9966             "str(string[, encoding[, errors]]) -> str\n\
9967\n\
9968Create a new string object from the given encoded string.\n\
9969encoding defaults to the current default string encoding.\n\
9970errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
9971
9972static PyObject *unicode_iter(PyObject *seq);
9973
9974PyTypeObject PyUnicode_Type = {
9975    PyVarObject_HEAD_INIT(&PyType_Type, 0)
9976    "str",              /* tp_name */
9977    sizeof(PyUnicodeObject),        /* tp_size */
9978    0,                  /* tp_itemsize */
9979    /* Slots */
9980    (destructor)unicode_dealloc,    /* tp_dealloc */
9981    0,                  /* tp_print */
9982    0,                  /* tp_getattr */
9983    0,                  /* tp_setattr */
9984    0,                  /* tp_reserved */
9985    unicode_repr,           /* tp_repr */
9986    &unicode_as_number,         /* tp_as_number */
9987    &unicode_as_sequence,       /* tp_as_sequence */
9988    &unicode_as_mapping,        /* tp_as_mapping */
9989    (hashfunc) unicode_hash,        /* tp_hash*/
9990    0,                  /* tp_call*/
9991    (reprfunc) unicode_str,     /* tp_str */
9992    PyObject_GenericGetAttr,        /* tp_getattro */
9993    0,                  /* tp_setattro */
9994    0,                  /* tp_as_buffer */
9995    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
9996    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
9997    unicode_doc,            /* tp_doc */
9998    0,                  /* tp_traverse */
9999    0,                  /* tp_clear */
10000    PyUnicode_RichCompare,      /* tp_richcompare */
10001    0,                  /* tp_weaklistoffset */
10002    unicode_iter,           /* tp_iter */
10003    0,                  /* tp_iternext */
10004    unicode_methods,            /* tp_methods */
10005    0,                  /* tp_members */
10006    0,                  /* tp_getset */
10007    &PyBaseObject_Type,         /* tp_base */
10008    0,                  /* tp_dict */
10009    0,                  /* tp_descr_get */
10010    0,                  /* tp_descr_set */
10011    0,                  /* tp_dictoffset */
10012    0,                  /* tp_init */
10013    0,                  /* tp_alloc */
10014    unicode_new,            /* tp_new */
10015    PyObject_Del,           /* tp_free */
10016};
10017
10018/* Initialize the Unicode implementation */
10019
10020void _PyUnicode_Init(void)
10021{
10022    int i;
10023
10024    /* XXX - move this array to unicodectype.c ? */
10025    Py_UNICODE linebreak[] = {
10026        0x000A, /* LINE FEED */
10027        0x000D, /* CARRIAGE RETURN */
10028        0x001C, /* FILE SEPARATOR */
10029        0x001D, /* GROUP SEPARATOR */
10030        0x001E, /* RECORD SEPARATOR */
10031        0x0085, /* NEXT LINE */
10032        0x2028, /* LINE SEPARATOR */
10033        0x2029, /* PARAGRAPH SEPARATOR */
10034    };
10035
10036    /* Init the implementation */
10037    free_list = NULL;
10038    numfree = 0;
10039    unicode_empty = _PyUnicode_New(0);
10040    if (!unicode_empty)
10041        return;
10042
10043    for (i = 0; i < 256; i++)
10044        unicode_latin1[i] = NULL;
10045    if (PyType_Ready(&PyUnicode_Type) < 0)
10046        Py_FatalError("Can't initialize 'unicode'");
10047
10048    /* initialize the linebreak bloom filter */
10049    bloom_linebreak = make_bloom_mask(
10050        linebreak, sizeof(linebreak) / sizeof(linebreak[0])
10051        );
10052
10053    PyType_Ready(&EncodingMapType);
10054}
10055
10056/* Finalize the Unicode implementation */
10057
10058int
10059PyUnicode_ClearFreeList(void)
10060{
10061    int freelist_size = numfree;
10062    PyUnicodeObject *u;
10063
10064    for (u = free_list; u != NULL;) {
10065        PyUnicodeObject *v = u;
10066        u = *(PyUnicodeObject **)u;
10067        if (v->str)
10068            PyObject_DEL(v->str);
10069        Py_XDECREF(v->defenc);
10070        PyObject_Del(v);
10071        numfree--;
10072    }
10073    free_list = NULL;
10074    assert(numfree == 0);
10075    return freelist_size;
10076}
10077
10078void
10079_PyUnicode_Fini(void)
10080{
10081    int i;
10082
10083    Py_XDECREF(unicode_empty);
10084    unicode_empty = NULL;
10085
10086    for (i = 0; i < 256; i++) {
10087        if (unicode_latin1[i]) {
10088            Py_DECREF(unicode_latin1[i]);
10089            unicode_latin1[i] = NULL;
10090        }
10091    }
10092    (void)PyUnicode_ClearFreeList();
10093}
10094
10095void
10096PyUnicode_InternInPlace(PyObject **p)
10097{
10098    register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
10099    PyObject *t;
10100    if (s == NULL || !PyUnicode_Check(s))
10101        Py_FatalError(
10102            "PyUnicode_InternInPlace: unicode strings only please!");
10103    /* If it's a subclass, we don't really know what putting
10104       it in the interned dict might do. */
10105    if (!PyUnicode_CheckExact(s))
10106        return;
10107    if (PyUnicode_CHECK_INTERNED(s))
10108        return;
10109    if (interned == NULL) {
10110        interned = PyDict_New();
10111        if (interned == NULL) {
10112            PyErr_Clear(); /* Don't leave an exception */
10113            return;
10114        }
10115    }
10116    /* It might be that the GetItem call fails even
10117       though the key is present in the dictionary,
10118       namely when this happens during a stack overflow. */
10119    Py_ALLOW_RECURSION
10120        t = PyDict_GetItem(interned, (PyObject *)s);
10121    Py_END_ALLOW_RECURSION
10122
10123        if (t) {
10124            Py_INCREF(t);
10125            Py_DECREF(*p);
10126            *p = t;
10127            return;
10128        }
10129
10130    PyThreadState_GET()->recursion_critical = 1;
10131    if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
10132        PyErr_Clear();
10133        PyThreadState_GET()->recursion_critical = 0;
10134        return;
10135    }
10136    PyThreadState_GET()->recursion_critical = 0;
10137    /* The two references in interned are not counted by refcnt.
10138       The deallocator will take care of this */
10139    Py_REFCNT(s) -= 2;
10140    PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
10141}
10142
10143void
10144PyUnicode_InternImmortal(PyObject **p)
10145{
10146    PyUnicode_InternInPlace(p);
10147    if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
10148        PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
10149        Py_INCREF(*p);
10150    }
10151}
10152
10153PyObject *
10154PyUnicode_InternFromString(const char *cp)
10155{
10156    PyObject *s = PyUnicode_FromString(cp);
10157    if (s == NULL)
10158        return NULL;
10159    PyUnicode_InternInPlace(&s);
10160    return s;
10161}
10162
10163void
10164_Py_ReleaseInternedUnicodeStrings(void)
10165{
10166    PyObject *keys;
10167    PyUnicodeObject *s;
10168    Py_ssize_t i, n;
10169    Py_ssize_t immortal_size = 0, mortal_size = 0;
10170
10171    if (interned == NULL || !PyDict_Check(interned))
10172        return;
10173    keys = PyDict_Keys(interned);
10174    if (keys == NULL || !PyList_Check(keys)) {
10175        PyErr_Clear();
10176        return;
10177    }
10178
10179    /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
10180       detector, interned unicode strings are not forcibly deallocated;
10181       rather, we give them their stolen references back, and then clear
10182       and DECREF the interned dict. */
10183
10184    n = PyList_GET_SIZE(keys);
10185    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
10186            n);
10187    for (i = 0; i < n; i++) {
10188        s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
10189        switch (s->state) {
10190        case SSTATE_NOT_INTERNED:
10191            /* XXX Shouldn't happen */
10192            break;
10193        case SSTATE_INTERNED_IMMORTAL:
10194            Py_REFCNT(s) += 1;
10195            immortal_size += s->length;
10196            break;
10197        case SSTATE_INTERNED_MORTAL:
10198            Py_REFCNT(s) += 2;
10199            mortal_size += s->length;
10200            break;
10201        default:
10202            Py_FatalError("Inconsistent interned string state.");
10203        }
10204        s->state = SSTATE_NOT_INTERNED;
10205    }
10206    fprintf(stderr, "total size of all interned strings: "
10207            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
10208            "mortal/immortal\n", mortal_size, immortal_size);
10209    Py_DECREF(keys);
10210    PyDict_Clear(interned);
10211    Py_DECREF(interned);
10212    interned = NULL;
10213}
10214
10215
10216/********************* Unicode Iterator **************************/
10217
10218typedef struct {
10219    PyObject_HEAD
10220    Py_ssize_t it_index;
10221    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
10222} unicodeiterobject;
10223
10224static void
10225unicodeiter_dealloc(unicodeiterobject *it)
10226{
10227    _PyObject_GC_UNTRACK(it);
10228    Py_XDECREF(it->it_seq);
10229    PyObject_GC_Del(it);
10230}
10231
10232static int
10233unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
10234{
10235    Py_VISIT(it->it_seq);
10236    return 0;
10237}
10238
10239static PyObject *
10240unicodeiter_next(unicodeiterobject *it)
10241{
10242    PyUnicodeObject *seq;
10243    PyObject *item;
10244
10245    assert(it != NULL);
10246    seq = it->it_seq;
10247    if (seq == NULL)
10248        return NULL;
10249    assert(PyUnicode_Check(seq));
10250
10251    if (it->it_index < PyUnicode_GET_SIZE(seq)) {
10252        item = PyUnicode_FromUnicode(
10253            PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
10254        if (item != NULL)
10255            ++it->it_index;
10256        return item;
10257    }
10258
10259    Py_DECREF(seq);
10260    it->it_seq = NULL;
10261    return NULL;
10262}
10263
10264static PyObject *
10265unicodeiter_len(unicodeiterobject *it)
10266{
10267    Py_ssize_t len = 0;
10268    if (it->it_seq)
10269        len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
10270    return PyLong_FromSsize_t(len);
10271}
10272
10273PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
10274
10275static PyMethodDef unicodeiter_methods[] = {
10276    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
10277     length_hint_doc},
10278    {NULL,      NULL}       /* sentinel */
10279};
10280
10281PyTypeObject PyUnicodeIter_Type = {
10282    PyVarObject_HEAD_INIT(&PyType_Type, 0)
10283    "str_iterator",         /* tp_name */
10284    sizeof(unicodeiterobject),      /* tp_basicsize */
10285    0,                  /* tp_itemsize */
10286    /* methods */
10287    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
10288    0,                  /* tp_print */
10289    0,                  /* tp_getattr */
10290    0,                  /* tp_setattr */
10291    0,                  /* tp_reserved */
10292    0,                  /* tp_repr */
10293    0,                  /* tp_as_number */
10294    0,                  /* tp_as_sequence */
10295    0,                  /* tp_as_mapping */
10296    0,                  /* tp_hash */
10297    0,                  /* tp_call */
10298    0,                  /* tp_str */
10299    PyObject_GenericGetAttr,        /* tp_getattro */
10300    0,                  /* tp_setattro */
10301    0,                  /* tp_as_buffer */
10302    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
10303    0,                  /* tp_doc */
10304    (traverseproc)unicodeiter_traverse, /* tp_traverse */
10305    0,                  /* tp_clear */
10306    0,                  /* tp_richcompare */
10307    0,                  /* tp_weaklistoffset */
10308    PyObject_SelfIter,          /* tp_iter */
10309    (iternextfunc)unicodeiter_next,     /* tp_iternext */
10310    unicodeiter_methods,            /* tp_methods */
10311    0,
10312};
10313
10314static PyObject *
10315unicode_iter(PyObject *seq)
10316{
10317    unicodeiterobject *it;
10318
10319    if (!PyUnicode_Check(seq)) {
10320        PyErr_BadInternalCall();
10321        return NULL;
10322    }
10323    it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
10324    if (it == NULL)
10325        return NULL;
10326    it->it_index = 0;
10327    Py_INCREF(seq);
10328    it->it_seq = (PyUnicodeObject *)seq;
10329    _PyObject_GC_TRACK(it);
10330    return (PyObject *)it;
10331}
10332
10333size_t
10334Py_UNICODE_strlen(const Py_UNICODE *u)
10335{
10336    int res = 0;
10337    while(*u++)
10338        res++;
10339    return res;
10340}
10341
10342Py_UNICODE*
10343Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
10344{
10345    Py_UNICODE *u = s1;
10346    while ((*u++ = *s2++));
10347    return s1;
10348}
10349
10350Py_UNICODE*
10351Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10352{
10353    Py_UNICODE *u = s1;
10354    while ((*u++ = *s2++))
10355        if (n-- == 0)
10356            break;
10357    return s1;
10358}
10359
10360Py_UNICODE*
10361Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
10362{
10363    Py_UNICODE *u1 = s1;
10364    u1 += Py_UNICODE_strlen(u1);
10365    Py_UNICODE_strcpy(u1, s2);
10366    return s1;
10367}
10368
10369int
10370Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
10371{
10372    while (*s1 && *s2 && *s1 == *s2)
10373        s1++, s2++;
10374    if (*s1 && *s2)
10375        return (*s1 < *s2) ? -1 : +1;
10376    if (*s1)
10377        return 1;
10378    if (*s2)
10379        return -1;
10380    return 0;
10381}
10382
10383int
10384Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10385{
10386    register Py_UNICODE u1, u2;
10387    for (; n != 0; n--) {
10388        u1 = *s1;
10389        u2 = *s2;
10390        if (u1 != u2)
10391            return (u1 < u2) ? -1 : +1;
10392        if (u1 == '\0')
10393            return 0;
10394        s1++;
10395        s2++;
10396    }
10397    return 0;
10398}
10399
10400Py_UNICODE*
10401Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
10402{
10403    const Py_UNICODE *p;
10404    for (p = s; *p; p++)
10405        if (*p == c)
10406            return (Py_UNICODE*)p;
10407    return NULL;
10408}
10409
10410Py_UNICODE*
10411Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
10412{
10413    const Py_UNICODE *p;
10414    p = s + Py_UNICODE_strlen(s);
10415    while (p != s) {
10416        p--;
10417        if (*p == c)
10418            return (Py_UNICODE*)p;
10419    }
10420    return NULL;
10421}
10422
10423Py_UNICODE*
10424PyUnicode_AsUnicodeCopy(PyObject *object)
10425{
10426    PyUnicodeObject *unicode = (PyUnicodeObject *)object;
10427    Py_UNICODE *copy;
10428    Py_ssize_t size;
10429
10430    /* Ensure we won't overflow the size. */
10431    if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
10432        PyErr_NoMemory();
10433        return NULL;
10434    }
10435    size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
10436    size *= sizeof(Py_UNICODE);
10437    copy = PyMem_Malloc(size);
10438    if (copy == NULL) {
10439        PyErr_NoMemory();
10440        return NULL;
10441    }
10442    memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
10443    return copy;
10444}
10445
10446/* A _string module, to export formatter_parser and formatter_field_name_split
10447   to the string.Formatter class implemented in Python. */
10448
10449static PyMethodDef _string_methods[] = {
10450    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
10451     METH_O, PyDoc_STR("split the argument as a field name")},
10452    {"formatter_parser", (PyCFunction) formatter_parser,
10453     METH_O, PyDoc_STR("parse the argument as a format string")},
10454    {NULL, NULL}
10455};
10456
10457static struct PyModuleDef _string_module = {
10458    PyModuleDef_HEAD_INIT,
10459    "_string",
10460    PyDoc_STR("string helper module"),
10461    0,
10462    _string_methods,
10463    NULL,
10464    NULL,
10465    NULL,
10466    NULL
10467};
10468
10469PyMODINIT_FUNC
10470PyInit__string(void)
10471{
10472    return PyModule_Create(&_string_module);
10473}
10474
10475
10476#ifdef __cplusplus
10477}
10478#endif
10479