unicodeobject.c revision 168e117e0a8825bc3ae0c08f0b08a33ac351a14f
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5Unicode Integration Proposal (see file Misc/unicode.txt).
6
7Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
10Copyright (c) Corporation for National Research Initiatives.
11
12--------------------------------------------------------------------
13The original string type implementation is:
14
15  Copyright (c) 1999 by Secret Labs AB
16  Copyright (c) 1999 by Fredrik Lundh
17
18By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
41
42#define PY_SSIZE_T_CLEAN
43#include "Python.h"
44#include "bytes_methods.h"
45
46#include "unicodeobject.h"
47#include "ucnhash.h"
48
49#ifdef MS_WINDOWS
50#include <windows.h>
51#endif
52
53/* Limit for the Unicode object free list */
54
55#define PyUnicode_MAXFREELIST       1024
56
57/* Limit for the Unicode object free list stay alive optimization.
58
59   The implementation will keep allocated Unicode memory intact for
60   all objects on the free list having a size less than this
61   limit. This reduces malloc() overhead for small Unicode objects.
62
   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT * sizeof(Py_UNICODE) +
   malloc()-overhead) bytes of unused garbage.
66
67   Setting the limit to 0 effectively turns the feature off.
68
69   Note: This is an experimental feature ! If you get core dumps when
70   using Unicode objects, turn this feature off.
71
72*/
73
74#define KEEPALIVE_SIZE_LIMIT       9
75
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
84/* --- Globals ------------------------------------------------------------
85
86   The globals are initialized by the _PyUnicode_Init() API and should
87   not be used before calling that API.
88
89*/
90
91
92#ifdef __cplusplus
93extern "C" {
94#endif
95
96/* This dictionary holds all interned unicode strings.  Note that references
97   to strings in this dictionary are *not* counted in the string's ob_refcnt.
98   When the interned string reaches a refcnt of 0 the string deallocation
99   function will delete the reference from this dictionary.
100
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
103*/
104static PyObject *interned;
105
106/* Free list for Unicode objects */
107static PyUnicodeObject *free_list;
108static int numfree;
109
110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114   shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
/* Fast detection of the most frequent whitespace characters: a lookup
   table indexed by ASCII code point (0x00..0x7F), where a 1 marks a
   whitespace character. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: CHARACTER TABULATION (0x09), LINE FEED (0x0A),
       LINE TABULATION (0x0B), FORM FEED (0x0C), CARRIAGE RETURN (0x0D) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: FILE SEPARATOR (0x1C), GROUP SEPARATOR (0x1D),
       RECORD SEPARATOR (0x1E), UNIT SEPARATOR (0x1F) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: SPACE (0x20) */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x7F: no whitespace characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
147
148static PyObject *unicode_encode_call_errorhandler(const char *errors,
149       PyObject **errorHandler,const char *encoding, const char *reason,
150       const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
151       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
152
153static void raise_encode_exception(PyObject **exceptionObject,
154                                   const char *encoding,
155                                   const Py_UNICODE *unicode, Py_ssize_t size,
156                                   Py_ssize_t startpos, Py_ssize_t endpos,
157                                   const char *reason);
158
/* Fast detection of ASCII line break characters: a read-only lookup
   table indexed by ASCII code point (0x00..0x7F), where a 1 marks a
   line break.  The table is only ever read (see BLOOM_LINEBREAK), so
   it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: LINE FEED (0x0A), LINE TABULATION (0x0B),
       FORM FEED (0x0C), CARRIAGE RETURN (0x0D) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: FILE SEPARATOR (0x1C), GROUP SEPARATOR (0x1D),
       RECORD SEPARATOR (0x1E) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x7F: no line break characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
186
187
188Py_UNICODE
189PyUnicode_GetMax(void)
190{
191#ifdef Py_UNICODE_WIDE
192    return 0x10FFFF;
193#else
194    /* This is actually an illegal character, so it should
195       not be passed to unichr. */
196    return 0xFFFF;
197#endif
198}
199
200/* --- Bloom Filters ----------------------------------------------------- */
201
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask and take the low
   bits of each Unicode character (ch & (BLOOM_WIDTH - 1)) as the
   bit index. */
205
206/* the linebreak mask is set up by Unicode_Init below */
207
208#if LONG_BIT >= 128
209#define BLOOM_WIDTH 128
210#elif LONG_BIT >= 64
211#define BLOOM_WIDTH 64
212#elif LONG_BIT >= 32
213#define BLOOM_WIDTH 32
214#else
215#error "LONG_BIT is smaller than 32"
216#endif
217
218#define BLOOM_MASK unsigned long
219
220static BLOOM_MASK bloom_linebreak;
221
222#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
223#define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
224
225#define BLOOM_LINEBREAK(ch)                                             \
226    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
227     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
228
229Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
230{
231    /* calculate simple bloom-style bitmask for a given unicode string */
232
233    BLOOM_MASK mask;
234    Py_ssize_t i;
235
236    mask = 0;
237    for (i = 0; i < len; i++)
238        BLOOM_ADD(mask, ptr[i]);
239
240    return mask;
241}
242
243Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
244{
245    Py_ssize_t i;
246
247    for (i = 0; i < setlen; i++)
248        if (set[i] == chr)
249            return 1;
250
251    return 0;
252}
253
/* Two-stage membership test: the cheap bloom check filters out most
   non-members before the exact linear lookup is performed.
   The expansion is wrapped in parentheses so the macro behaves as a
   single expression, e.g. under `!` or next to `||`. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
256
257/* --- Unicode Object ----------------------------------------------------- */
258
259static
260int unicode_resize(register PyUnicodeObject *unicode,
261                   Py_ssize_t length)
262{
263    void *oldstr;
264
265    /* Shortcut if there's nothing much to do. */
266    if (unicode->length == length)
267        goto reset;
268
269    /* Resizing shared object (unicode_empty or single character
270       objects) in-place is not allowed. Use PyUnicode_Resize()
271       instead ! */
272
273    if (unicode == unicode_empty ||
274        (unicode->length == 1 &&
275         unicode->str[0] < 256U &&
276         unicode_latin1[unicode->str[0]] == unicode)) {
277        PyErr_SetString(PyExc_SystemError,
278                        "can't resize shared str objects");
279        return -1;
280    }
281
282    /* We allocate one more byte to make sure the string is Ux0000 terminated.
283       The overallocation is also used by fastsearch, which assumes that it's
284       safe to look at str[length] (without making any assumptions about what
285       it contains). */
286
287    oldstr = unicode->str;
288    unicode->str = PyObject_REALLOC(unicode->str,
289                                    sizeof(Py_UNICODE) * (length + 1));
290    if (!unicode->str) {
291        unicode->str = (Py_UNICODE *)oldstr;
292        PyErr_NoMemory();
293        return -1;
294    }
295    unicode->str[length] = 0;
296    unicode->length = length;
297
298  reset:
299    /* Reset the object caches */
300    if (unicode->defenc) {
301        Py_CLEAR(unicode->defenc);
302    }
303    unicode->hash = -1;
304
305    return 0;
306}
307
/* We allocate room for one more Py_UNICODE element (not merely one
   byte) to make sure the string is U+0000 terminated; some code
   (e.g. new_identifier) relies on that.
311
312   XXX This allocator could further be enhanced by assuring that the
313   free list never reduces its size below 1.
314
315*/
316
317static
318PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
319{
320    register PyUnicodeObject *unicode;
321
322    /* Optimization for empty strings */
323    if (length == 0 && unicode_empty != NULL) {
324        Py_INCREF(unicode_empty);
325        return unicode_empty;
326    }
327
328    /* Ensure we won't overflow the size. */
329    if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
330        return (PyUnicodeObject *)PyErr_NoMemory();
331    }
332
333    /* Unicode freelist & memory allocation */
334    if (free_list) {
335        unicode = free_list;
336        free_list = *(PyUnicodeObject **)unicode;
337        numfree--;
338        if (unicode->str) {
339            /* Keep-Alive optimization: we only upsize the buffer,
340               never downsize it. */
341            if ((unicode->length < length) &&
342                unicode_resize(unicode, length) < 0) {
343                PyObject_DEL(unicode->str);
344                unicode->str = NULL;
345            }
346        }
347        else {
348            size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
349            unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
350        }
351        PyObject_INIT(unicode, &PyUnicode_Type);
352    }
353    else {
354        size_t new_size;
355        unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
356        if (unicode == NULL)
357            return NULL;
358        new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
359        unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
360    }
361
362    if (!unicode->str) {
363        PyErr_NoMemory();
364        goto onError;
365    }
366    /* Initialize the first element to guard against cases where
367     * the caller fails before initializing str -- unicode_resize()
368     * reads str[0], and the Keep-Alive optimization can keep memory
369     * allocated for str alive across a call to unicode_dealloc(unicode).
370     * We don't want unicode_resize to read uninitialized memory in
371     * that case.
372     */
373    unicode->str[0] = 0;
374    unicode->str[length] = 0;
375    unicode->length = length;
376    unicode->hash = -1;
377    unicode->state = 0;
378    unicode->defenc = NULL;
379    return unicode;
380
381  onError:
382    /* XXX UNREF/NEWREF interface should be more symmetrical */
383    _Py_DEC_REFTOTAL;
384    _Py_ForgetReference((PyObject *)unicode);
385    PyObject_Del(unicode);
386    return NULL;
387}
388
389static
390void unicode_dealloc(register PyUnicodeObject *unicode)
391{
392    switch (PyUnicode_CHECK_INTERNED(unicode)) {
393    case SSTATE_NOT_INTERNED:
394        break;
395
396    case SSTATE_INTERNED_MORTAL:
397        /* revive dead object temporarily for DelItem */
398        Py_REFCNT(unicode) = 3;
399        if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
400            Py_FatalError(
401                "deletion of interned string failed");
402        break;
403
404    case SSTATE_INTERNED_IMMORTAL:
405        Py_FatalError("Immortal interned string died.");
406
407    default:
408        Py_FatalError("Inconsistent interned string state.");
409    }
410
411    if (PyUnicode_CheckExact(unicode) &&
412        numfree < PyUnicode_MAXFREELIST) {
413        /* Keep-Alive optimization */
414        if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
415            PyObject_DEL(unicode->str);
416            unicode->str = NULL;
417            unicode->length = 0;
418        }
419        if (unicode->defenc) {
420            Py_CLEAR(unicode->defenc);
421        }
422        /* Add to free list */
423        *(PyUnicodeObject **)unicode = free_list;
424        free_list = unicode;
425        numfree++;
426    }
427    else {
428        PyObject_DEL(unicode->str);
429        Py_XDECREF(unicode->defenc);
430        Py_TYPE(unicode)->tp_free((PyObject *)unicode);
431    }
432}
433
434static
435int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
436{
437    register PyUnicodeObject *v;
438
439    /* Argument checks */
440    if (unicode == NULL) {
441        PyErr_BadInternalCall();
442        return -1;
443    }
444    v = *unicode;
445    if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
446        PyErr_BadInternalCall();
447        return -1;
448    }
449
450    /* Resizing unicode_empty and single character objects is not
451       possible since these are being shared. We simply return a fresh
452       copy with the same Unicode content. */
453    if (v->length != length &&
454        (v == unicode_empty || v->length == 1)) {
455        PyUnicodeObject *w = _PyUnicode_New(length);
456        if (w == NULL)
457            return -1;
458        Py_UNICODE_COPY(w->str, v->str,
459                        length < v->length ? length : v->length);
460        Py_DECREF(*unicode);
461        *unicode = w;
462        return 0;
463    }
464
465    /* Note that we don't have to modify *unicode for unshared Unicode
466       objects, since we can modify them in-place. */
467    return unicode_resize(v, length);
468}
469
470int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
471{
472    return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
473}
474
475PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
476                                Py_ssize_t size)
477{
478    PyUnicodeObject *unicode;
479
480    /* If the Unicode data is known at construction time, we can apply
481       some optimizations which share commonly used objects. */
482    if (u != NULL) {
483
484        /* Optimization for empty strings */
485        if (size == 0 && unicode_empty != NULL) {
486            Py_INCREF(unicode_empty);
487            return (PyObject *)unicode_empty;
488        }
489
490        /* Single character Unicode objects in the Latin-1 range are
491           shared when using this constructor */
492        if (size == 1 && *u < 256) {
493            unicode = unicode_latin1[*u];
494            if (!unicode) {
495                unicode = _PyUnicode_New(1);
496                if (!unicode)
497                    return NULL;
498                unicode->str[0] = *u;
499                unicode_latin1[*u] = unicode;
500            }
501            Py_INCREF(unicode);
502            return (PyObject *)unicode;
503        }
504    }
505
506    unicode = _PyUnicode_New(size);
507    if (!unicode)
508        return NULL;
509
510    /* Copy the Unicode data into the new object */
511    if (u != NULL)
512        Py_UNICODE_COPY(unicode->str, u, size);
513
514    return (PyObject *)unicode;
515}
516
517PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
518{
519    PyUnicodeObject *unicode;
520
521    if (size < 0) {
522        PyErr_SetString(PyExc_SystemError,
523                        "Negative size passed to PyUnicode_FromStringAndSize");
524        return NULL;
525    }
526
527    /* If the Unicode data is known at construction time, we can apply
528       some optimizations which share commonly used objects.
529       Also, this means the input must be UTF-8, so fall back to the
530       UTF-8 decoder at the end. */
531    if (u != NULL) {
532
533        /* Optimization for empty strings */
534        if (size == 0 && unicode_empty != NULL) {
535            Py_INCREF(unicode_empty);
536            return (PyObject *)unicode_empty;
537        }
538
539        /* Single characters are shared when using this constructor.
540           Restrict to ASCII, since the input must be UTF-8. */
541        if (size == 1 && Py_CHARMASK(*u) < 128) {
542            unicode = unicode_latin1[Py_CHARMASK(*u)];
543            if (!unicode) {
544                unicode = _PyUnicode_New(1);
545                if (!unicode)
546                    return NULL;
547                unicode->str[0] = Py_CHARMASK(*u);
548                unicode_latin1[Py_CHARMASK(*u)] = unicode;
549            }
550            Py_INCREF(unicode);
551            return (PyObject *)unicode;
552        }
553
554        return PyUnicode_DecodeUTF8(u, size, NULL);
555    }
556
557    unicode = _PyUnicode_New(size);
558    if (!unicode)
559        return NULL;
560
561    return (PyObject *)unicode;
562}
563
564PyObject *PyUnicode_FromString(const char *u)
565{
566    size_t size = strlen(u);
567    if (size > PY_SSIZE_T_MAX) {
568        PyErr_SetString(PyExc_OverflowError, "input too long");
569        return NULL;
570    }
571
572    return PyUnicode_FromStringAndSize(u, size);
573}
574
575#ifdef HAVE_WCHAR_H
576
577#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
578# define CONVERT_WCHAR_TO_SURROGATES
579#endif
580
581#ifdef CONVERT_WCHAR_TO_SURROGATES
582
583/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
584   to convert from UTF32 to UTF16. */
585
586PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
587                                 Py_ssize_t size)
588{
589    PyUnicodeObject *unicode;
590    register Py_ssize_t i;
591    Py_ssize_t alloc;
592    const wchar_t *orig_w;
593
594    if (w == NULL) {
595        if (size == 0)
596            return PyUnicode_FromStringAndSize(NULL, 0);
597        PyErr_BadInternalCall();
598        return NULL;
599    }
600
601    if (size == -1) {
602        size = wcslen(w);
603    }
604
605    alloc = size;
606    orig_w = w;
607    for (i = size; i > 0; i--) {
608        if (*w > 0xFFFF)
609            alloc++;
610        w++;
611    }
612    w = orig_w;
613    unicode = _PyUnicode_New(alloc);
614    if (!unicode)
615        return NULL;
616
617    /* Copy the wchar_t data into the new object */
618    {
619        register Py_UNICODE *u;
620        u = PyUnicode_AS_UNICODE(unicode);
621        for (i = size; i > 0; i--) {
622            if (*w > 0xFFFF) {
623                wchar_t ordinal = *w++;
624                ordinal -= 0x10000;
625                *u++ = 0xD800 | (ordinal >> 10);
626                *u++ = 0xDC00 | (ordinal & 0x3FF);
627            }
628            else
629                *u++ = *w++;
630        }
631    }
632    return (PyObject *)unicode;
633}
634
635#else
636
637PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
638                                 Py_ssize_t size)
639{
640    PyUnicodeObject *unicode;
641
642    if (w == NULL) {
643        if (size == 0)
644            return PyUnicode_FromStringAndSize(NULL, 0);
645        PyErr_BadInternalCall();
646        return NULL;
647    }
648
649    if (size == -1) {
650        size = wcslen(w);
651    }
652
653    unicode = _PyUnicode_New(size);
654    if (!unicode)
655        return NULL;
656
657    /* Copy the wchar_t data into the new object */
658#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
659    memcpy(unicode->str, w, size * sizeof(wchar_t));
660#else
661    {
662        register Py_UNICODE *u;
663        register Py_ssize_t i;
664        u = PyUnicode_AS_UNICODE(unicode);
665        for (i = size; i > 0; i--)
666            *u++ = *w++;
667    }
668#endif
669
670    return (PyObject *)unicode;
671}
672
673#endif /* CONVERT_WCHAR_TO_SURROGATES */
674
675#undef CONVERT_WCHAR_TO_SURROGATES
676
677static void
678makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
679        int zeropad, int width, int precision, char c)
680{
681    *fmt++ = '%';
682    if (width) {
683        if (zeropad)
684            *fmt++ = '0';
685        fmt += sprintf(fmt, "%d", width);
686    }
687    if (precision)
688        fmt += sprintf(fmt, ".%d", precision);
689    if (longflag)
690        *fmt++ = 'l';
691    else if (longlongflag) {
692        /* longlongflag should only ever be nonzero on machines with
693           HAVE_LONG_LONG defined */
694#ifdef HAVE_LONG_LONG
695        char *f = PY_FORMAT_LONG_LONG;
696        while (*f)
697            *fmt++ = *f++;
698#else
699        /* we shouldn't ever get here */
700        assert(0);
701        *fmt++ = 'l';
702#endif
703    }
704    else if (size_tflag) {
705        char *f = PY_FORMAT_SIZE_T;
706        while (*f)
707            *fmt++ = *f++;
708    }
709    *fmt++ = c;
710    *fmt = '\0';
711}
712
/* Append the NUL-terminated char string `string` to the output.  Relies
   on two variables in the expanding scope: `copy` (scratch read cursor)
   and `s` (output write cursor, advanced past the appended text). */
#define appendstring(string) {copy = string; while (*copy) *s++ = *copy++;}
714
715/* size of fixed-size buffer for formatting single arguments */
716#define ITEM_BUFFER_LEN 21
717/* maximum number of characters required for output of %ld.  21 characters
718   allows for 64-bit integers (in decimal) and an optional sign. */
719#define MAX_LONG_CHARS 21
720/* maximum number of characters required for output of %lld.
721   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
722   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
723#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
724
725PyObject *
726PyUnicode_FromFormatV(const char *format, va_list vargs)
727{
728    va_list count;
729    Py_ssize_t callcount = 0;
730    PyObject **callresults = NULL;
731    PyObject **callresult = NULL;
732    Py_ssize_t n = 0;
733    int width = 0;
734    int precision = 0;
735    int zeropad;
736    const char* f;
737    Py_UNICODE *s;
738    PyObject *string;
739    /* used by sprintf */
740    char buffer[ITEM_BUFFER_LEN+1];
741    /* use abuffer instead of buffer, if we need more space
742     * (which can happen if there's a format specifier with width). */
743    char *abuffer = NULL;
744    char *realbuffer;
745    Py_ssize_t abuffersize = 0;
746    char fmt[61]; /* should be enough for %0width.precisionlld */
747    const char *copy;
748
749    Py_VA_COPY(count, vargs);
750    /* step 1: count the number of %S/%R/%A/%s format specifications
751     * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
752     * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
753     * result in an array) */
754    for (f = format; *f; f++) {
755         if (*f == '%') {
756             if (*(f+1)=='%')
757                 continue;
758             if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
759                 ++callcount;
760             while (ISDIGIT((unsigned)*f))
761                 width = (width*10) + *f++ - '0';
762             while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
763                 ;
764             if (*f == 's')
765                 ++callcount;
766         }
767         else if (128 <= (unsigned char)*f) {
768             PyErr_Format(PyExc_ValueError,
769                "PyUnicode_FromFormatV() expects an ASCII-encoded format "
770                "string, got a non-ASCII byte: 0x%02x",
771                (unsigned char)*f);
772             return NULL;
773         }
774    }
775    /* step 2: allocate memory for the results of
776     * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
777    if (callcount) {
778        callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
779        if (!callresults) {
780            PyErr_NoMemory();
781            return NULL;
782        }
783        callresult = callresults;
784    }
785    /* step 3: figure out how large a buffer we need */
786    for (f = format; *f; f++) {
787        if (*f == '%') {
788#ifdef HAVE_LONG_LONG
789            int longlongflag = 0;
790#endif
791            const char* p = f;
792            width = 0;
793            while (ISDIGIT((unsigned)*f))
794                width = (width*10) + *f++ - '0';
795            while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
796                ;
797
798            /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
799             * they don't affect the amount of space we reserve.
800             */
801            if (*f == 'l') {
802                if (f[1] == 'd' || f[1] == 'u') {
803                    ++f;
804                }
805#ifdef HAVE_LONG_LONG
806                else if (f[1] == 'l' &&
807                         (f[2] == 'd' || f[2] == 'u')) {
808                    longlongflag = 1;
809                    f += 2;
810                }
811#endif
812            }
813            else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
814                ++f;
815            }
816
817            switch (*f) {
818            case 'c':
819                (void)va_arg(count, int);
820                /* fall through... */
821            case '%':
822                n++;
823                break;
824            case 'd': case 'u': case 'i': case 'x':
825                (void) va_arg(count, int);
826#ifdef HAVE_LONG_LONG
827                if (longlongflag) {
828                    if (width < MAX_LONG_LONG_CHARS)
829                        width = MAX_LONG_LONG_CHARS;
830                }
831                else
832#endif
833                    /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
834                       including sign.  Decimal takes the most space.  This
835                       isn't enough for octal.  If a width is specified we
836                       need more (which we allocate later). */
837                    if (width < MAX_LONG_CHARS)
838                        width = MAX_LONG_CHARS;
839                n += width;
840                /* XXX should allow for large precision here too. */
841                if (abuffersize < width)
842                    abuffersize = width;
843                break;
844            case 's':
845            {
846                /* UTF-8 */
847                const char *s = va_arg(count, const char*);
848                PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
849                if (!str)
850                    goto fail;
851                n += PyUnicode_GET_SIZE(str);
852                /* Remember the str and switch to the next slot */
853                *callresult++ = str;
854                break;
855            }
856            case 'U':
857            {
858                PyObject *obj = va_arg(count, PyObject *);
859                assert(obj && PyUnicode_Check(obj));
860                n += PyUnicode_GET_SIZE(obj);
861                break;
862            }
863            case 'V':
864            {
865                PyObject *obj = va_arg(count, PyObject *);
866                const char *str = va_arg(count, const char *);
867                assert(obj || str);
868                assert(!obj || PyUnicode_Check(obj));
869                if (obj)
870                    n += PyUnicode_GET_SIZE(obj);
871                else
872                    n += strlen(str);
873                break;
874            }
875            case 'S':
876            {
877                PyObject *obj = va_arg(count, PyObject *);
878                PyObject *str;
879                assert(obj);
880                str = PyObject_Str(obj);
881                if (!str)
882                    goto fail;
883                n += PyUnicode_GET_SIZE(str);
884                /* Remember the str and switch to the next slot */
885                *callresult++ = str;
886                break;
887            }
888            case 'R':
889            {
890                PyObject *obj = va_arg(count, PyObject *);
891                PyObject *repr;
892                assert(obj);
893                repr = PyObject_Repr(obj);
894                if (!repr)
895                    goto fail;
896                n += PyUnicode_GET_SIZE(repr);
897                /* Remember the repr and switch to the next slot */
898                *callresult++ = repr;
899                break;
900            }
901            case 'A':
902            {
903                PyObject *obj = va_arg(count, PyObject *);
904                PyObject *ascii;
905                assert(obj);
906                ascii = PyObject_ASCII(obj);
907                if (!ascii)
908                    goto fail;
909                n += PyUnicode_GET_SIZE(ascii);
910                /* Remember the repr and switch to the next slot */
911                *callresult++ = ascii;
912                break;
913            }
914            case 'p':
915                (void) va_arg(count, int);
916                /* maximum 64-bit pointer representation:
917                 * 0xffffffffffffffff
918                 * so 19 characters is enough.
919                 * XXX I count 18 -- what's the extra for?
920                 */
921                n += 19;
922                break;
923            default:
924                /* if we stumble upon an unknown
925                   formatting code, copy the rest of
926                   the format string to the output
927                   string. (we cannot just skip the
928                   code, since there's no way to know
929                   what's in the argument list) */
930                n += strlen(p);
931                goto expand;
932            }
933        } else
934            n++;
935    }
936  expand:
937    if (abuffersize > ITEM_BUFFER_LEN) {
938        /* add 1 for sprintf's trailing null byte */
939        abuffer = PyObject_Malloc(abuffersize + 1);
940        if (!abuffer) {
941            PyErr_NoMemory();
942            goto fail;
943        }
944        realbuffer = abuffer;
945    }
946    else
947        realbuffer = buffer;
948    /* step 4: fill the buffer */
949    /* Since we've analyzed how much space we need for the worst case,
950       we don't have to resize the string.
951       There can be no errors beyond this point. */
952    string = PyUnicode_FromUnicode(NULL, n);
953    if (!string)
954        goto fail;
955
956    s = PyUnicode_AS_UNICODE(string);
957    callresult = callresults;
958
959    for (f = format; *f; f++) {
960        if (*f == '%') {
961            const char* p = f++;
962            int longflag = 0;
963            int longlongflag = 0;
964            int size_tflag = 0;
965            zeropad = (*f == '0');
966            /* parse the width.precision part */
967            width = 0;
968            while (ISDIGIT((unsigned)*f))
969                width = (width*10) + *f++ - '0';
970            precision = 0;
971            if (*f == '.') {
972                f++;
973                while (ISDIGIT((unsigned)*f))
974                    precision = (precision*10) + *f++ - '0';
975            }
976            /* Handle %ld, %lu, %lld and %llu. */
977            if (*f == 'l') {
978                if (f[1] == 'd' || f[1] == 'u') {
979                    longflag = 1;
980                    ++f;
981                }
982#ifdef HAVE_LONG_LONG
983                else if (f[1] == 'l' &&
984                         (f[2] == 'd' || f[2] == 'u')) {
985                    longlongflag = 1;
986                    f += 2;
987                }
988#endif
989            }
990            /* handle the size_t flag. */
991            if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
992                size_tflag = 1;
993                ++f;
994            }
995
996            switch (*f) {
997            case 'c':
998                *s++ = va_arg(vargs, int);
999                break;
1000            case 'd':
1001                makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1002                        width, precision, 'd');
1003                if (longflag)
1004                    sprintf(realbuffer, fmt, va_arg(vargs, long));
1005#ifdef HAVE_LONG_LONG
1006                else if (longlongflag)
1007                    sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1008#endif
1009                else if (size_tflag)
1010                    sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1011                else
1012                    sprintf(realbuffer, fmt, va_arg(vargs, int));
1013                appendstring(realbuffer);
1014                break;
1015            case 'u':
1016                makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1017                        width, precision, 'u');
1018                if (longflag)
1019                    sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
1020#ifdef HAVE_LONG_LONG
1021                else if (longlongflag)
1022                    sprintf(realbuffer, fmt, va_arg(vargs,
1023                                                    unsigned PY_LONG_LONG));
1024#endif
1025                else if (size_tflag)
1026                    sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1027                else
1028                    sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1029                appendstring(realbuffer);
1030                break;
1031            case 'i':
1032                makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
1033                sprintf(realbuffer, fmt, va_arg(vargs, int));
1034                appendstring(realbuffer);
1035                break;
1036            case 'x':
1037                makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
1038                sprintf(realbuffer, fmt, va_arg(vargs, int));
1039                appendstring(realbuffer);
1040                break;
1041            case 's':
1042            {
1043                /* unused, since we already have the result */
1044                (void) va_arg(vargs, char *);
1045                Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1046                                PyUnicode_GET_SIZE(*callresult));
1047                s += PyUnicode_GET_SIZE(*callresult);
1048                /* We're done with the unicode()/repr() => forget it */
1049                Py_DECREF(*callresult);
1050                /* switch to next unicode()/repr() result */
1051                ++callresult;
1052                break;
1053            }
1054            case 'U':
1055            {
1056                PyObject *obj = va_arg(vargs, PyObject *);
1057                Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1058                Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1059                s += size;
1060                break;
1061            }
1062            case 'V':
1063            {
1064                PyObject *obj = va_arg(vargs, PyObject *);
1065                const char *str = va_arg(vargs, const char *);
1066                if (obj) {
1067                    Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1068                    Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1069                    s += size;
1070                } else {
1071                    appendstring(str);
1072                }
1073                break;
1074            }
1075            case 'S':
1076            case 'R':
1077            {
1078                Py_UNICODE *ucopy;
1079                Py_ssize_t usize;
1080                Py_ssize_t upos;
1081                /* unused, since we already have the result */
1082                (void) va_arg(vargs, PyObject *);
1083                ucopy = PyUnicode_AS_UNICODE(*callresult);
1084                usize = PyUnicode_GET_SIZE(*callresult);
1085                for (upos = 0; upos<usize;)
1086                    *s++ = ucopy[upos++];
1087                /* We're done with the unicode()/repr() => forget it */
1088                Py_DECREF(*callresult);
1089                /* switch to next unicode()/repr() result */
1090                ++callresult;
1091                break;
1092            }
1093            case 'p':
1094                sprintf(buffer, "%p", va_arg(vargs, void*));
1095                /* %p is ill-defined:  ensure leading 0x. */
1096                if (buffer[1] == 'X')
1097                    buffer[1] = 'x';
1098                else if (buffer[1] != 'x') {
1099                    memmove(buffer+2, buffer, strlen(buffer)+1);
1100                    buffer[0] = '0';
1101                    buffer[1] = 'x';
1102                }
1103                appendstring(buffer);
1104                break;
1105            case '%':
1106                *s++ = '%';
1107                break;
1108            default:
1109                appendstring(p);
1110                goto end;
1111            }
1112        }
1113        else
1114            *s++ = *f;
1115    }
1116
1117  end:
1118    if (callresults)
1119        PyObject_Free(callresults);
1120    if (abuffer)
1121        PyObject_Free(abuffer);
1122    PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1123    return string;
1124  fail:
1125    if (callresults) {
1126        PyObject **callresult2 = callresults;
1127        while (callresult2 < callresult) {
1128            Py_DECREF(*callresult2);
1129            ++callresult2;
1130        }
1131        PyObject_Free(callresults);
1132    }
1133    if (abuffer)
1134        PyObject_Free(abuffer);
1135    return NULL;
1136}
1137
1138#undef appendstring
1139
1140PyObject *
1141PyUnicode_FromFormat(const char *format, ...)
1142{
1143    PyObject* ret;
1144    va_list vargs;
1145
1146#ifdef HAVE_STDARG_PROTOTYPES
1147    va_start(vargs, format);
1148#else
1149    va_start(vargs);
1150#endif
1151    ret = PyUnicode_FromFormatV(format, vargs);
1152    va_end(vargs);
1153    return ret;
1154}
1155
1156/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1157   convert a Unicode object to a wide character string.
1158
1159   - If w is NULL: return the number of wide characters (including the nul
1160     character) required to convert the unicode object. Ignore size argument.
1161
1162   - Otherwise: return the number of wide characters (excluding the nul
1163     character) written into w. Write at most size wide characters (including
1164     the nul character). */
1165static Py_ssize_t
1166unicode_aswidechar(PyUnicodeObject *unicode,
1167                   wchar_t *w,
1168                   Py_ssize_t size)
1169{
1170#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
1171    Py_ssize_t res;
1172    if (w != NULL) {
1173        res = PyUnicode_GET_SIZE(unicode);
1174        if (size > res)
1175            size = res + 1;
1176        else
1177            res = size;
1178        memcpy(w, unicode->str, size * sizeof(wchar_t));
1179        return res;
1180    }
1181    else
1182        return PyUnicode_GET_SIZE(unicode) + 1;
1183#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4
1184    register const Py_UNICODE *u;
1185    const Py_UNICODE *uend;
1186    const wchar_t *worig, *wend;
1187    Py_ssize_t nchar;
1188
1189    u = PyUnicode_AS_UNICODE(unicode);
1190    uend = u + PyUnicode_GET_SIZE(unicode);
1191    if (w != NULL) {
1192        worig = w;
1193        wend = w + size;
1194        while (u != uend && w != wend) {
1195            if (0xD800 <= u[0] && u[0] <= 0xDBFF
1196                && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1197            {
1198                *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000;
1199                u += 2;
1200            }
1201            else {
1202                *w = *u;
1203                u++;
1204            }
1205            w++;
1206        }
1207        if (w != wend)
1208            *w = L'\0';
1209        return w - worig;
1210    }
1211    else {
1212        nchar = 1; /* nul character at the end */
1213        while (u != uend) {
1214            if (0xD800 <= u[0] && u[0] <= 0xDBFF
1215                && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1216                u += 2;
1217            else
1218                u++;
1219            nchar++;
1220        }
1221    }
1222    return nchar;
1223#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2
1224    register Py_UNICODE *u, *uend, ordinal;
1225    register Py_ssize_t i;
1226    wchar_t *worig, *wend;
1227    Py_ssize_t nchar;
1228
1229    u = PyUnicode_AS_UNICODE(unicode);
1230    uend = u + PyUnicode_GET_SIZE(u);
1231    if (w != NULL) {
1232        worig = w;
1233        wend = w + size;
1234        while (u != uend && w != wend) {
1235            ordinal = *u;
1236            if (ordinal > 0xffff) {
1237                ordinal -= 0x10000;
1238                *w++ = 0xD800 | (ordinal >> 10);
1239                *w++ = 0xDC00 | (ordinal & 0x3FF);
1240            }
1241            else
1242                *w++ = ordinal;
1243            u++;
1244        }
1245        if (w != wend)
1246            *w = 0;
1247        return w - worig;
1248    }
1249    else {
1250        nchar = 1; /* nul character */
1251        while (u != uend) {
1252            if (*u > 0xffff)
1253                nchar += 2;
1254            else
1255                nchar++;
1256            u++;
1257        }
1258        return nchar;
1259    }
1260#else
1261#  error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670"
1262#endif
1263}
1264
1265Py_ssize_t
1266PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1267                     wchar_t *w,
1268                     Py_ssize_t size)
1269{
1270    if (unicode == NULL) {
1271        PyErr_BadInternalCall();
1272        return -1;
1273    }
1274    return unicode_aswidechar(unicode, w, size);
1275}
1276
1277wchar_t*
1278PyUnicode_AsWideCharString(PyObject *unicode,
1279                           Py_ssize_t *size)
1280{
1281    wchar_t* buffer;
1282    Py_ssize_t buflen;
1283
1284    if (unicode == NULL) {
1285        PyErr_BadInternalCall();
1286        return NULL;
1287    }
1288
1289    buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
1290    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
1291        PyErr_NoMemory();
1292        return NULL;
1293    }
1294
1295    buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1296    if (buffer == NULL) {
1297        PyErr_NoMemory();
1298        return NULL;
1299    }
1300    buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
1301    if (size != NULL)
1302        *size = buflen;
1303    return buffer;
1304}
1305
1306#endif
1307
1308PyObject *PyUnicode_FromOrdinal(int ordinal)
1309{
1310    Py_UNICODE s[2];
1311
1312    if (ordinal < 0 || ordinal > 0x10ffff) {
1313        PyErr_SetString(PyExc_ValueError,
1314                        "chr() arg not in range(0x110000)");
1315        return NULL;
1316    }
1317
1318#ifndef Py_UNICODE_WIDE
1319    if (ordinal > 0xffff) {
1320        ordinal -= 0x10000;
1321        s[0] = 0xD800 | (ordinal >> 10);
1322        s[1] = 0xDC00 | (ordinal & 0x3FF);
1323        return PyUnicode_FromUnicode(s, 2);
1324    }
1325#endif
1326
1327    s[0] = (Py_UNICODE)ordinal;
1328    return PyUnicode_FromUnicode(s, 1);
1329}
1330
1331PyObject *PyUnicode_FromObject(register PyObject *obj)
1332{
1333    /* XXX Perhaps we should make this API an alias of
1334       PyObject_Str() instead ?! */
1335    if (PyUnicode_CheckExact(obj)) {
1336        Py_INCREF(obj);
1337        return obj;
1338    }
1339    if (PyUnicode_Check(obj)) {
1340        /* For a Unicode subtype that's not a Unicode object,
1341           return a true Unicode object with the same data. */
1342        return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1343                                     PyUnicode_GET_SIZE(obj));
1344    }
1345    PyErr_Format(PyExc_TypeError,
1346                 "Can't convert '%.100s' object to str implicitly",
1347                 Py_TYPE(obj)->tp_name);
1348    return NULL;
1349}
1350
1351PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1352                                      const char *encoding,
1353                                      const char *errors)
1354{
1355    Py_buffer buffer;
1356    PyObject *v;
1357
1358    if (obj == NULL) {
1359        PyErr_BadInternalCall();
1360        return NULL;
1361    }
1362
1363    /* Decoding bytes objects is the most common case and should be fast */
1364    if (PyBytes_Check(obj)) {
1365        if (PyBytes_GET_SIZE(obj) == 0) {
1366            Py_INCREF(unicode_empty);
1367            v = (PyObject *) unicode_empty;
1368        }
1369        else {
1370            v = PyUnicode_Decode(
1371                    PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1372                    encoding, errors);
1373        }
1374        return v;
1375    }
1376
1377    if (PyUnicode_Check(obj)) {
1378        PyErr_SetString(PyExc_TypeError,
1379                        "decoding str is not supported");
1380        return NULL;
1381    }
1382
1383    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1384    if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1385        PyErr_Format(PyExc_TypeError,
1386                     "coercing to str: need bytes, bytearray "
1387                     "or buffer-like object, %.80s found",
1388                     Py_TYPE(obj)->tp_name);
1389        return NULL;
1390    }
1391
1392    if (buffer.len == 0) {
1393        Py_INCREF(unicode_empty);
1394        v = (PyObject *) unicode_empty;
1395    }
1396    else
1397        v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
1398
1399    PyBuffer_Release(&buffer);
1400    return v;
1401}
1402
/* Write a normalized copy of encoding into lower: upper case letters are
   lowered and '_' becomes '-', so that e.g. UTF_8 is recognized as utf-8.

   lower has room for lower_len characters including the terminating nul.
   Return 1 on success, or 0 if encoding is longer than lower_len-1
   characters (lower is then left without a terminating nul). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* Last slot of the output; reserved for the terminating nul. */
    char *limit = &lower[lower_len - 1];

    for (src = encoding; *src; src++) {
        if (dst == limit)
            return 0;
        if (ISUPPER(*src))
            *dst = TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
        dst++;
    }
    *dst = '\0';
    return 1;
}
1435
1436PyObject *PyUnicode_Decode(const char *s,
1437                           Py_ssize_t size,
1438                           const char *encoding,
1439                           const char *errors)
1440{
1441    PyObject *buffer = NULL, *unicode;
1442    Py_buffer info;
1443    char lower[11];  /* Enough for any encoding shortcut */
1444
1445    if (encoding == NULL)
1446        encoding = PyUnicode_GetDefaultEncoding();
1447
1448    /* Shortcuts for common default encodings */
1449    if (normalize_encoding(encoding, lower, sizeof(lower))) {
1450        if (strcmp(lower, "utf-8") == 0)
1451            return PyUnicode_DecodeUTF8(s, size, errors);
1452        else if ((strcmp(lower, "latin-1") == 0) ||
1453                 (strcmp(lower, "iso-8859-1") == 0))
1454            return PyUnicode_DecodeLatin1(s, size, errors);
1455#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1456        else if (strcmp(lower, "mbcs") == 0)
1457            return PyUnicode_DecodeMBCS(s, size, errors);
1458#endif
1459        else if (strcmp(lower, "ascii") == 0)
1460            return PyUnicode_DecodeASCII(s, size, errors);
1461        else if (strcmp(lower, "utf-16") == 0)
1462            return PyUnicode_DecodeUTF16(s, size, errors, 0);
1463        else if (strcmp(lower, "utf-32") == 0)
1464            return PyUnicode_DecodeUTF32(s, size, errors, 0);
1465    }
1466
1467    /* Decode via the codec registry */
1468    buffer = NULL;
1469    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
1470        goto onError;
1471    buffer = PyMemoryView_FromBuffer(&info);
1472    if (buffer == NULL)
1473        goto onError;
1474    unicode = PyCodec_Decode(buffer, encoding, errors);
1475    if (unicode == NULL)
1476        goto onError;
1477    if (!PyUnicode_Check(unicode)) {
1478        PyErr_Format(PyExc_TypeError,
1479                     "decoder did not return a str object (type=%.400s)",
1480                     Py_TYPE(unicode)->tp_name);
1481        Py_DECREF(unicode);
1482        goto onError;
1483    }
1484    Py_DECREF(buffer);
1485    return unicode;
1486
1487  onError:
1488    Py_XDECREF(buffer);
1489    return NULL;
1490}
1491
1492PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1493                                    const char *encoding,
1494                                    const char *errors)
1495{
1496    PyObject *v;
1497
1498    if (!PyUnicode_Check(unicode)) {
1499        PyErr_BadArgument();
1500        goto onError;
1501    }
1502
1503    if (encoding == NULL)
1504        encoding = PyUnicode_GetDefaultEncoding();
1505
1506    /* Decode via the codec registry */
1507    v = PyCodec_Decode(unicode, encoding, errors);
1508    if (v == NULL)
1509        goto onError;
1510    return v;
1511
1512  onError:
1513    return NULL;
1514}
1515
1516PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1517                                     const char *encoding,
1518                                     const char *errors)
1519{
1520    PyObject *v;
1521
1522    if (!PyUnicode_Check(unicode)) {
1523        PyErr_BadArgument();
1524        goto onError;
1525    }
1526
1527    if (encoding == NULL)
1528        encoding = PyUnicode_GetDefaultEncoding();
1529
1530    /* Decode via the codec registry */
1531    v = PyCodec_Decode(unicode, encoding, errors);
1532    if (v == NULL)
1533        goto onError;
1534    if (!PyUnicode_Check(v)) {
1535        PyErr_Format(PyExc_TypeError,
1536                     "decoder did not return a str object (type=%.400s)",
1537                     Py_TYPE(v)->tp_name);
1538        Py_DECREF(v);
1539        goto onError;
1540    }
1541    return v;
1542
1543  onError:
1544    return NULL;
1545}
1546
1547PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1548                           Py_ssize_t size,
1549                           const char *encoding,
1550                           const char *errors)
1551{
1552    PyObject *v, *unicode;
1553
1554    unicode = PyUnicode_FromUnicode(s, size);
1555    if (unicode == NULL)
1556        return NULL;
1557    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1558    Py_DECREF(unicode);
1559    return v;
1560}
1561
1562PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1563                                    const char *encoding,
1564                                    const char *errors)
1565{
1566    PyObject *v;
1567
1568    if (!PyUnicode_Check(unicode)) {
1569        PyErr_BadArgument();
1570        goto onError;
1571    }
1572
1573    if (encoding == NULL)
1574        encoding = PyUnicode_GetDefaultEncoding();
1575
1576    /* Encode via the codec registry */
1577    v = PyCodec_Encode(unicode, encoding, errors);
1578    if (v == NULL)
1579        goto onError;
1580    return v;
1581
1582  onError:
1583    return NULL;
1584}
1585
1586PyObject *PyUnicode_EncodeFSDefault(PyObject *unicode)
1587{
1588    if (Py_FileSystemDefaultEncoding) {
1589#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1590        if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0)
1591            return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1592                                        PyUnicode_GET_SIZE(unicode),
1593                                        NULL);
1594#endif
1595        return PyUnicode_AsEncodedString(unicode,
1596                                         Py_FileSystemDefaultEncoding,
1597                                         "surrogateescape");
1598    }
1599    else {
1600        /* locale encoding with surrogateescape */
1601        wchar_t *wchar;
1602        char *bytes;
1603        PyObject *bytes_obj;
1604
1605        wchar = PyUnicode_AsWideCharString(unicode, NULL);
1606        if (wchar == NULL)
1607            return NULL;
1608        bytes = _Py_wchar2char(wchar);
1609        PyMem_Free(wchar);
1610        if (bytes == NULL)
1611            return NULL;
1612
1613        bytes_obj = PyBytes_FromString(bytes);
1614        PyMem_Free(bytes);
1615        return bytes_obj;
1616    }
1617}
1618
1619PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1620                                    const char *encoding,
1621                                    const char *errors)
1622{
1623    PyObject *v;
1624    char lower[11];  /* Enough for any encoding shortcut */
1625
1626    if (!PyUnicode_Check(unicode)) {
1627        PyErr_BadArgument();
1628        return NULL;
1629    }
1630
1631    if (encoding == NULL)
1632        encoding = PyUnicode_GetDefaultEncoding();
1633
1634    /* Shortcuts for common default encodings */
1635    if (normalize_encoding(encoding, lower, sizeof(lower))) {
1636        if (strcmp(lower, "utf-8") == 0)
1637            return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1638                                        PyUnicode_GET_SIZE(unicode),
1639                                        errors);
1640        else if ((strcmp(lower, "latin-1") == 0) ||
1641                 (strcmp(lower, "iso-8859-1") == 0))
1642            return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1643                                          PyUnicode_GET_SIZE(unicode),
1644                                          errors);
1645#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1646        else if (strcmp(lower, "mbcs") == 0)
1647            return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1648                                        PyUnicode_GET_SIZE(unicode),
1649                                        errors);
1650#endif
1651        else if (strcmp(lower, "ascii") == 0)
1652            return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1653                                         PyUnicode_GET_SIZE(unicode),
1654                                         errors);
1655    }
1656    /* During bootstrap, we may need to find the encodings
1657       package, to load the file system encoding, and require the
1658       file system encoding in order to load the encodings
1659       package.
1660
1661       Break out of this dependency by assuming that the path to
1662       the encodings module is ASCII-only.  XXX could try wcstombs
1663       instead, if the file system encoding is the locale's
1664       encoding. */
1665    if (Py_FileSystemDefaultEncoding &&
1666             strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1667             !PyThreadState_GET()->interp->codecs_initialized)
1668        return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1669                                     PyUnicode_GET_SIZE(unicode),
1670                                     errors);
1671
1672    /* Encode via the codec registry */
1673    v = PyCodec_Encode(unicode, encoding, errors);
1674    if (v == NULL)
1675        return NULL;
1676
1677    /* The normal path */
1678    if (PyBytes_Check(v))
1679        return v;
1680
1681    /* If the codec returns a buffer, raise a warning and convert to bytes */
1682    if (PyByteArray_Check(v)) {
1683        int error;
1684        PyObject *b;
1685
1686        error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
1687            "encoder %s returned bytearray instead of bytes",
1688            encoding);
1689        if (error) {
1690            Py_DECREF(v);
1691            return NULL;
1692        }
1693
1694        b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1695        Py_DECREF(v);
1696        return b;
1697    }
1698
1699    PyErr_Format(PyExc_TypeError,
1700                 "encoder did not return a bytes object (type=%.400s)",
1701                 Py_TYPE(v)->tp_name);
1702    Py_DECREF(v);
1703    return NULL;
1704}
1705
1706PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1707                                     const char *encoding,
1708                                     const char *errors)
1709{
1710    PyObject *v;
1711
1712    if (!PyUnicode_Check(unicode)) {
1713        PyErr_BadArgument();
1714        goto onError;
1715    }
1716
1717    if (encoding == NULL)
1718        encoding = PyUnicode_GetDefaultEncoding();
1719
1720    /* Encode via the codec registry */
1721    v = PyCodec_Encode(unicode, encoding, errors);
1722    if (v == NULL)
1723        goto onError;
1724    if (!PyUnicode_Check(v)) {
1725        PyErr_Format(PyExc_TypeError,
1726                     "encoder did not return an str object (type=%.400s)",
1727                     Py_TYPE(v)->tp_name);
1728        Py_DECREF(v);
1729        goto onError;
1730    }
1731    return v;
1732
1733  onError:
1734    return NULL;
1735}
1736
1737PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1738                                            const char *errors)
1739{
1740    PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1741    if (v)
1742        return v;
1743    if (errors != NULL)
1744        Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
1745    v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1746                             PyUnicode_GET_SIZE(unicode),
1747                             NULL);
1748    if (!v)
1749        return NULL;
1750    ((PyUnicodeObject *)unicode)->defenc = v;
1751    return v;
1752}
1753
1754PyObject*
1755PyUnicode_DecodeFSDefault(const char *s) {
1756    Py_ssize_t size = (Py_ssize_t)strlen(s);
1757    return PyUnicode_DecodeFSDefaultAndSize(s, size);
1758}
1759
1760PyObject*
1761PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1762{
1763    /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1764       can be undefined. If it is case, decode using UTF-8. The following assumes
1765       that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1766       bootstrapping process where the codecs aren't ready yet.
1767    */
1768    if (Py_FileSystemDefaultEncoding) {
1769#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1770        if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
1771            return PyUnicode_DecodeMBCS(s, size, NULL);
1772        }
1773#elif defined(__APPLE__)
1774        if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
1775            return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
1776        }
1777#endif
1778        return PyUnicode_Decode(s, size,
1779                                Py_FileSystemDefaultEncoding,
1780                                "surrogateescape");
1781    }
1782    else {
1783        /* locale encoding with surrogateescape */
1784        wchar_t *wchar;
1785        PyObject *unicode;
1786        size_t len;
1787
1788        if (s[size] != '\0' || size != strlen(s)) {
1789            PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1790            return NULL;
1791        }
1792
1793        wchar = _Py_char2wchar(s, &len);
1794        if (wchar == NULL)
1795            return NULL;
1796
1797        unicode = PyUnicode_FromWideChar(wchar, len);
1798        PyMem_Free(wchar);
1799        return unicode;
1800    }
1801}
1802
1803
1804int
1805PyUnicode_FSConverter(PyObject* arg, void* addr)
1806{
1807    PyObject *output = NULL;
1808    Py_ssize_t size;
1809    void *data;
1810    if (arg == NULL) {
1811        Py_DECREF(*(PyObject**)addr);
1812        return 1;
1813    }
1814    if (PyBytes_Check(arg)) {
1815        output = arg;
1816        Py_INCREF(output);
1817    }
1818    else {
1819        arg = PyUnicode_FromObject(arg);
1820        if (!arg)
1821            return 0;
1822        output = PyUnicode_EncodeFSDefault(arg);
1823        Py_DECREF(arg);
1824        if (!output)
1825            return 0;
1826        if (!PyBytes_Check(output)) {
1827            Py_DECREF(output);
1828            PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1829            return 0;
1830        }
1831    }
1832    size = PyBytes_GET_SIZE(output);
1833    data = PyBytes_AS_STRING(output);
1834    if (size != strlen(data)) {
1835        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1836        Py_DECREF(output);
1837        return 0;
1838    }
1839    *(PyObject**)addr = output;
1840    return Py_CLEANUP_SUPPORTED;
1841}
1842
1843
1844int
1845PyUnicode_FSDecoder(PyObject* arg, void* addr)
1846{
1847    PyObject *output = NULL;
1848    Py_ssize_t size;
1849    void *data;
1850    if (arg == NULL) {
1851        Py_DECREF(*(PyObject**)addr);
1852        return 1;
1853    }
1854    if (PyUnicode_Check(arg)) {
1855        output = arg;
1856        Py_INCREF(output);
1857    }
1858    else {
1859        arg = PyBytes_FromObject(arg);
1860        if (!arg)
1861            return 0;
1862        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
1863                                                  PyBytes_GET_SIZE(arg));
1864        Py_DECREF(arg);
1865        if (!output)
1866            return 0;
1867        if (!PyUnicode_Check(output)) {
1868            Py_DECREF(output);
1869            PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
1870            return 0;
1871        }
1872    }
1873    size = PyUnicode_GET_SIZE(output);
1874    data = PyUnicode_AS_UNICODE(output);
1875    if (size != Py_UNICODE_strlen(data)) {
1876        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1877        Py_DECREF(output);
1878        return 0;
1879    }
1880    *(PyObject**)addr = output;
1881    return Py_CLEANUP_SUPPORTED;
1882}
1883
1884
1885char*
1886_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
1887{
1888    PyObject *bytes;
1889    if (!PyUnicode_Check(unicode)) {
1890        PyErr_BadArgument();
1891        return NULL;
1892    }
1893    bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1894    if (bytes == NULL)
1895        return NULL;
1896    if (psize != NULL)
1897        *psize = PyBytes_GET_SIZE(bytes);
1898    return PyBytes_AS_STRING(bytes);
1899}
1900
1901char*
1902_PyUnicode_AsString(PyObject *unicode)
1903{
1904    return _PyUnicode_AsStringAndSize(unicode, NULL);
1905}
1906
1907Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1908{
1909    if (!PyUnicode_Check(unicode)) {
1910        PyErr_BadArgument();
1911        goto onError;
1912    }
1913    return PyUnicode_AS_UNICODE(unicode);
1914
1915  onError:
1916    return NULL;
1917}
1918
1919Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1920{
1921    if (!PyUnicode_Check(unicode)) {
1922        PyErr_BadArgument();
1923        goto onError;
1924    }
1925    return PyUnicode_GET_SIZE(unicode);
1926
1927  onError:
1928    return -1;
1929}
1930
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be freed. */
const char *PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
1935
1936/* create or adjust a UnicodeDecodeError */
1937static void
1938make_decode_exception(PyObject **exceptionObject,
1939                      const char *encoding,
1940                      const char *input, Py_ssize_t length,
1941                      Py_ssize_t startpos, Py_ssize_t endpos,
1942                      const char *reason)
1943{
1944    if (*exceptionObject == NULL) {
1945        *exceptionObject = PyUnicodeDecodeError_Create(
1946            encoding, input, length, startpos, endpos, reason);
1947    }
1948    else {
1949        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
1950            goto onError;
1951        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
1952            goto onError;
1953        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1954            goto onError;
1955    }
1956    return;
1957
1958onError:
1959    Py_DECREF(*exceptionObject);
1960    *exceptionObject = NULL;
1961}
1962
1963/* error handling callback helper:
1964   build arguments, call the callback and check the arguments,
1965   if no exception occurred, copy the replacement to the output
1966   and adjust various state variables.
1967   return 0 on success, -1 on error
1968*/
1969
1970static
1971int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1972                                     const char *encoding, const char *reason,
1973                                     const char **input, const char **inend, Py_ssize_t *startinpos,
1974                                     Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1975                                     PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
1976{
1977    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
1978
1979    PyObject *restuple = NULL;
1980    PyObject *repunicode = NULL;
1981    Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1982    Py_ssize_t insize;
1983    Py_ssize_t requiredsize;
1984    Py_ssize_t newpos;
1985    Py_UNICODE *repptr;
1986    PyObject *inputobj = NULL;
1987    Py_ssize_t repsize;
1988    int res = -1;
1989
1990    if (*errorHandler == NULL) {
1991        *errorHandler = PyCodec_LookupError(errors);
1992        if (*errorHandler == NULL)
1993            goto onError;
1994    }
1995
1996    make_decode_exception(exceptionObject,
1997        encoding,
1998        *input, *inend - *input,
1999        *startinpos, *endinpos,
2000        reason);
2001    if (*exceptionObject == NULL)
2002        goto onError;
2003
2004    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2005    if (restuple == NULL)
2006        goto onError;
2007    if (!PyTuple_Check(restuple)) {
2008        PyErr_SetString(PyExc_TypeError, &argparse[4]);
2009        goto onError;
2010    }
2011    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
2012        goto onError;
2013
2014    /* Copy back the bytes variables, which might have been modified by the
2015       callback */
2016    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2017    if (!inputobj)
2018        goto onError;
2019    if (!PyBytes_Check(inputobj)) {
2020        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
2021    }
2022    *input = PyBytes_AS_STRING(inputobj);
2023    insize = PyBytes_GET_SIZE(inputobj);
2024    *inend = *input + insize;
2025    /* we can DECREF safely, as the exception has another reference,
2026       so the object won't go away. */
2027    Py_DECREF(inputobj);
2028
2029    if (newpos<0)
2030        newpos = insize+newpos;
2031    if (newpos<0 || newpos>insize) {
2032        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2033        goto onError;
2034    }
2035
2036    /* need more space? (at least enough for what we
2037       have+the replacement+the rest of the string (starting
2038       at the new input position), so we won't have to check space
2039       when there are no errors in the rest of the string) */
2040    repptr = PyUnicode_AS_UNICODE(repunicode);
2041    repsize = PyUnicode_GET_SIZE(repunicode);
2042    requiredsize = *outpos + repsize + insize-newpos;
2043    if (requiredsize > outsize) {
2044        if (requiredsize<2*outsize)
2045            requiredsize = 2*outsize;
2046        if (_PyUnicode_Resize(output, requiredsize) < 0)
2047            goto onError;
2048        *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
2049    }
2050    *endinpos = newpos;
2051    *inptr = *input + newpos;
2052    Py_UNICODE_COPY(*outptr, repptr, repsize);
2053    *outptr += repsize;
2054    *outpos += repsize;
2055
2056    /* we made it! */
2057    res = 0;
2058
2059  onError:
2060    Py_XDECREF(restuple);
2061    return res;
2062}
2063
2064/* --- UTF-7 Codec -------------------------------------------------------- */
2065
2066/* See RFC2152 for details.  We encode conservatively and decode liberally. */
2067
/* Three simple macros defining base-64.  All macros in this section
   evaluate their argument more than once, so pass side-effect-free
   expressions only. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is range-checked before it is used to index utf7_category, so any
 * non-negative character value may be passed.  directO and directWS
 * are parenthesized in the expansion so that compound expressions
 * (e.g. `a || b`) keep their intended meaning next to the `&&`. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
2144
2145PyObject *PyUnicode_DecodeUTF7(const char *s,
2146                               Py_ssize_t size,
2147                               const char *errors)
2148{
2149    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
2150}
2151
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * s, size:  input bytes.
 * errors:   error handler name, passed to
 *           unicode_decode_call_errorhandler().
 * consumed: if NULL, the input is treated as complete and a shift
 *           sequence left in an inconsistent state at the end is reported
 *           as an error.  If non-NULL, *consumed receives the number of
 *           input bytes accounted for; when the input ends inside a shift
 *           sequence this is the offset of the opening '+' and the output
 *           produced by that sequence is dropped.
 *
 * Returns a new unicode object, or NULL with an exception set. */

PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
                                       Py_ssize_t size,
                                       const char *errors,
                                       Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const char *errmsg = "";
    int inShift = 0;
    /* output position at which the current shift sequence began */
    Py_UNICODE *shiftOutStart;
    /* number of not-yet-emitted bits held in base64buffer */
    unsigned int base64bits = 0;
    unsigned long base64buffer = 0;
    /* pending high surrogate, or 0 if none */
    Py_UNICODE surrogate = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* Start with one output code unit per input byte. */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    p = unicode->str;
    shiftOutStart = p;
    e = s + size;

    while (s < e) {
        Py_UNICODE ch;
        /* Also entered via goto from the end-of-input error handling
           below when the handler moved s back before e. */
      restart:
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UNICODE outCh = (Py_UNICODE)
                                       (base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
#ifdef Py_UNICODE_WIDE
                            /* combine the pair into one code point */
                            *p++ = (((surrogate & 0x3FF)<<10)
                                    | (outCh & 0x3FF)) + 0x10000;
#else
                            /* store the pair as two code units */
                            *p++ = surrogate;
                            *p++ = outCh;
#endif
                            surrogate = 0;
                        }
                        else {
                            surrogate = 0;
                            errmsg = "second surrogate missing";
                            goto utf7Error;
                        }
                    }
                    else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
                        errmsg = "unexpected second surrogate";
                        goto utf7Error;
                    }
                    else {
                        *p++ = outCh;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                s++;
                if (surrogate) {
                    errmsg = "second surrogate missing at end of shift sequence";
                    goto utf7Error;
                }
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                /* NOTE(review): the terminating byte is copied to the
                   output without a DECODE_DIRECT() check, so a byte above
                   127 ending a shift sequence is emitted as-is -- confirm
                   this is intended. */
                if (ch != '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    *p++ = ch;
                }
            }
        }
        else if ( ch == '+' ) {
            /* remember where the sequence starts for error reporting and
               for backing off in stateful mode */
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                *p++ = '+';
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                shiftOutStart = p;
                base64bits = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            *p++ = ch;
            s++;
        }
        else {
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
        /* The error handler may replace starts/e, move s, and resize
           unicode (updating p); decoding then continues with the loop. */
utf7Error:
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &unicode, &outpos, &p))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            outpos = p-PyUnicode_AS_UNICODE(unicode);
            endinpos = size;
            if (unicode_decode_call_errorhandler(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &unicode, &outpos, &p))
                goto onError;
            /* the handler may have asked to resume before the end */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            p = shiftOutStart; /* back off output */
            *consumed = startinpos;
        }
        else {
            *consumed = s-starts;
        }
    }

    /* Trim the result to the number of code units actually written. */
    if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
2347
2348
2349PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
2350                               Py_ssize_t size,
2351                               int base64SetO,
2352                               int base64WhiteSpace,
2353                               const char *errors)
2354{
2355    PyObject *v;
2356    /* It might be possible to tighten this worst case */
2357    Py_ssize_t allocated = 8 * size;
2358    int inShift = 0;
2359    Py_ssize_t i = 0;
2360    unsigned int base64bits = 0;
2361    unsigned long base64buffer = 0;
2362    char * out;
2363    char * start;
2364
2365    if (size == 0)
2366        return PyBytes_FromStringAndSize(NULL, 0);
2367
2368    if (allocated / 8 != size)
2369        return PyErr_NoMemory();
2370
2371    v = PyBytes_FromStringAndSize(NULL, allocated);
2372    if (v == NULL)
2373        return NULL;
2374
2375    start = out = PyBytes_AS_STRING(v);
2376    for (;i < size; ++i) {
2377        Py_UNICODE ch = s[i];
2378
2379        if (inShift) {
2380            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2381                /* shifting out */
2382                if (base64bits) { /* output remaining bits */
2383                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
2384                    base64buffer = 0;
2385                    base64bits = 0;
2386                }
2387                inShift = 0;
2388                /* Characters not in the BASE64 set implicitly unshift the sequence
2389                   so no '-' is required, except if the character is itself a '-' */
2390                if (IS_BASE64(ch) || ch == '-') {
2391                    *out++ = '-';
2392                }
2393                *out++ = (char) ch;
2394            }
2395            else {
2396                goto encode_char;
2397            }
2398        }
2399        else { /* not in a shift sequence */
2400            if (ch == '+') {
2401                *out++ = '+';
2402                        *out++ = '-';
2403            }
2404            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2405                *out++ = (char) ch;
2406            }
2407            else {
2408                *out++ = '+';
2409                inShift = 1;
2410                goto encode_char;
2411            }
2412        }
2413        continue;
2414encode_char:
2415#ifdef Py_UNICODE_WIDE
2416        if (ch >= 0x10000) {
2417            /* code first surrogate */
2418            base64bits += 16;
2419            base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2420            while (base64bits >= 6) {
2421                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2422                base64bits -= 6;
2423            }
2424            /* prepare second surrogate */
2425            ch =  0xDC00 | ((ch-0x10000) & 0x3FF);
2426        }
2427#endif
2428        base64bits += 16;
2429        base64buffer = (base64buffer << 16) | ch;
2430        while (base64bits >= 6) {
2431            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2432            base64bits -= 6;
2433        }
2434    }
2435    if (base64bits)
2436        *out++= TO_BASE64(base64buffer << (6-base64bits) );
2437    if (inShift)
2438        *out++ = '-';
2439    if (_PyBytes_Resize(&v, out - start) < 0)
2440        return NULL;
2441    return v;
2442}
2443
2444#undef IS_BASE64
2445#undef FROM_BASE64
2446#undef TO_BASE64
2447#undef DECODE_DIRECT
2448#undef ENCODE_DIRECT
2449
2450/* --- UTF-8 Codec -------------------------------------------------------- */
2451
/* Map a UTF-8 encoded prefix (lead) byte to the length of the sequence it
   starts.  Zero means illegal prefix.  See RFC 3629 for details. */
static char utf8_code_length[256] = {
    /* 00-0F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 10-1F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 20-2F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 30-3F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 40-4F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 50-5F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 60-6F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 70-7F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 80-8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 90-9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* A0-AF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* B0-BF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* C0-C1 illegal, C2-CF */
                0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* D0-DF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* E0-EF */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* F0-F4, F5-FF illegal */
                4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
2473
2474PyObject *PyUnicode_DecodeUTF8(const char *s,
2475                               Py_ssize_t size,
2476                               const char *errors)
2477{
2478    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2479}
2480
2481/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2482#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2483
2484/* Mask to quickly check whether a C 'long' contains a
2485   non-ASCII, UTF8-encoded char. */
2486#if (SIZEOF_LONG == 8)
2487# define ASCII_CHAR_MASK 0x8080808080808080L
2488#elif (SIZEOF_LONG == 4)
2489# define ASCII_CHAR_MASK 0x80808080L
2490#else
2491# error C 'long' size should be either 4 or 8!
2492#endif
2493
/* Decode `size` bytes of UTF-8 starting at `s` into a new unicode object.

   errors:   error handler name, passed to
             unicode_decode_call_errorhandler() for invalid input.
   consumed: if NULL, a sequence cut off by the end of the input is an
             error ("unexpected end of data").  If non-NULL, decoding
             stops in front of such a sequence and *consumed receives the
             number of bytes decoded.

   Returns the decoded string, or NULL with an exception set. */
PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
                                       Py_ssize_t size,
                                       const char *errors,
                                       Py_ssize_t *consumed)
{
    const char *starts = s;
    /* length of the current sequence, from utf8_code_length */
    int n;
    int k;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e, *aligned_end;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    /* Unpack UTF-8 encoded data */
    p = unicode->str;
    e = s + size;
    /* e rounded down to a multiple of SIZEOF_LONG: whole-long reads in the
       fast path below must start before this address */
    aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);

    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        if (ch < 0x80) {
            /* Fast path for runs of ASCII characters. Given that common UTF-8
               input will consist of an overwhelming majority of ASCII
               characters, we try to optimize for this case by checking
               as many characters as a C 'long' can contain.
               First, check if we can do an aligned read, as most CPUs have
               a penalty for unaligned reads.
            */
            if (!((size_t) s & LONG_PTR_MASK)) {
                /* Help register allocation */
                register const char *_s = s;
                register Py_UNICODE *_p = p;
                while (_s < aligned_end) {
                    /* Read a whole long at a time (either 4 or 8 bytes),
                       and do a fast unrolled copy if it only contains ASCII
                       characters. */
                    unsigned long data = *(unsigned long *) _s;
                    if (data & ASCII_CHAR_MASK)
                        break;
                    _p[0] = (unsigned char) _s[0];
                    _p[1] = (unsigned char) _s[1];
                    _p[2] = (unsigned char) _s[2];
                    _p[3] = (unsigned char) _s[3];
#if (SIZEOF_LONG == 8)
                    _p[4] = (unsigned char) _s[4];
                    _p[5] = (unsigned char) _s[5];
                    _p[6] = (unsigned char) _s[6];
                    _p[7] = (unsigned char) _s[7];
#endif
                    _s += SIZEOF_LONG;
                    _p += SIZEOF_LONG;
                }
                s = _s;
                p = _p;
                if (s == e)
                    break;
                /* reload: s may have advanced past the byte read above */
                ch = (unsigned char)*s;
            }
        }

        /* single ASCII byte (slow path) */
        if (ch < 0x80) {
            *p++ = (Py_UNICODE)ch;
            s++;
            continue;
        }

        n = utf8_code_length[ch];

        /* The sequence announced by the lead byte runs past the end. */
        if (s + n > e) {
            if (consumed)
                break;
            else {
                errmsg = "unexpected end of data";
                startinpos = s-starts;
                endinpos = startinpos+1;
                /* extend the error range over the continuation bytes that
                   follow the lead byte.
                   NOTE(review): the bound uses the original `size` even if
                   an earlier error handler call replaced the input buffer
                   -- confirm this is safe. */
                for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
                    endinpos++;
                goto utf8Error;
            }
        }

        switch (n) {

        case 0:
            errmsg = "invalid start byte";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;

        case 1:
            /* not expected: bytes below 0x80 were handled above, and the
               table maps no byte >= 0x80 to 1 */
            errmsg = "internal error";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;

        case 2:
            if ((s[1] & 0xc0) != 0x80) {
                errmsg = "invalid continuation byte";
                startinpos = s-starts;
                endinpos = startinpos + 1;
                goto utf8Error;
            }
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *p++ = (Py_UNICODE)ch;
            break;

        case 3:
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            /* The E0 test additionally rejects second bytes below A0. */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0) ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)) {
                errmsg = "invalid continuation byte";
                startinpos = s-starts;
                endinpos = startinpos + 1;

                /* if s[1] first two bits are 1 and 0, then the invalid
                   continuation byte is s[2], so increment endinpos by 1,
                   if not, s[1] is invalid and endinpos doesn't need to
                   be incremented. */
                if ((s[1] & 0xC0) == 0x80)
                    endinpos++;
                goto utf8Error;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            *p++ = (Py_UNICODE)ch;
            break;

        case 4:
            /* Besides malformed continuation bytes, reject F0 followed by
               a byte below 90 and F4 followed by a byte above 8F. */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                errmsg = "invalid continuation byte";
                startinpos = s-starts;
                endinpos = startinpos + 1;
                /* extend the error range over each leading byte that has
                   the form of a continuation byte */
                if ((s[1] & 0xC0) == 0x80) {
                    endinpos++;
                    if ((s[2] & 0xC0) == 0x80)
                        endinpos++;
                }
                goto utf8Error;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#ifdef Py_UNICODE_WIDE
            *p++ = (Py_UNICODE)ch;
#else
            /*  compute and append the two surrogates: */

            /*  translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /*  high surrogate = top 10 bits added to D800 */
            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));

            /*  low surrogate = bottom 10 bits added to DC00 */
            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
#endif
            break;
        }
        s += n;
        continue;

        /* The error handler may replace starts/e, move s, and resize
           unicode (updating p). */
      utf8Error:
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf8", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &unicode, &outpos, &p))
            goto onError;
        /* e may have changed, so recompute the fast-path limit */
        aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
    }
    if (consumed)
        *consumed = s-starts;

    /* Adjust length */
    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
2715
2716#undef ASCII_CHAR_MASK
2717
2718
/* Allocation strategy:  if the string is short, convert into a stack buffer
   and allocate exactly as much space as is needed at the end.  Otherwise
   allocate the maximum that could possibly be needed (4 result bytes per
   Unicode character), and return the excess memory at the end.
*/
2724PyObject *
2725PyUnicode_EncodeUTF8(const Py_UNICODE *s,
2726                     Py_ssize_t size,
2727                     const char *errors)
2728{
2729#define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
2730
2731    Py_ssize_t i;                /* index into s of next input byte */
2732    PyObject *result;            /* result string object */
2733    char *p;                     /* next free byte in output buffer */
2734    Py_ssize_t nallocated;      /* number of result bytes allocated */
2735    Py_ssize_t nneeded;            /* number of result bytes needed */
2736    char stackbuf[MAX_SHORT_UNICHARS * 4];
2737    PyObject *errorHandler = NULL;
2738    PyObject *exc = NULL;
2739
2740    assert(s != NULL);
2741    assert(size >= 0);
2742
2743    if (size <= MAX_SHORT_UNICHARS) {
2744        /* Write into the stack buffer; nallocated can't overflow.
2745         * At the end, we'll allocate exactly as much heap space as it
2746         * turns out we need.
2747         */
2748        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2749        result = NULL;   /* will allocate after we're done */
2750        p = stackbuf;
2751    }
2752    else {
2753        /* Overallocate on the heap, and give the excess back at the end. */
2754        nallocated = size * 4;
2755        if (nallocated / 4 != size)  /* overflow! */
2756            return PyErr_NoMemory();
2757        result = PyBytes_FromStringAndSize(NULL, nallocated);
2758        if (result == NULL)
2759            return NULL;
2760        p = PyBytes_AS_STRING(result);
2761    }
2762
2763    for (i = 0; i < size;) {
2764        Py_UCS4 ch = s[i++];
2765
2766        if (ch < 0x80)
2767            /* Encode ASCII */
2768            *p++ = (char) ch;
2769
2770        else if (ch < 0x0800) {
2771            /* Encode Latin-1 */
2772            *p++ = (char)(0xc0 | (ch >> 6));
2773            *p++ = (char)(0x80 | (ch & 0x3f));
2774        } else if (0xD800 <= ch && ch <= 0xDFFF) {
2775#ifndef Py_UNICODE_WIDE
2776            /* Special case: check for high and low surrogate */
2777            if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2778                Py_UCS4 ch2 = s[i];
2779                /* Combine the two surrogates to form a UCS4 value */
2780                ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2781                i++;
2782
2783                /* Encode UCS4 Unicode ordinals */
2784                *p++ = (char)(0xf0 | (ch >> 18));
2785                *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2786                *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2787                *p++ = (char)(0x80 | (ch & 0x3f));
2788            } else {
2789#endif
2790                Py_ssize_t newpos;
2791                PyObject *rep;
2792                Py_ssize_t repsize, k;
2793                rep = unicode_encode_call_errorhandler
2794                    (errors, &errorHandler, "utf-8", "surrogates not allowed",
2795                     s, size, &exc, i-1, i, &newpos);
2796                if (!rep)
2797                    goto error;
2798
2799                if (PyBytes_Check(rep))
2800                    repsize = PyBytes_GET_SIZE(rep);
2801                else
2802                    repsize = PyUnicode_GET_SIZE(rep);
2803
2804                if (repsize > 4) {
2805                    Py_ssize_t offset;
2806
2807                    if (result == NULL)
2808                        offset = p - stackbuf;
2809                    else
2810                        offset = p - PyBytes_AS_STRING(result);
2811
2812                    if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2813                        /* integer overflow */
2814                        PyErr_NoMemory();
2815                        goto error;
2816                    }
2817                    nallocated += repsize - 4;
2818                    if (result != NULL) {
2819                        if (_PyBytes_Resize(&result, nallocated) < 0)
2820                            goto error;
2821                    } else {
2822                        result = PyBytes_FromStringAndSize(NULL, nallocated);
2823                        if (result == NULL)
2824                            goto error;
2825                        Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
2826                    }
2827                    p = PyBytes_AS_STRING(result) + offset;
2828                }
2829
2830                if (PyBytes_Check(rep)) {
2831                    char *prep = PyBytes_AS_STRING(rep);
2832                    for(k = repsize; k > 0; k--)
2833                        *p++ = *prep++;
2834                } else /* rep is unicode */ {
2835                    Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
2836                    Py_UNICODE c;
2837
2838                    for(k=0; k<repsize; k++) {
2839                        c = prep[k];
2840                        if (0x80 <= c) {
2841                            raise_encode_exception(&exc, "utf-8", s, size,
2842                                                   i-1, i, "surrogates not allowed");
2843                            goto error;
2844                        }
2845                        *p++ = (char)prep[k];
2846                    }
2847                }
2848                Py_DECREF(rep);
2849#ifndef Py_UNICODE_WIDE
2850            }
2851#endif
2852        } else if (ch < 0x10000) {
2853            *p++ = (char)(0xe0 | (ch >> 12));
2854            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2855            *p++ = (char)(0x80 | (ch & 0x3f));
2856        } else /* ch >= 0x10000 */ {
2857            /* Encode UCS4 Unicode ordinals */
2858            *p++ = (char)(0xf0 | (ch >> 18));
2859            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2860            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2861            *p++ = (char)(0x80 | (ch & 0x3f));
2862        }
2863    }
2864
2865    if (result == NULL) {
2866        /* This was stack allocated. */
2867        nneeded = p - stackbuf;
2868        assert(nneeded <= nallocated);
2869        result = PyBytes_FromStringAndSize(stackbuf, nneeded);
2870    }
2871    else {
2872        /* Cut back to size actually needed. */
2873        nneeded = p - PyBytes_AS_STRING(result);
2874        assert(nneeded <= nallocated);
2875        _PyBytes_Resize(&result, nneeded);
2876    }
2877    Py_XDECREF(errorHandler);
2878    Py_XDECREF(exc);
2879    return result;
2880 error:
2881    Py_XDECREF(errorHandler);
2882    Py_XDECREF(exc);
2883    Py_XDECREF(result);
2884    return NULL;
2885
2886#undef MAX_SHORT_UNICHARS
2887}
2888
2889PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2890{
2891    if (!PyUnicode_Check(unicode)) {
2892        PyErr_BadArgument();
2893        return NULL;
2894    }
2895    return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2896                                PyUnicode_GET_SIZE(unicode),
2897                                NULL);
2898}
2899
2900/* --- UTF-32 Codec ------------------------------------------------------- */
2901
2902PyObject *
2903PyUnicode_DecodeUTF32(const char *s,
2904                      Py_ssize_t size,
2905                      const char *errors,
2906                      int *byteorder)
2907{
2908    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2909}
2910
2911PyObject *
2912PyUnicode_DecodeUTF32Stateful(const char *s,
2913                              Py_ssize_t size,
2914                              const char *errors,
2915                              int *byteorder,
2916                              Py_ssize_t *consumed)
2917{
2918    const char *starts = s;
2919    Py_ssize_t startinpos;
2920    Py_ssize_t endinpos;
2921    Py_ssize_t outpos;
2922    PyUnicodeObject *unicode;
2923    Py_UNICODE *p;
2924#ifndef Py_UNICODE_WIDE
2925    int pairs = 0;
2926    const unsigned char *qq;
2927#else
2928    const int pairs = 0;
2929#endif
2930    const unsigned char *q, *e;
2931    int bo = 0;       /* assume native ordering by default */
2932    const char *errmsg = "";
2933    /* Offsets from q for retrieving bytes in the right order. */
2934#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2935    int iorder[] = {0, 1, 2, 3};
2936#else
2937    int iorder[] = {3, 2, 1, 0};
2938#endif
2939    PyObject *errorHandler = NULL;
2940    PyObject *exc = NULL;
2941
2942    q = (unsigned char *)s;
2943    e = q + size;
2944
2945    if (byteorder)
2946        bo = *byteorder;
2947
2948    /* Check for BOM marks (U+FEFF) in the input and adjust current
2949       byte order setting accordingly. In native mode, the leading BOM
2950       mark is skipped, in all other modes, it is copied to the output
2951       stream as-is (giving a ZWNBSP character). */
2952    if (bo == 0) {
2953        if (size >= 4) {
2954            const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2955                (q[iorder[1]] << 8) | q[iorder[0]];
2956#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2957            if (bom == 0x0000FEFF) {
2958                q += 4;
2959                bo = -1;
2960            }
2961            else if (bom == 0xFFFE0000) {
2962                q += 4;
2963                bo = 1;
2964            }
2965#else
2966            if (bom == 0x0000FEFF) {
2967                q += 4;
2968                bo = 1;
2969            }
2970            else if (bom == 0xFFFE0000) {
2971                q += 4;
2972                bo = -1;
2973            }
2974#endif
2975        }
2976    }
2977
2978    if (bo == -1) {
2979        /* force LE */
2980        iorder[0] = 0;
2981        iorder[1] = 1;
2982        iorder[2] = 2;
2983        iorder[3] = 3;
2984    }
2985    else if (bo == 1) {
2986        /* force BE */
2987        iorder[0] = 3;
2988        iorder[1] = 2;
2989        iorder[2] = 1;
2990        iorder[3] = 0;
2991    }
2992
2993    /* On narrow builds we split characters outside the BMP into two
2994       codepoints => count how much extra space we need. */
2995#ifndef Py_UNICODE_WIDE
2996    for (qq = q; qq < e; qq += 4)
2997        if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2998            pairs++;
2999#endif
3000
3001    /* This might be one to much, because of a BOM */
3002    unicode = _PyUnicode_New((size+3)/4+pairs);
3003    if (!unicode)
3004        return NULL;
3005    if (size == 0)
3006        return (PyObject *)unicode;
3007
3008    /* Unpack UTF-32 encoded data */
3009    p = unicode->str;
3010
3011    while (q < e) {
3012        Py_UCS4 ch;
3013        /* remaining bytes at the end? (size should be divisible by 4) */
3014        if (e-q<4) {
3015            if (consumed)
3016                break;
3017            errmsg = "truncated data";
3018            startinpos = ((const char *)q)-starts;
3019            endinpos = ((const char *)e)-starts;
3020            goto utf32Error;
3021            /* The remaining input chars are ignored if the callback
3022               chooses to skip the input */
3023        }
3024        ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
3025            (q[iorder[1]] << 8) | q[iorder[0]];
3026
3027        if (ch >= 0x110000)
3028        {
3029            errmsg = "codepoint not in range(0x110000)";
3030            startinpos = ((const char *)q)-starts;
3031            endinpos = startinpos+4;
3032            goto utf32Error;
3033        }
3034#ifndef Py_UNICODE_WIDE
3035        if (ch >= 0x10000)
3036        {
3037            *p++ = 0xD800 | ((ch-0x10000) >> 10);
3038            *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
3039        }
3040        else
3041#endif
3042            *p++ = ch;
3043        q += 4;
3044        continue;
3045      utf32Error:
3046        outpos = p-PyUnicode_AS_UNICODE(unicode);
3047        if (unicode_decode_call_errorhandler(
3048                errors, &errorHandler,
3049                "utf32", errmsg,
3050                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
3051                &unicode, &outpos, &p))
3052            goto onError;
3053    }
3054
3055    if (byteorder)
3056        *byteorder = bo;
3057
3058    if (consumed)
3059        *consumed = (const char *)q-starts;
3060
3061    /* Adjust length */
3062    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
3063        goto onError;
3064
3065    Py_XDECREF(errorHandler);
3066    Py_XDECREF(exc);
3067    return (PyObject *)unicode;
3068
3069  onError:
3070    Py_DECREF(unicode);
3071    Py_XDECREF(errorHandler);
3072    Py_XDECREF(exc);
3073    return NULL;
3074}
3075
3076PyObject *
3077PyUnicode_EncodeUTF32(const Py_UNICODE *s,
3078                      Py_ssize_t size,
3079                      const char *errors,
3080                      int byteorder)
3081{
3082    PyObject *v;
3083    unsigned char *p;
3084    Py_ssize_t nsize, bytesize;
3085#ifndef Py_UNICODE_WIDE
3086    Py_ssize_t i, pairs;
3087#else
3088    const int pairs = 0;
3089#endif
3090    /* Offsets from p for storing byte pairs in the right order. */
3091#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3092    int iorder[] = {0, 1, 2, 3};
3093#else
3094    int iorder[] = {3, 2, 1, 0};
3095#endif
3096
3097#define STORECHAR(CH)                           \
3098    do {                                        \
3099        p[iorder[3]] = ((CH) >> 24) & 0xff;     \
3100        p[iorder[2]] = ((CH) >> 16) & 0xff;     \
3101        p[iorder[1]] = ((CH) >> 8) & 0xff;      \
3102        p[iorder[0]] = (CH) & 0xff;             \
3103        p += 4;                                 \
3104    } while(0)
3105
3106    /* In narrow builds we can output surrogate pairs as one codepoint,
3107       so we need less space. */
3108#ifndef Py_UNICODE_WIDE
3109    for (i = pairs = 0; i < size-1; i++)
3110        if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
3111            0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
3112            pairs++;
3113#endif
3114    nsize = (size - pairs + (byteorder == 0));
3115    bytesize = nsize * 4;
3116    if (bytesize / 4 != nsize)
3117        return PyErr_NoMemory();
3118    v = PyBytes_FromStringAndSize(NULL, bytesize);
3119    if (v == NULL)
3120        return NULL;
3121
3122    p = (unsigned char *)PyBytes_AS_STRING(v);
3123    if (byteorder == 0)
3124        STORECHAR(0xFEFF);
3125    if (size == 0)
3126        goto done;
3127
3128    if (byteorder == -1) {
3129        /* force LE */
3130        iorder[0] = 0;
3131        iorder[1] = 1;
3132        iorder[2] = 2;
3133        iorder[3] = 3;
3134    }
3135    else if (byteorder == 1) {
3136        /* force BE */
3137        iorder[0] = 3;
3138        iorder[1] = 2;
3139        iorder[2] = 1;
3140        iorder[3] = 0;
3141    }
3142
3143    while (size-- > 0) {
3144        Py_UCS4 ch = *s++;
3145#ifndef Py_UNICODE_WIDE
3146        if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
3147            Py_UCS4 ch2 = *s;
3148            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3149                ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3150                s++;
3151                size--;
3152            }
3153        }
3154#endif
3155        STORECHAR(ch);
3156    }
3157
3158  done:
3159    return v;
3160#undef STORECHAR
3161}
3162
3163PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
3164{
3165    if (!PyUnicode_Check(unicode)) {
3166        PyErr_BadArgument();
3167        return NULL;
3168    }
3169    return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
3170                                 PyUnicode_GET_SIZE(unicode),
3171                                 NULL,
3172                                 0);
3173}
3174
3175/* --- UTF-16 Codec ------------------------------------------------------- */
3176
3177PyObject *
3178PyUnicode_DecodeUTF16(const char *s,
3179                      Py_ssize_t size,
3180                      const char *errors,
3181                      int *byteorder)
3182{
3183    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3184}
3185
3186/* Two masks for fast checking of whether a C 'long' may contain
3187   UTF16-encoded surrogate characters. This is an efficient heuristic,
3188   assuming that non-surrogate characters with a code point >= 0x8000 are
3189   rare in most input.
3190   FAST_CHAR_MASK is used when the input is in native byte ordering,
3191   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
3192*/
3193#if (SIZEOF_LONG == 8)
3194# define FAST_CHAR_MASK         0x8000800080008000L
3195# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3196#elif (SIZEOF_LONG == 4)
3197# define FAST_CHAR_MASK         0x80008000L
3198# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3199#else
3200# error C 'long' size should be either 4 or 8!
3201#endif
3202
3203PyObject *
3204PyUnicode_DecodeUTF16Stateful(const char *s,
3205                              Py_ssize_t size,
3206                              const char *errors,
3207                              int *byteorder,
3208                              Py_ssize_t *consumed)
3209{
3210    const char *starts = s;
3211    Py_ssize_t startinpos;
3212    Py_ssize_t endinpos;
3213    Py_ssize_t outpos;
3214    PyUnicodeObject *unicode;
3215    Py_UNICODE *p;
3216    const unsigned char *q, *e, *aligned_end;
3217    int bo = 0;       /* assume native ordering by default */
3218    int native_ordering = 0;
3219    const char *errmsg = "";
3220    /* Offsets from q for retrieving byte pairs in the right order. */
3221#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3222    int ihi = 1, ilo = 0;
3223#else
3224    int ihi = 0, ilo = 1;
3225#endif
3226    PyObject *errorHandler = NULL;
3227    PyObject *exc = NULL;
3228
3229    /* Note: size will always be longer than the resulting Unicode
3230       character count */
3231    unicode = _PyUnicode_New(size);
3232    if (!unicode)
3233        return NULL;
3234    if (size == 0)
3235        return (PyObject *)unicode;
3236
3237    /* Unpack UTF-16 encoded data */
3238    p = unicode->str;
3239    q = (unsigned char *)s;
3240    e = q + size - 1;
3241
3242    if (byteorder)
3243        bo = *byteorder;
3244
3245    /* Check for BOM marks (U+FEFF) in the input and adjust current
3246       byte order setting accordingly. In native mode, the leading BOM
3247       mark is skipped, in all other modes, it is copied to the output
3248       stream as-is (giving a ZWNBSP character). */
3249    if (bo == 0) {
3250        if (size >= 2) {
3251            const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
3252#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3253            if (bom == 0xFEFF) {
3254                q += 2;
3255                bo = -1;
3256            }
3257            else if (bom == 0xFFFE) {
3258                q += 2;
3259                bo = 1;
3260            }
3261#else
3262            if (bom == 0xFEFF) {
3263                q += 2;
3264                bo = 1;
3265            }
3266            else if (bom == 0xFFFE) {
3267                q += 2;
3268                bo = -1;
3269            }
3270#endif
3271        }
3272    }
3273
3274    if (bo == -1) {
3275        /* force LE */
3276        ihi = 1;
3277        ilo = 0;
3278    }
3279    else if (bo == 1) {
3280        /* force BE */
3281        ihi = 0;
3282        ilo = 1;
3283    }
3284#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3285    native_ordering = ilo < ihi;
3286#else
3287    native_ordering = ilo > ihi;
3288#endif
3289
3290    aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
3291    while (q < e) {
3292        Py_UNICODE ch;
3293        /* First check for possible aligned read of a C 'long'. Unaligned
3294           reads are more expensive, better to defer to another iteration. */
3295        if (!((size_t) q & LONG_PTR_MASK)) {
3296            /* Fast path for runs of non-surrogate chars. */
3297            register const unsigned char *_q = q;
3298            Py_UNICODE *_p = p;
3299            if (native_ordering) {
3300                /* Native ordering is simple: as long as the input cannot
3301                   possibly contain a surrogate char, do an unrolled copy
3302                   of several 16-bit code points to the target object.
3303                   The non-surrogate check is done on several input bytes
3304                   at a time (as many as a C 'long' can contain). */
3305                while (_q < aligned_end) {
3306                    unsigned long data = * (unsigned long *) _q;
3307                    if (data & FAST_CHAR_MASK)
3308                        break;
3309                    _p[0] = ((unsigned short *) _q)[0];
3310                    _p[1] = ((unsigned short *) _q)[1];
3311#if (SIZEOF_LONG == 8)
3312                    _p[2] = ((unsigned short *) _q)[2];
3313                    _p[3] = ((unsigned short *) _q)[3];
3314#endif
3315                    _q += SIZEOF_LONG;
3316                    _p += SIZEOF_LONG / 2;
3317                }
3318            }
3319            else {
3320                /* Byteswapped ordering is similar, but we must decompose
3321                   the copy bytewise, and take care of zero'ing out the
3322                   upper bytes if the target object is in 32-bit units
3323                   (that is, in UCS-4 builds). */
3324                while (_q < aligned_end) {
3325                    unsigned long data = * (unsigned long *) _q;
3326                    if (data & SWAPPED_FAST_CHAR_MASK)
3327                        break;
3328                    /* Zero upper bytes in UCS-4 builds */
3329#if (Py_UNICODE_SIZE > 2)
3330                    _p[0] = 0;
3331                    _p[1] = 0;
3332#if (SIZEOF_LONG == 8)
3333                    _p[2] = 0;
3334                    _p[3] = 0;
3335#endif
3336#endif
3337                    /* Issue #4916; UCS-4 builds on big endian machines must
3338                       fill the two last bytes of each 4-byte unit. */
3339#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3340# define OFF 2
3341#else
3342# define OFF 0
3343#endif
3344                    ((unsigned char *) _p)[OFF + 1] = _q[0];
3345                    ((unsigned char *) _p)[OFF + 0] = _q[1];
3346                    ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3347                    ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3348#if (SIZEOF_LONG == 8)
3349                    ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3350                    ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3351                    ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3352                    ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3353#endif
3354#undef OFF
3355                    _q += SIZEOF_LONG;
3356                    _p += SIZEOF_LONG / 2;
3357                }
3358            }
3359            p = _p;
3360            q = _q;
3361            if (q >= e)
3362                break;
3363        }
3364        ch = (q[ihi] << 8) | q[ilo];
3365
3366        q += 2;
3367
3368        if (ch < 0xD800 || ch > 0xDFFF) {
3369            *p++ = ch;
3370            continue;
3371        }
3372
3373        /* UTF-16 code pair: */
3374        if (q > e) {
3375            errmsg = "unexpected end of data";
3376            startinpos = (((const char *)q) - 2) - starts;
3377            endinpos = ((const char *)e) + 1 - starts;
3378            goto utf16Error;
3379        }
3380        if (0xD800 <= ch && ch <= 0xDBFF) {
3381            Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3382            q += 2;
3383            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3384#ifndef Py_UNICODE_WIDE
3385                *p++ = ch;
3386                *p++ = ch2;
3387#else
3388                *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3389#endif
3390                continue;
3391            }
3392            else {
3393                errmsg = "illegal UTF-16 surrogate";
3394                startinpos = (((const char *)q)-4)-starts;
3395                endinpos = startinpos+2;
3396                goto utf16Error;
3397            }
3398
3399        }
3400        errmsg = "illegal encoding";
3401        startinpos = (((const char *)q)-2)-starts;
3402        endinpos = startinpos+2;
3403        /* Fall through to report the error */
3404
3405      utf16Error:
3406        outpos = p - PyUnicode_AS_UNICODE(unicode);
3407        if (unicode_decode_call_errorhandler(
3408                errors,
3409                &errorHandler,
3410                "utf16", errmsg,
3411                &starts,
3412                (const char **)&e,
3413                &startinpos,
3414                &endinpos,
3415                &exc,
3416                (const char **)&q,
3417                &unicode,
3418                &outpos,
3419                &p))
3420            goto onError;
3421    }
3422    /* remaining byte at the end? (size should be even) */
3423    if (e == q) {
3424        if (!consumed) {
3425            errmsg = "truncated data";
3426            startinpos = ((const char *)q) - starts;
3427            endinpos = ((const char *)e) + 1 - starts;
3428            outpos = p - PyUnicode_AS_UNICODE(unicode);
3429            if (unicode_decode_call_errorhandler(
3430                    errors,
3431                    &errorHandler,
3432                    "utf16", errmsg,
3433                    &starts,
3434                    (const char **)&e,
3435                    &startinpos,
3436                    &endinpos,
3437                    &exc,
3438                    (const char **)&q,
3439                    &unicode,
3440                    &outpos,
3441                    &p))
3442                goto onError;
3443            /* The remaining input chars are ignored if the callback
3444               chooses to skip the input */
3445        }
3446    }
3447
3448    if (byteorder)
3449        *byteorder = bo;
3450
3451    if (consumed)
3452        *consumed = (const char *)q-starts;
3453
3454    /* Adjust length */
3455    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
3456        goto onError;
3457
3458    Py_XDECREF(errorHandler);
3459    Py_XDECREF(exc);
3460    return (PyObject *)unicode;
3461
3462  onError:
3463    Py_DECREF(unicode);
3464    Py_XDECREF(errorHandler);
3465    Py_XDECREF(exc);
3466    return NULL;
3467}
3468
3469#undef FAST_CHAR_MASK
3470#undef SWAPPED_FAST_CHAR_MASK
3471
3472PyObject *
3473PyUnicode_EncodeUTF16(const Py_UNICODE *s,
3474                      Py_ssize_t size,
3475                      const char *errors,
3476                      int byteorder)
3477{
3478    PyObject *v;
3479    unsigned char *p;
3480    Py_ssize_t nsize, bytesize;
3481#ifdef Py_UNICODE_WIDE
3482    Py_ssize_t i, pairs;
3483#else
3484    const int pairs = 0;
3485#endif
3486    /* Offsets from p for storing byte pairs in the right order. */
3487#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3488    int ihi = 1, ilo = 0;
3489#else
3490    int ihi = 0, ilo = 1;
3491#endif
3492
3493#define STORECHAR(CH)                           \
3494    do {                                        \
3495        p[ihi] = ((CH) >> 8) & 0xff;            \
3496        p[ilo] = (CH) & 0xff;                   \
3497        p += 2;                                 \
3498    } while(0)
3499
3500#ifdef Py_UNICODE_WIDE
3501    for (i = pairs = 0; i < size; i++)
3502        if (s[i] >= 0x10000)
3503            pairs++;
3504#endif
3505    /* 2 * (size + pairs + (byteorder == 0)) */
3506    if (size > PY_SSIZE_T_MAX ||
3507        size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
3508        return PyErr_NoMemory();
3509    nsize = size + pairs + (byteorder == 0);
3510    bytesize = nsize * 2;
3511    if (bytesize / 2 != nsize)
3512        return PyErr_NoMemory();
3513    v = PyBytes_FromStringAndSize(NULL, bytesize);
3514    if (v == NULL)
3515        return NULL;
3516
3517    p = (unsigned char *)PyBytes_AS_STRING(v);
3518    if (byteorder == 0)
3519        STORECHAR(0xFEFF);
3520    if (size == 0)
3521        goto done;
3522
3523    if (byteorder == -1) {
3524        /* force LE */
3525        ihi = 1;
3526        ilo = 0;
3527    }
3528    else if (byteorder == 1) {
3529        /* force BE */
3530        ihi = 0;
3531        ilo = 1;
3532    }
3533
3534    while (size-- > 0) {
3535        Py_UNICODE ch = *s++;
3536        Py_UNICODE ch2 = 0;
3537#ifdef Py_UNICODE_WIDE
3538        if (ch >= 0x10000) {
3539            ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3540            ch  = 0xD800 | ((ch-0x10000) >> 10);
3541        }
3542#endif
3543        STORECHAR(ch);
3544        if (ch2)
3545            STORECHAR(ch2);
3546    }
3547
3548  done:
3549    return v;
3550#undef STORECHAR
3551}
3552
3553PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3554{
3555    if (!PyUnicode_Check(unicode)) {
3556        PyErr_BadArgument();
3557        return NULL;
3558    }
3559    return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
3560                                 PyUnicode_GET_SIZE(unicode),
3561                                 NULL,
3562                                 0);
3563}
3564
3565/* --- Unicode Escape Codec ----------------------------------------------- */
3566
3567static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
3568
3569PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
3570                                        Py_ssize_t size,
3571                                        const char *errors)
3572{
3573    const char *starts = s;
3574    Py_ssize_t startinpos;
3575    Py_ssize_t endinpos;
3576    Py_ssize_t outpos;
3577    int i;
3578    PyUnicodeObject *v;
3579    Py_UNICODE *p;
3580    const char *end;
3581    char* message;
3582    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
3583    PyObject *errorHandler = NULL;
3584    PyObject *exc = NULL;
3585
3586    /* Escaped strings will always be longer than the resulting
3587       Unicode string, so we start with size here and then reduce the
3588       length after conversion to the true value.
3589       (but if the error callback returns a long replacement string
3590       we'll have to allocate more space) */
3591    v = _PyUnicode_New(size);
3592    if (v == NULL)
3593        goto onError;
3594    if (size == 0)
3595        return (PyObject *)v;
3596
3597    p = PyUnicode_AS_UNICODE(v);
3598    end = s + size;
3599
3600    while (s < end) {
3601        unsigned char c;
3602        Py_UNICODE x;
3603        int digits;
3604
3605        /* Non-escape characters are interpreted as Unicode ordinals */
3606        if (*s != '\\') {
3607            *p++ = (unsigned char) *s++;
3608            continue;
3609        }
3610
3611        startinpos = s-starts;
3612        /* \ - Escapes */
3613        s++;
3614        c = *s++;
3615        if (s > end)
3616            c = '\0'; /* Invalid after \ */
3617        switch (c) {
3618
3619            /* \x escapes */
3620        case '\n': break;
3621        case '\\': *p++ = '\\'; break;
3622        case '\'': *p++ = '\''; break;
3623        case '\"': *p++ = '\"'; break;
3624        case 'b': *p++ = '\b'; break;
3625        case 'f': *p++ = '\014'; break; /* FF */
3626        case 't': *p++ = '\t'; break;
3627        case 'n': *p++ = '\n'; break;
3628        case 'r': *p++ = '\r'; break;
3629        case 'v': *p++ = '\013'; break; /* VT */
3630        case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3631
3632            /* \OOO (octal) escapes */
3633        case '0': case '1': case '2': case '3':
3634        case '4': case '5': case '6': case '7':
3635            x = s[-1] - '0';
3636            if (s < end && '0' <= *s && *s <= '7') {
3637                x = (x<<3) + *s++ - '0';
3638                if (s < end && '0' <= *s && *s <= '7')
3639                    x = (x<<3) + *s++ - '0';
3640            }
3641            *p++ = x;
3642            break;
3643
3644            /* hex escapes */
3645            /* \xXX */
3646        case 'x':
3647            digits = 2;
3648            message = "truncated \\xXX escape";
3649            goto hexescape;
3650
3651            /* \uXXXX */
3652        case 'u':
3653            digits = 4;
3654            message = "truncated \\uXXXX escape";
3655            goto hexescape;
3656
3657            /* \UXXXXXXXX */
3658        case 'U':
3659            digits = 8;
3660            message = "truncated \\UXXXXXXXX escape";
3661        hexescape:
3662            chr = 0;
3663            outpos = p-PyUnicode_AS_UNICODE(v);
3664            if (s+digits>end) {
3665                endinpos = size;
3666                if (unicode_decode_call_errorhandler(
3667                        errors, &errorHandler,
3668                        "unicodeescape", "end of string in escape sequence",
3669                        &starts, &end, &startinpos, &endinpos, &exc, &s,
3670                        &v, &outpos, &p))
3671                    goto onError;
3672                goto nextByte;
3673            }
3674            for (i = 0; i < digits; ++i) {
3675                c = (unsigned char) s[i];
3676                if (!ISXDIGIT(c)) {
3677                    endinpos = (s+i+1)-starts;
3678                    if (unicode_decode_call_errorhandler(
3679                            errors, &errorHandler,
3680                            "unicodeescape", message,
3681                            &starts, &end, &startinpos, &endinpos, &exc, &s,
3682                            &v, &outpos, &p))
3683                        goto onError;
3684                    goto nextByte;
3685                }
3686                chr = (chr<<4) & ~0xF;
3687                if (c >= '0' && c <= '9')
3688                    chr += c - '0';
3689                else if (c >= 'a' && c <= 'f')
3690                    chr += 10 + c - 'a';
3691                else
3692                    chr += 10 + c - 'A';
3693            }
3694            s += i;
3695            if (chr == 0xffffffff && PyErr_Occurred())
3696                /* _decoding_error will have already written into the
3697                   target buffer. */
3698                break;
3699        store:
3700            /* when we get here, chr is a 32-bit unicode character */
3701            if (chr <= 0xffff)
3702                /* UCS-2 character */
3703                *p++ = (Py_UNICODE) chr;
3704            else if (chr <= 0x10ffff) {
3705                /* UCS-4 character. Either store directly, or as
3706                   surrogate pair. */
3707#ifdef Py_UNICODE_WIDE
3708                *p++ = chr;
3709#else
3710                chr -= 0x10000L;
3711                *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
3712                *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
3713#endif
3714            } else {
3715                endinpos = s-starts;
3716                outpos = p-PyUnicode_AS_UNICODE(v);
3717                if (unicode_decode_call_errorhandler(
3718                        errors, &errorHandler,
3719                        "unicodeescape", "illegal Unicode character",
3720                        &starts, &end, &startinpos, &endinpos, &exc, &s,
3721                        &v, &outpos, &p))
3722                    goto onError;
3723            }
3724            break;
3725
3726            /* \N{name} */
3727        case 'N':
3728            message = "malformed \\N character escape";
3729            if (ucnhash_CAPI == NULL) {
3730                /* load the unicode data module */
3731                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
3732                if (ucnhash_CAPI == NULL)
3733                    goto ucnhashError;
3734            }
3735            if (*s == '{') {
3736                const char *start = s+1;
3737                /* look for the closing brace */
3738                while (*s != '}' && s < end)
3739                    s++;
3740                if (s > start && s < end && *s == '}') {
3741                    /* found a name.  look it up in the unicode database */
3742                    message = "unknown Unicode character name";
3743                    s++;
3744                    if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
3745                        goto store;
3746                }
3747            }
3748            endinpos = s-starts;
3749            outpos = p-PyUnicode_AS_UNICODE(v);
3750            if (unicode_decode_call_errorhandler(
3751                    errors, &errorHandler,
3752                    "unicodeescape", message,
3753                    &starts, &end, &startinpos, &endinpos, &exc, &s,
3754                    &v, &outpos, &p))
3755                goto onError;
3756            break;
3757
3758        default:
3759            if (s > end) {
3760                message = "\\ at end of string";
3761                s--;
3762                endinpos = s-starts;
3763                outpos = p-PyUnicode_AS_UNICODE(v);
3764                if (unicode_decode_call_errorhandler(
3765                        errors, &errorHandler,
3766                        "unicodeescape", message,
3767                        &starts, &end, &startinpos, &endinpos, &exc, &s,
3768                        &v, &outpos, &p))
3769                    goto onError;
3770            }
3771            else {
3772                *p++ = '\\';
3773                *p++ = (unsigned char)s[-1];
3774            }
3775            break;
3776        }
3777      nextByte:
3778        ;
3779    }
3780    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3781        goto onError;
3782    Py_XDECREF(errorHandler);
3783    Py_XDECREF(exc);
3784    return (PyObject *)v;
3785
3786  ucnhashError:
3787    PyErr_SetString(
3788        PyExc_UnicodeError,
3789        "\\N escapes not supported (can't load unicodedata module)"
3790        );
3791    Py_XDECREF(v);
3792    Py_XDECREF(errorHandler);
3793    Py_XDECREF(exc);
3794    return NULL;
3795
3796  onError:
3797    Py_XDECREF(v);
3798    Py_XDECREF(errorHandler);
3799    Py_XDECREF(exc);
3800    return NULL;
3801}
3802
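/* Return a Unicode-Escape string version of the Unicode object; see
   PyUnicode_EncodeUnicodeEscape() below.

   NOTE: this comment mentions a "quotes" flag (if quotes is true, the
   string is enclosed in u"" or u'' quotes as appropriate), but the encoder
   below takes no such argument and never adds quotes.

*/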
3809
3810Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
3811                                             Py_ssize_t size,
3812                                             Py_UNICODE ch)
3813{
3814    /* like wcschr, but doesn't stop at NULL characters */
3815
3816    while (size-- > 0) {
3817        if (*s == ch)
3818            return s;
3819        s++;
3820    }
3821
3822    return NULL;
3823}
3824
/* Lower-case hexadecimal digits, indexed by nibble value (0..15).  Both the
   characters and the pointer are const: this is a fixed lookup table that
   must never be re-pointed. */
static const char * const hexdigits = "0123456789abcdef";
3826
3827PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3828                                        Py_ssize_t size)
3829{
3830    PyObject *repr;
3831    char *p;
3832
3833#ifdef Py_UNICODE_WIDE
3834    const Py_ssize_t expandsize = 10;
3835#else
3836    const Py_ssize_t expandsize = 6;
3837#endif
3838
3839    /* XXX(nnorwitz): rather than over-allocating, it would be
3840       better to choose a different scheme.  Perhaps scan the
3841       first N-chars of the string and allocate based on that size.
3842    */
3843    /* Initial allocation is based on the longest-possible unichr
3844       escape.
3845
3846       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3847       unichr, so in this case it's the longest unichr escape. In
3848       narrow (UTF-16) builds this is five chars per source unichr
3849       since there are two unichrs in the surrogate pair, so in narrow
3850       (UTF-16) builds it's not the longest unichr escape.
3851
3852       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3853       so in the narrow (UTF-16) build case it's the longest unichr
3854       escape.
3855    */
3856
3857    if (size == 0)
3858        return PyBytes_FromStringAndSize(NULL, 0);
3859
3860    if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
3861        return PyErr_NoMemory();
3862
3863    repr = PyBytes_FromStringAndSize(NULL,
3864                                     2
3865                                     + expandsize*size
3866                                     + 1);
3867    if (repr == NULL)
3868        return NULL;
3869
3870    p = PyBytes_AS_STRING(repr);
3871
3872    while (size-- > 0) {
3873        Py_UNICODE ch = *s++;
3874
3875        /* Escape backslashes */
3876        if (ch == '\\') {
3877            *p++ = '\\';
3878            *p++ = (char) ch;
3879            continue;
3880        }
3881
3882#ifdef Py_UNICODE_WIDE
3883        /* Map 21-bit characters to '\U00xxxxxx' */
3884        else if (ch >= 0x10000) {
3885            *p++ = '\\';
3886            *p++ = 'U';
3887            *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3888            *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3889            *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3890            *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3891            *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3892            *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3893            *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3894            *p++ = hexdigits[ch & 0x0000000F];
3895            continue;
3896        }
3897#else
3898        /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3899        else if (ch >= 0xD800 && ch < 0xDC00) {
3900            Py_UNICODE ch2;
3901            Py_UCS4 ucs;
3902
3903            ch2 = *s++;
3904            size--;
3905            if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3906                ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3907                *p++ = '\\';
3908                *p++ = 'U';
3909                *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3910                *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3911                *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3912                *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3913                *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3914                *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3915                *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3916                *p++ = hexdigits[ucs & 0x0000000F];
3917                continue;
3918            }
3919            /* Fall through: isolated surrogates are copied as-is */
3920            s--;
3921            size++;
3922        }
3923#endif
3924
3925        /* Map 16-bit characters to '\uxxxx' */
3926        if (ch >= 256) {
3927            *p++ = '\\';
3928            *p++ = 'u';
3929            *p++ = hexdigits[(ch >> 12) & 0x000F];
3930            *p++ = hexdigits[(ch >> 8) & 0x000F];
3931            *p++ = hexdigits[(ch >> 4) & 0x000F];
3932            *p++ = hexdigits[ch & 0x000F];
3933        }
3934
3935        /* Map special whitespace to '\t', \n', '\r' */
3936        else if (ch == '\t') {
3937            *p++ = '\\';
3938            *p++ = 't';
3939        }
3940        else if (ch == '\n') {
3941            *p++ = '\\';
3942            *p++ = 'n';
3943        }
3944        else if (ch == '\r') {
3945            *p++ = '\\';
3946            *p++ = 'r';
3947        }
3948
3949        /* Map non-printable US ASCII to '\xhh' */
3950        else if (ch < ' ' || ch >= 0x7F) {
3951            *p++ = '\\';
3952            *p++ = 'x';
3953            *p++ = hexdigits[(ch >> 4) & 0x000F];
3954            *p++ = hexdigits[ch & 0x000F];
3955        }
3956
3957        /* Copy everything else as-is */
3958        else
3959            *p++ = (char) ch;
3960    }
3961
3962    assert(p - PyBytes_AS_STRING(repr) > 0);
3963    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3964        return NULL;
3965    return repr;
3966}
3967
3968PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3969{
3970    PyObject *s;
3971    if (!PyUnicode_Check(unicode)) {
3972        PyErr_BadArgument();
3973        return NULL;
3974    }
3975    s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3976                                      PyUnicode_GET_SIZE(unicode));
3977    return s;
3978}
3979
3980/* --- Raw Unicode Escape Codec ------------------------------------------- */
3981
3982PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
3983                                           Py_ssize_t size,
3984                                           const char *errors)
3985{
3986    const char *starts = s;
3987    Py_ssize_t startinpos;
3988    Py_ssize_t endinpos;
3989    Py_ssize_t outpos;
3990    PyUnicodeObject *v;
3991    Py_UNICODE *p;
3992    const char *end;
3993    const char *bs;
3994    PyObject *errorHandler = NULL;
3995    PyObject *exc = NULL;
3996
3997    /* Escaped strings will always be longer than the resulting
3998       Unicode string, so we start with size here and then reduce the
3999       length after conversion to the true value. (But decoding error
4000       handler might have to resize the string) */
4001    v = _PyUnicode_New(size);
4002    if (v == NULL)
4003        goto onError;
4004    if (size == 0)
4005        return (PyObject *)v;
4006    p = PyUnicode_AS_UNICODE(v);
4007    end = s + size;
4008    while (s < end) {
4009        unsigned char c;
4010        Py_UCS4 x;
4011        int i;
4012        int count;
4013
4014        /* Non-escape characters are interpreted as Unicode ordinals */
4015        if (*s != '\\') {
4016            *p++ = (unsigned char)*s++;
4017            continue;
4018        }
4019        startinpos = s-starts;
4020
4021        /* \u-escapes are only interpreted iff the number of leading
4022           backslashes if odd */
4023        bs = s;
4024        for (;s < end;) {
4025            if (*s != '\\')
4026                break;
4027            *p++ = (unsigned char)*s++;
4028        }
4029        if (((s - bs) & 1) == 0 ||
4030            s >= end ||
4031            (*s != 'u' && *s != 'U')) {
4032            continue;
4033        }
4034        p--;
4035        count = *s=='u' ? 4 : 8;
4036        s++;
4037
4038        /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
4039        outpos = p-PyUnicode_AS_UNICODE(v);
4040        for (x = 0, i = 0; i < count; ++i, ++s) {
4041            c = (unsigned char)*s;
4042            if (!ISXDIGIT(c)) {
4043                endinpos = s-starts;
4044                if (unicode_decode_call_errorhandler(
4045                        errors, &errorHandler,
4046                        "rawunicodeescape", "truncated \\uXXXX",
4047                        &starts, &end, &startinpos, &endinpos, &exc, &s,
4048                        &v, &outpos, &p))
4049                    goto onError;
4050                goto nextByte;
4051            }
4052            x = (x<<4) & ~0xF;
4053            if (c >= '0' && c <= '9')
4054                x += c - '0';
4055            else if (c >= 'a' && c <= 'f')
4056                x += 10 + c - 'a';
4057            else
4058                x += 10 + c - 'A';
4059        }
4060        if (x <= 0xffff)
4061            /* UCS-2 character */
4062            *p++ = (Py_UNICODE) x;
4063        else if (x <= 0x10ffff) {
4064            /* UCS-4 character. Either store directly, or as
4065               surrogate pair. */
4066#ifdef Py_UNICODE_WIDE
4067            *p++ = (Py_UNICODE) x;
4068#else
4069            x -= 0x10000L;
4070            *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
4071            *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
4072#endif
4073        } else {
4074            endinpos = s-starts;
4075            outpos = p-PyUnicode_AS_UNICODE(v);
4076            if (unicode_decode_call_errorhandler(
4077                    errors, &errorHandler,
4078                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
4079                    &starts, &end, &startinpos, &endinpos, &exc, &s,
4080                    &v, &outpos, &p))
4081                goto onError;
4082        }
4083      nextByte:
4084        ;
4085    }
4086    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4087        goto onError;
4088    Py_XDECREF(errorHandler);
4089    Py_XDECREF(exc);
4090    return (PyObject *)v;
4091
4092  onError:
4093    Py_XDECREF(v);
4094    Py_XDECREF(errorHandler);
4095    Py_XDECREF(exc);
4096    return NULL;
4097}
4098
4099PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
4100                                           Py_ssize_t size)
4101{
4102    PyObject *repr;
4103    char *p;
4104    char *q;
4105
4106#ifdef Py_UNICODE_WIDE
4107    const Py_ssize_t expandsize = 10;
4108#else
4109    const Py_ssize_t expandsize = 6;
4110#endif
4111
4112    if (size > PY_SSIZE_T_MAX / expandsize)
4113        return PyErr_NoMemory();
4114
4115    repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
4116    if (repr == NULL)
4117        return NULL;
4118    if (size == 0)
4119        return repr;
4120
4121    p = q = PyBytes_AS_STRING(repr);
4122    while (size-- > 0) {
4123        Py_UNICODE ch = *s++;
4124#ifdef Py_UNICODE_WIDE
4125        /* Map 32-bit characters to '\Uxxxxxxxx' */
4126        if (ch >= 0x10000) {
4127            *p++ = '\\';
4128            *p++ = 'U';
4129            *p++ = hexdigits[(ch >> 28) & 0xf];
4130            *p++ = hexdigits[(ch >> 24) & 0xf];
4131            *p++ = hexdigits[(ch >> 20) & 0xf];
4132            *p++ = hexdigits[(ch >> 16) & 0xf];
4133            *p++ = hexdigits[(ch >> 12) & 0xf];
4134            *p++ = hexdigits[(ch >> 8) & 0xf];
4135            *p++ = hexdigits[(ch >> 4) & 0xf];
4136            *p++ = hexdigits[ch & 15];
4137        }
4138        else
4139#else
4140            /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4141            if (ch >= 0xD800 && ch < 0xDC00) {
4142                Py_UNICODE ch2;
4143                Py_UCS4 ucs;
4144
4145                ch2 = *s++;
4146                size--;
4147                if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
4148                    ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4149                    *p++ = '\\';
4150                    *p++ = 'U';
4151                    *p++ = hexdigits[(ucs >> 28) & 0xf];
4152                    *p++ = hexdigits[(ucs >> 24) & 0xf];
4153                    *p++ = hexdigits[(ucs >> 20) & 0xf];
4154                    *p++ = hexdigits[(ucs >> 16) & 0xf];
4155                    *p++ = hexdigits[(ucs >> 12) & 0xf];
4156                    *p++ = hexdigits[(ucs >> 8) & 0xf];
4157                    *p++ = hexdigits[(ucs >> 4) & 0xf];
4158                    *p++ = hexdigits[ucs & 0xf];
4159                    continue;
4160                }
4161                /* Fall through: isolated surrogates are copied as-is */
4162                s--;
4163                size++;
4164            }
4165#endif
4166        /* Map 16-bit characters to '\uxxxx' */
4167        if (ch >= 256) {
4168            *p++ = '\\';
4169            *p++ = 'u';
4170            *p++ = hexdigits[(ch >> 12) & 0xf];
4171            *p++ = hexdigits[(ch >> 8) & 0xf];
4172            *p++ = hexdigits[(ch >> 4) & 0xf];
4173            *p++ = hexdigits[ch & 15];
4174        }
4175        /* Copy everything else as-is */
4176        else
4177            *p++ = (char) ch;
4178    }
4179    size = p - q;
4180
4181    assert(size > 0);
4182    if (_PyBytes_Resize(&repr, size) < 0)
4183        return NULL;
4184    return repr;
4185}
4186
4187PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
4188{
4189    PyObject *s;
4190    if (!PyUnicode_Check(unicode)) {
4191        PyErr_BadArgument();
4192        return NULL;
4193    }
4194    s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4195                                         PyUnicode_GET_SIZE(unicode));
4196
4197    return s;
4198}
4199
4200/* --- Unicode Internal Codec ------------------------------------------- */
4201
4202PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
4203                                           Py_ssize_t size,
4204                                           const char *errors)
4205{
4206    const char *starts = s;
4207    Py_ssize_t startinpos;
4208    Py_ssize_t endinpos;
4209    Py_ssize_t outpos;
4210    PyUnicodeObject *v;
4211    Py_UNICODE *p;
4212    const char *end;
4213    const char *reason;
4214    PyObject *errorHandler = NULL;
4215    PyObject *exc = NULL;
4216
4217#ifdef Py_UNICODE_WIDE
4218    Py_UNICODE unimax = PyUnicode_GetMax();
4219#endif
4220
4221    /* XXX overflow detection missing */
4222    v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4223    if (v == NULL)
4224        goto onError;
4225    if (PyUnicode_GetSize((PyObject *)v) == 0)
4226        return (PyObject *)v;
4227    p = PyUnicode_AS_UNICODE(v);
4228    end = s + size;
4229
4230    while (s < end) {
4231        memcpy(p, s, sizeof(Py_UNICODE));
4232        /* We have to sanity check the raw data, otherwise doom looms for
4233           some malformed UCS-4 data. */
4234        if (
4235#ifdef Py_UNICODE_WIDE
4236            *p > unimax || *p < 0 ||
4237#endif
4238            end-s < Py_UNICODE_SIZE
4239            )
4240        {
4241            startinpos = s - starts;
4242            if (end-s < Py_UNICODE_SIZE) {
4243                endinpos = end-starts;
4244                reason = "truncated input";
4245            }
4246            else {
4247                endinpos = s - starts + Py_UNICODE_SIZE;
4248                reason = "illegal code point (> 0x10FFFF)";
4249            }
4250            outpos = p - PyUnicode_AS_UNICODE(v);
4251            if (unicode_decode_call_errorhandler(
4252                    errors, &errorHandler,
4253                    "unicode_internal", reason,
4254                    &starts, &end, &startinpos, &endinpos, &exc, &s,
4255                    &v, &outpos, &p)) {
4256                goto onError;
4257            }
4258        }
4259        else {
4260            p++;
4261            s += Py_UNICODE_SIZE;
4262        }
4263    }
4264
4265    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4266        goto onError;
4267    Py_XDECREF(errorHandler);
4268    Py_XDECREF(exc);
4269    return (PyObject *)v;
4270
4271  onError:
4272    Py_XDECREF(v);
4273    Py_XDECREF(errorHandler);
4274    Py_XDECREF(exc);
4275    return NULL;
4276}
4277
4278/* --- Latin-1 Codec ------------------------------------------------------ */
4279
4280PyObject *PyUnicode_DecodeLatin1(const char *s,
4281                                 Py_ssize_t size,
4282                                 const char *errors)
4283{
4284    PyUnicodeObject *v;
4285    Py_UNICODE *p;
4286    const char *e, *unrolled_end;
4287
4288    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
4289    if (size == 1) {
4290        Py_UNICODE r = *(unsigned char*)s;
4291        return PyUnicode_FromUnicode(&r, 1);
4292    }
4293
4294    v = _PyUnicode_New(size);
4295    if (v == NULL)
4296        goto onError;
4297    if (size == 0)
4298        return (PyObject *)v;
4299    p = PyUnicode_AS_UNICODE(v);
4300    e = s + size;
4301    /* Unrolling the copy makes it much faster by reducing the looping
4302       overhead. This is similar to what many memcpy() implementations do. */
4303    unrolled_end = e - 4;
4304    while (s < unrolled_end) {
4305        p[0] = (unsigned char) s[0];
4306        p[1] = (unsigned char) s[1];
4307        p[2] = (unsigned char) s[2];
4308        p[3] = (unsigned char) s[3];
4309        s += 4;
4310        p += 4;
4311    }
4312    while (s < e)
4313        *p++ = (unsigned char) *s++;
4314    return (PyObject *)v;
4315
4316  onError:
4317    Py_XDECREF(v);
4318    return NULL;
4319}
4320
4321/* create or adjust a UnicodeEncodeError */
4322static void make_encode_exception(PyObject **exceptionObject,
4323                                  const char *encoding,
4324                                  const Py_UNICODE *unicode, Py_ssize_t size,
4325                                  Py_ssize_t startpos, Py_ssize_t endpos,
4326                                  const char *reason)
4327{
4328    if (*exceptionObject == NULL) {
4329        *exceptionObject = PyUnicodeEncodeError_Create(
4330            encoding, unicode, size, startpos, endpos, reason);
4331    }
4332    else {
4333        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4334            goto onError;
4335        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4336            goto onError;
4337        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4338            goto onError;
4339        return;
4340      onError:
4341        Py_DECREF(*exceptionObject);
4342        *exceptionObject = NULL;
4343    }
4344}
4345
4346/* raises a UnicodeEncodeError */
4347static void raise_encode_exception(PyObject **exceptionObject,
4348                                   const char *encoding,
4349                                   const Py_UNICODE *unicode, Py_ssize_t size,
4350                                   Py_ssize_t startpos, Py_ssize_t endpos,
4351                                   const char *reason)
4352{
4353    make_encode_exception(exceptionObject,
4354                          encoding, unicode, size, startpos, endpos, reason);
4355    if (*exceptionObject != NULL)
4356        PyCodec_StrictErrors(*exceptionObject);
4357}
4358
4359/* error handling callback helper:
4360   build arguments, call the callback and check the arguments,
4361   put the result into newpos and return the replacement string, which
4362   has to be freed by the caller */
4363static PyObject *unicode_encode_call_errorhandler(const char *errors,
4364                                                  PyObject **errorHandler,
4365                                                  const char *encoding, const char *reason,
4366                                                  const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4367                                                  Py_ssize_t startpos, Py_ssize_t endpos,
4368                                                  Py_ssize_t *newpos)
4369{
4370    static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
4371
4372    PyObject *restuple;
4373    PyObject *resunicode;
4374
4375    if (*errorHandler == NULL) {
4376        *errorHandler = PyCodec_LookupError(errors);
4377        if (*errorHandler == NULL)
4378            return NULL;
4379    }
4380
4381    make_encode_exception(exceptionObject,
4382                          encoding, unicode, size, startpos, endpos, reason);
4383    if (*exceptionObject == NULL)
4384        return NULL;
4385
4386    restuple = PyObject_CallFunctionObjArgs(
4387        *errorHandler, *exceptionObject, NULL);
4388    if (restuple == NULL)
4389        return NULL;
4390    if (!PyTuple_Check(restuple)) {
4391        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4392        Py_DECREF(restuple);
4393        return NULL;
4394    }
4395    if (!PyArg_ParseTuple(restuple, argparse,
4396                          &resunicode, newpos)) {
4397        Py_DECREF(restuple);
4398        return NULL;
4399    }
4400    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4401        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4402        Py_DECREF(restuple);
4403        return NULL;
4404    }
4405    if (*newpos<0)
4406        *newpos = size+*newpos;
4407    if (*newpos<0 || *newpos>size) {
4408        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4409        Py_DECREF(restuple);
4410        return NULL;
4411    }
4412    Py_INCREF(resunicode);
4413    Py_DECREF(restuple);
4414    return resunicode;
4415}
4416
4417static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
4418                                     Py_ssize_t size,
4419                                     const char *errors,
4420                                     int limit)
4421{
4422    /* output object */
4423    PyObject *res;
4424    /* pointers to the beginning and end+1 of input */
4425    const Py_UNICODE *startp = p;
4426    const Py_UNICODE *endp = p + size;
4427    /* pointer to the beginning of the unencodable characters */
4428    /* const Py_UNICODE *badp = NULL; */
4429    /* pointer into the output */
4430    char *str;
4431    /* current output position */
4432    Py_ssize_t ressize;
4433    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4434    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
4435    PyObject *errorHandler = NULL;
4436    PyObject *exc = NULL;
4437    /* the following variable is used for caching string comparisons
4438     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4439    int known_errorHandler = -1;
4440
4441    /* allocate enough for a simple encoding without
4442       replacements, if we need more, we'll resize */
4443    if (size == 0)
4444        return PyBytes_FromStringAndSize(NULL, 0);
4445    res = PyBytes_FromStringAndSize(NULL, size);
4446    if (res == NULL)
4447        return NULL;
4448    str = PyBytes_AS_STRING(res);
4449    ressize = size;
4450
4451    while (p<endp) {
4452        Py_UNICODE c = *p;
4453
4454        /* can we encode this? */
4455        if (c<limit) {
4456            /* no overflow check, because we know that the space is enough */
4457            *str++ = (char)c;
4458            ++p;
4459        }
4460        else {
4461            Py_ssize_t unicodepos = p-startp;
4462            Py_ssize_t requiredsize;
4463            PyObject *repunicode;
4464            Py_ssize_t repsize;
4465            Py_ssize_t newpos;
4466            Py_ssize_t respos;
4467            Py_UNICODE *uni2;
4468            /* startpos for collecting unencodable chars */
4469            const Py_UNICODE *collstart = p;
4470            const Py_UNICODE *collend = p;
4471            /* find all unecodable characters */
4472            while ((collend < endp) && ((*collend)>=limit))
4473                ++collend;
4474            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4475            if (known_errorHandler==-1) {
4476                if ((errors==NULL) || (!strcmp(errors, "strict")))
4477                    known_errorHandler = 1;
4478                else if (!strcmp(errors, "replace"))
4479                    known_errorHandler = 2;
4480                else if (!strcmp(errors, "ignore"))
4481                    known_errorHandler = 3;
4482                else if (!strcmp(errors, "xmlcharrefreplace"))
4483                    known_errorHandler = 4;
4484                else
4485                    known_errorHandler = 0;
4486            }
4487            switch (known_errorHandler) {
4488            case 1: /* strict */
4489                raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4490                goto onError;
4491            case 2: /* replace */
4492                while (collstart++<collend)
4493                    *str++ = '?'; /* fall through */
4494            case 3: /* ignore */
4495                p = collend;
4496                break;
4497            case 4: /* xmlcharrefreplace */
4498                respos = str - PyBytes_AS_STRING(res);
4499                /* determine replacement size (temporarily (mis)uses p) */
4500                for (p = collstart, repsize = 0; p < collend; ++p) {
4501                    if (*p<10)
4502                        repsize += 2+1+1;
4503                    else if (*p<100)
4504                        repsize += 2+2+1;
4505                    else if (*p<1000)
4506                        repsize += 2+3+1;
4507                    else if (*p<10000)
4508                        repsize += 2+4+1;
4509#ifndef Py_UNICODE_WIDE
4510                    else
4511                        repsize += 2+5+1;
4512#else
4513                    else if (*p<100000)
4514                        repsize += 2+5+1;
4515                    else if (*p<1000000)
4516                        repsize += 2+6+1;
4517                    else
4518                        repsize += 2+7+1;
4519#endif
4520                }
4521                requiredsize = respos+repsize+(endp-collend);
4522                if (requiredsize > ressize) {
4523                    if (requiredsize<2*ressize)
4524                        requiredsize = 2*ressize;
4525                    if (_PyBytes_Resize(&res, requiredsize))
4526                        goto onError;
4527                    str = PyBytes_AS_STRING(res) + respos;
4528                    ressize = requiredsize;
4529                }
4530                /* generate replacement (temporarily (mis)uses p) */
4531                for (p = collstart; p < collend; ++p) {
4532                    str += sprintf(str, "&#%d;", (int)*p);
4533                }
4534                p = collend;
4535                break;
4536            default:
4537                repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4538                                                              encoding, reason, startp, size, &exc,
4539                                                              collstart-startp, collend-startp, &newpos);
4540                if (repunicode == NULL)
4541                    goto onError;
4542                if (PyBytes_Check(repunicode)) {
4543                    /* Directly copy bytes result to output. */
4544                    repsize = PyBytes_Size(repunicode);
4545                    if (repsize > 1) {
4546                        /* Make room for all additional bytes. */
4547                        respos = str - PyBytes_AS_STRING(res);
4548                        if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4549                            Py_DECREF(repunicode);
4550                            goto onError;
4551                        }
4552                        str = PyBytes_AS_STRING(res) + respos;
4553                        ressize += repsize-1;
4554                    }
4555                    memcpy(str, PyBytes_AsString(repunicode), repsize);
4556                    str += repsize;
4557                    p = startp + newpos;
4558                    Py_DECREF(repunicode);
4559                    break;
4560                }
4561                /* need more space? (at least enough for what we
4562                   have+the replacement+the rest of the string, so
4563                   we won't have to check space for encodable characters) */
4564                respos = str - PyBytes_AS_STRING(res);
4565                repsize = PyUnicode_GET_SIZE(repunicode);
4566                requiredsize = respos+repsize+(endp-collend);
4567                if (requiredsize > ressize) {
4568                    if (requiredsize<2*ressize)
4569                        requiredsize = 2*ressize;
4570                    if (_PyBytes_Resize(&res, requiredsize)) {
4571                        Py_DECREF(repunicode);
4572                        goto onError;
4573                    }
4574                    str = PyBytes_AS_STRING(res) + respos;
4575                    ressize = requiredsize;
4576                }
4577                /* check if there is anything unencodable in the replacement
4578                   and copy it to the output */
4579                for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4580                    c = *uni2;
4581                    if (c >= limit) {
4582                        raise_encode_exception(&exc, encoding, startp, size,
4583                                               unicodepos, unicodepos+1, reason);
4584                        Py_DECREF(repunicode);
4585                        goto onError;
4586                    }
4587                    *str = (char)c;
4588                }
4589                p = startp + newpos;
4590                Py_DECREF(repunicode);
4591            }
4592        }
4593    }
    /* Resize if we allocated too much */
4595    size = str - PyBytes_AS_STRING(res);
4596    if (size < ressize) { /* If this falls res will be NULL */
4597        assert(size >= 0);
4598        if (_PyBytes_Resize(&res, size) < 0)
4599            goto onError;
4600    }
4601
4602    Py_XDECREF(errorHandler);
4603    Py_XDECREF(exc);
4604    return res;
4605
4606  onError:
4607    Py_XDECREF(res);
4608    Py_XDECREF(errorHandler);
4609    Py_XDECREF(exc);
4610    return NULL;
4611}
4612
4613PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
4614                                 Py_ssize_t size,
4615                                 const char *errors)
4616{
4617    return unicode_encode_ucs1(p, size, errors, 256);
4618}
4619
4620PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4621{
4622    if (!PyUnicode_Check(unicode)) {
4623        PyErr_BadArgument();
4624        return NULL;
4625    }
4626    return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
4627                                  PyUnicode_GET_SIZE(unicode),
4628                                  NULL);
4629}
4630
4631/* --- 7-bit ASCII Codec -------------------------------------------------- */
4632
4633PyObject *PyUnicode_DecodeASCII(const char *s,
4634                                Py_ssize_t size,
4635                                const char *errors)
4636{
4637    const char *starts = s;
4638    PyUnicodeObject *v;
4639    Py_UNICODE *p;
4640    Py_ssize_t startinpos;
4641    Py_ssize_t endinpos;
4642    Py_ssize_t outpos;
4643    const char *e;
4644    PyObject *errorHandler = NULL;
4645    PyObject *exc = NULL;
4646
4647    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4648    if (size == 1 && *(unsigned char*)s < 128) {
4649        Py_UNICODE r = *(unsigned char*)s;
4650        return PyUnicode_FromUnicode(&r, 1);
4651    }
4652
4653    v = _PyUnicode_New(size);
4654    if (v == NULL)
4655        goto onError;
4656    if (size == 0)
4657        return (PyObject *)v;
4658    p = PyUnicode_AS_UNICODE(v);
4659    e = s + size;
4660    while (s < e) {
4661        register unsigned char c = (unsigned char)*s;
4662        if (c < 128) {
4663            *p++ = c;
4664            ++s;
4665        }
4666        else {
4667            startinpos = s-starts;
4668            endinpos = startinpos + 1;
4669            outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4670            if (unicode_decode_call_errorhandler(
4671                    errors, &errorHandler,
4672                    "ascii", "ordinal not in range(128)",
4673                    &starts, &e, &startinpos, &endinpos, &exc, &s,
4674                    &v, &outpos, &p))
4675                goto onError;
4676        }
4677    }
4678    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
4679        if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4680            goto onError;
4681    Py_XDECREF(errorHandler);
4682    Py_XDECREF(exc);
4683    return (PyObject *)v;
4684
4685  onError:
4686    Py_XDECREF(v);
4687    Py_XDECREF(errorHandler);
4688    Py_XDECREF(exc);
4689    return NULL;
4690}
4691
4692PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
4693                                Py_ssize_t size,
4694                                const char *errors)
4695{
4696    return unicode_encode_ucs1(p, size, errors, 128);
4697}
4698
4699PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4700{
4701    if (!PyUnicode_Check(unicode)) {
4702        PyErr_BadArgument();
4703        return NULL;
4704    }
4705    return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
4706                                 PyUnicode_GET_SIZE(unicode),
4707                                 NULL);
4708}
4709
4710#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
4711
4712/* --- MBCS codecs for Windows -------------------------------------------- */
4713
4714#if SIZEOF_INT < SIZEOF_SIZE_T
4715#define NEED_RETRY
4716#endif
4717
4718/* XXX This code is limited to "true" double-byte encodings, as
4719   a) it assumes an incomplete character consists of a single byte, and
4720   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
4721   encodings, see IsDBCSLeadByteEx documentation. */
4722
/* Return 1 if the byte at s[offset] is a DBCS lead byte, 0 otherwise.
 *
 * IsDBCSLeadByte() alone is not enough, because the byte could be the
 * second half of a character.  So the previous character position is
 * fetched with CharPrev() and the byte is accepted only if there is no
 * previous character, the previous byte is not itself a lead byte, or
 * the previous character is two bytes long.
 */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return (curr - prev == 2);
}
4733
4734/*
4735 * Decode MBCS string into unicode object. If 'final' is set, converts
4736 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4737 */
4738static int decode_mbcs(PyUnicodeObject **v,
4739                       const char *s, /* MBCS string */
4740                       int size, /* sizeof MBCS string */
4741                       int final,
4742                       const char *errors)
4743{
4744    Py_UNICODE *p;
4745    Py_ssize_t n;
4746    DWORD usize;
4747    DWORD flags;
4748
4749    assert(size >= 0);
4750
4751    /* check and handle 'errors' arg */
4752    if (errors==NULL || strcmp(errors, "strict")==0)
4753        flags = MB_ERR_INVALID_CHARS;
4754    else if (strcmp(errors, "ignore")==0)
4755        flags = 0;
4756    else {
4757        PyErr_Format(PyExc_ValueError,
4758                     "mbcs encoding does not support errors='%s'",
4759                     errors);
4760        return -1;
4761    }
4762
4763    /* Skip trailing lead-byte unless 'final' is set */
4764    if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
4765        --size;
4766
4767    /* First get the size of the result */
4768    if (size > 0) {
4769        usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
4770        if (usize==0)
4771            goto mbcs_decode_error;
4772    } else
4773        usize = 0;
4774
4775    if (*v == NULL) {
4776        /* Create unicode object */
4777        *v = _PyUnicode_New(usize);
4778        if (*v == NULL)
4779            return -1;
4780        n = 0;
4781    }
4782    else {
4783        /* Extend unicode object */
4784        n = PyUnicode_GET_SIZE(*v);
4785        if (_PyUnicode_Resize(v, n + usize) < 0)
4786            return -1;
4787    }
4788
4789    /* Do the conversion */
4790    if (usize > 0) {
4791        p = PyUnicode_AS_UNICODE(*v) + n;
4792        if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
4793            goto mbcs_decode_error;
4794        }
4795    }
4796    return size;
4797
4798mbcs_decode_error:
4799    /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
4800       we raise a UnicodeDecodeError - else it is a 'generic'
4801       windows error
4802     */
4803    if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
4804        /* Ideally, we should get reason from FormatMessage - this
4805           is the Windows 2000 English version of the message
4806        */
4807        PyObject *exc = NULL;
4808        const char *reason = "No mapping for the Unicode character exists "
4809                             "in the target multi-byte code page.";
4810        make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
4811        if (exc != NULL) {
4812            PyCodec_StrictErrors(exc);
4813            Py_DECREF(exc);
4814        }
4815    } else {
4816        PyErr_SetFromWindowsErrWithFilename(0, NULL);
4817    }
4818    return -1;
4819}
4820
4821PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
4822                                       Py_ssize_t size,
4823                                       const char *errors,
4824                                       Py_ssize_t *consumed)
4825{
4826    PyUnicodeObject *v = NULL;
4827    int done;
4828
4829    if (consumed)
4830        *consumed = 0;
4831
4832#ifdef NEED_RETRY
4833  retry:
4834    if (size > INT_MAX)
4835        done = decode_mbcs(&v, s, INT_MAX, 0, errors);
4836    else
4837#endif
4838        done = decode_mbcs(&v, s, (int)size, !consumed, errors);
4839
4840    if (done < 0) {
4841        Py_XDECREF(v);
4842        return NULL;
4843    }
4844
4845    if (consumed)
4846        *consumed += done;
4847
4848#ifdef NEED_RETRY
4849    if (size > INT_MAX) {
4850        s += done;
4851        size -= done;
4852        goto retry;
4853    }
4854#endif
4855
4856    return (PyObject *)v;
4857}
4858
4859PyObject *PyUnicode_DecodeMBCS(const char *s,
4860                               Py_ssize_t size,
4861                               const char *errors)
4862{
4863    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4864}
4865
4866/*
4867 * Convert unicode into string object (MBCS).
4868 * Returns 0 if succeed, -1 otherwise.
4869 */
4870static int encode_mbcs(PyObject **repr,
4871                       const Py_UNICODE *p, /* unicode */
4872                       int size, /* size of unicode */
4873                       const char* errors)
4874{
4875    BOOL usedDefaultChar = FALSE;
4876    BOOL *pusedDefaultChar;
4877    int mbcssize;
4878    Py_ssize_t n;
4879    PyObject *exc = NULL;
4880    DWORD flags;
4881
4882    assert(size >= 0);
4883
4884    /* check and handle 'errors' arg */
4885    if (errors==NULL || strcmp(errors, "strict")==0) {
4886        flags = WC_NO_BEST_FIT_CHARS;
4887        pusedDefaultChar = &usedDefaultChar;
4888    } else if (strcmp(errors, "replace")==0) {
4889        flags = 0;
4890        pusedDefaultChar = NULL;
4891    } else {
4892         PyErr_Format(PyExc_ValueError,
4893                      "mbcs encoding does not support errors='%s'",
4894                      errors);
4895         return -1;
4896    }
4897
4898    /* First get the size of the result */
4899    if (size > 0) {
4900        mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
4901                                       NULL, pusedDefaultChar);
4902        if (mbcssize == 0) {
4903            PyErr_SetFromWindowsErrWithFilename(0, NULL);
4904            return -1;
4905        }
4906        /* If we used a default char, then we failed! */
4907        if (pusedDefaultChar && *pusedDefaultChar)
4908            goto mbcs_encode_error;
4909    } else {
4910        mbcssize = 0;
4911    }
4912
4913    if (*repr == NULL) {
4914        /* Create string object */
4915        *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4916        if (*repr == NULL)
4917            return -1;
4918        n = 0;
4919    }
4920    else {
4921        /* Extend string object */
4922        n = PyBytes_Size(*repr);
4923        if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4924            return -1;
4925    }
4926
4927    /* Do the conversion */
4928    if (size > 0) {
4929        char *s = PyBytes_AS_STRING(*repr) + n;
4930        if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
4931                                     NULL, pusedDefaultChar)) {
4932            PyErr_SetFromWindowsErrWithFilename(0, NULL);
4933            return -1;
4934        }
4935        if (pusedDefaultChar && *pusedDefaultChar)
4936            goto mbcs_encode_error;
4937    }
4938    return 0;
4939
4940mbcs_encode_error:
4941    raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
4942    Py_XDECREF(exc);
4943    return -1;
4944}
4945
4946PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
4947                               Py_ssize_t size,
4948                               const char *errors)
4949{
4950    PyObject *repr = NULL;
4951    int ret;
4952
4953#ifdef NEED_RETRY
4954  retry:
4955    if (size > INT_MAX)
4956        ret = encode_mbcs(&repr, p, INT_MAX, errors);
4957    else
4958#endif
4959        ret = encode_mbcs(&repr, p, (int)size, errors);
4960
4961    if (ret < 0) {
4962        Py_XDECREF(repr);
4963        return NULL;
4964    }
4965
4966#ifdef NEED_RETRY
4967    if (size > INT_MAX) {
4968        p += INT_MAX;
4969        size -= INT_MAX;
4970        goto retry;
4971    }
4972#endif
4973
4974    return repr;
4975}
4976
4977PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4978{
4979    if (!PyUnicode_Check(unicode)) {
4980        PyErr_BadArgument();
4981        return NULL;
4982    }
4983    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
4984                                PyUnicode_GET_SIZE(unicode),
4985                                NULL);
4986}
4987
4988#undef NEED_RETRY
4989
4990#endif /* MS_WINDOWS */
4991
4992/* --- Character Mapping Codec -------------------------------------------- */
4993
4994PyObject *PyUnicode_DecodeCharmap(const char *s,
4995                                  Py_ssize_t size,
4996                                  PyObject *mapping,
4997                                  const char *errors)
4998{
4999    const char *starts = s;
5000    Py_ssize_t startinpos;
5001    Py_ssize_t endinpos;
5002    Py_ssize_t outpos;
5003    const char *e;
5004    PyUnicodeObject *v;
5005    Py_UNICODE *p;
5006    Py_ssize_t extrachars = 0;
5007    PyObject *errorHandler = NULL;
5008    PyObject *exc = NULL;
5009    Py_UNICODE *mapstring = NULL;
5010    Py_ssize_t maplen = 0;
5011
5012    /* Default to Latin-1 */
5013    if (mapping == NULL)
5014        return PyUnicode_DecodeLatin1(s, size, errors);
5015
5016    v = _PyUnicode_New(size);
5017    if (v == NULL)
5018        goto onError;
5019    if (size == 0)
5020        return (PyObject *)v;
5021    p = PyUnicode_AS_UNICODE(v);
5022    e = s + size;
5023    if (PyUnicode_CheckExact(mapping)) {
5024        mapstring = PyUnicode_AS_UNICODE(mapping);
5025        maplen = PyUnicode_GET_SIZE(mapping);
5026        while (s < e) {
5027            unsigned char ch = *s;
5028            Py_UNICODE x = 0xfffe; /* illegal value */
5029
5030            if (ch < maplen)
5031                x = mapstring[ch];
5032
5033            if (x == 0xfffe) {
5034                /* undefined mapping */
5035                outpos = p-PyUnicode_AS_UNICODE(v);
5036                startinpos = s-starts;
5037                endinpos = startinpos+1;
5038                if (unicode_decode_call_errorhandler(
5039                        errors, &errorHandler,
5040                        "charmap", "character maps to <undefined>",
5041                        &starts, &e, &startinpos, &endinpos, &exc, &s,
5042                        &v, &outpos, &p)) {
5043                    goto onError;
5044                }
5045                continue;
5046            }
5047            *p++ = x;
5048            ++s;
5049        }
5050    }
5051    else {
5052        while (s < e) {
5053            unsigned char ch = *s;
5054            PyObject *w, *x;
5055
5056            /* Get mapping (char ordinal -> integer, Unicode char or None) */
5057            w = PyLong_FromLong((long)ch);
5058            if (w == NULL)
5059                goto onError;
5060            x = PyObject_GetItem(mapping, w);
5061            Py_DECREF(w);
5062            if (x == NULL) {
5063                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5064                    /* No mapping found means: mapping is undefined. */
5065                    PyErr_Clear();
5066                    x = Py_None;
5067                    Py_INCREF(x);
5068                } else
5069                    goto onError;
5070            }
5071
5072            /* Apply mapping */
5073            if (PyLong_Check(x)) {
5074                long value = PyLong_AS_LONG(x);
5075                if (value < 0 || value > 65535) {
5076                    PyErr_SetString(PyExc_TypeError,
5077                                    "character mapping must be in range(65536)");
5078                    Py_DECREF(x);
5079                    goto onError;
5080                }
5081                *p++ = (Py_UNICODE)value;
5082            }
5083            else if (x == Py_None) {
5084                /* undefined mapping */
5085                outpos = p-PyUnicode_AS_UNICODE(v);
5086                startinpos = s-starts;
5087                endinpos = startinpos+1;
5088                if (unicode_decode_call_errorhandler(
5089                        errors, &errorHandler,
5090                        "charmap", "character maps to <undefined>",
5091                        &starts, &e, &startinpos, &endinpos, &exc, &s,
5092                        &v, &outpos, &p)) {
5093                    Py_DECREF(x);
5094                    goto onError;
5095                }
5096                Py_DECREF(x);
5097                continue;
5098            }
5099            else if (PyUnicode_Check(x)) {
5100                Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
5101
5102                if (targetsize == 1)
5103                    /* 1-1 mapping */
5104                    *p++ = *PyUnicode_AS_UNICODE(x);
5105
5106                else if (targetsize > 1) {
5107                    /* 1-n mapping */
5108                    if (targetsize > extrachars) {
5109                        /* resize first */
5110                        Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
5111                        Py_ssize_t needed = (targetsize - extrachars) + \
5112                            (targetsize << 2);
5113                        extrachars += needed;
5114                        /* XXX overflow detection missing */
5115                        if (_PyUnicode_Resize(&v,
5116                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
5117                            Py_DECREF(x);
5118                            goto onError;
5119                        }
5120                        p = PyUnicode_AS_UNICODE(v) + oldpos;
5121                    }
5122                    Py_UNICODE_COPY(p,
5123                                    PyUnicode_AS_UNICODE(x),
5124                                    targetsize);
5125                    p += targetsize;
5126                    extrachars -= targetsize;
5127                }
5128                /* 1-0 mapping: skip the character */
5129            }
5130            else {
5131                /* wrong return value */
5132                PyErr_SetString(PyExc_TypeError,
5133                                "character mapping must return integer, None or str");
5134                Py_DECREF(x);
5135                goto onError;
5136            }
5137            Py_DECREF(x);
5138            ++s;
5139        }
5140    }
5141    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
5142        if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
5143            goto onError;
5144    Py_XDECREF(errorHandler);
5145    Py_XDECREF(exc);
5146    return (PyObject *)v;
5147
5148  onError:
5149    Py_XDECREF(errorHandler);
5150    Py_XDECREF(exc);
5151    Py_XDECREF(v);
5152    return NULL;
5153}
5154
5155/* Charmap encoding: the lookup table */
5156
5157struct encoding_map{
5158    PyObject_HEAD
5159    unsigned char level1[32];
5160    int count2, count3;
5161    unsigned char level23[1];
5162};
5163
5164static PyObject*
5165encoding_map_size(PyObject *obj, PyObject* args)
5166{
5167    struct encoding_map *map = (struct encoding_map*)obj;
5168    return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
5169                           128*map->count3);
5170}
5171
5172static PyMethodDef encoding_map_methods[] = {
5173    {"size", encoding_map_size, METH_NOARGS,
5174     PyDoc_STR("Return the size (in bytes) of this object") },
5175    { 0 }
5176};
5177
5178static void
5179encoding_map_dealloc(PyObject* o)
5180{
5181    PyObject_FREE(o);
5182}
5183
5184static PyTypeObject EncodingMapType = {
5185    PyVarObject_HEAD_INIT(NULL, 0)
5186    "EncodingMap",          /*tp_name*/
5187    sizeof(struct encoding_map),   /*tp_basicsize*/
5188    0,                      /*tp_itemsize*/
5189    /* methods */
5190    encoding_map_dealloc,   /*tp_dealloc*/
5191    0,                      /*tp_print*/
5192    0,                      /*tp_getattr*/
5193    0,                      /*tp_setattr*/
5194    0,                      /*tp_reserved*/
5195    0,                      /*tp_repr*/
5196    0,                      /*tp_as_number*/
5197    0,                      /*tp_as_sequence*/
5198    0,                      /*tp_as_mapping*/
5199    0,                      /*tp_hash*/
5200    0,                      /*tp_call*/
5201    0,                      /*tp_str*/
5202    0,                      /*tp_getattro*/
5203    0,                      /*tp_setattro*/
5204    0,                      /*tp_as_buffer*/
5205    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
5206    0,                      /*tp_doc*/
5207    0,                      /*tp_traverse*/
5208    0,                      /*tp_clear*/
5209    0,                      /*tp_richcompare*/
5210    0,                      /*tp_weaklistoffset*/
5211    0,                      /*tp_iter*/
5212    0,                      /*tp_iternext*/
5213    encoding_map_methods,   /*tp_methods*/
5214    0,                      /*tp_members*/
5215    0,                      /*tp_getset*/
5216    0,                      /*tp_base*/
5217    0,                      /*tp_dict*/
5218    0,                      /*tp_descr_get*/
5219    0,                      /*tp_descr_set*/
5220    0,                      /*tp_dictoffset*/
5221    0,                      /*tp_init*/
5222    0,                      /*tp_alloc*/
5223    0,                      /*tp_new*/
5224    0,                      /*tp_free*/
5225    0,                      /*tp_is_gc*/
5226};
5227
5228PyObject*
5229PyUnicode_BuildEncodingMap(PyObject* string)
5230{
5231    Py_UNICODE *decode;
5232    PyObject *result;
5233    struct encoding_map *mresult;
5234    int i;
5235    int need_dict = 0;
5236    unsigned char level1[32];
5237    unsigned char level2[512];
5238    unsigned char *mlevel1, *mlevel2, *mlevel3;
5239    int count2 = 0, count3 = 0;
5240
5241    if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5242        PyErr_BadArgument();
5243        return NULL;
5244    }
5245    decode = PyUnicode_AS_UNICODE(string);
5246    memset(level1, 0xFF, sizeof level1);
5247    memset(level2, 0xFF, sizeof level2);
5248
5249    /* If there isn't a one-to-one mapping of NULL to \0,
5250       or if there are non-BMP characters, we need to use
5251       a mapping dictionary. */
5252    if (decode[0] != 0)
5253        need_dict = 1;
5254    for (i = 1; i < 256; i++) {
5255        int l1, l2;
5256        if (decode[i] == 0
5257#ifdef Py_UNICODE_WIDE
5258            || decode[i] > 0xFFFF
5259#endif
5260            ) {
5261            need_dict = 1;
5262            break;
5263        }
5264        if (decode[i] == 0xFFFE)
5265            /* unmapped character */
5266            continue;
5267        l1 = decode[i] >> 11;
5268        l2 = decode[i] >> 7;
5269        if (level1[l1] == 0xFF)
5270            level1[l1] = count2++;
5271        if (level2[l2] == 0xFF)
5272            level2[l2] = count3++;
5273    }
5274
5275    if (count2 >= 0xFF || count3 >= 0xFF)
5276        need_dict = 1;
5277
5278    if (need_dict) {
5279        PyObject *result = PyDict_New();
5280        PyObject *key, *value;
5281        if (!result)
5282            return NULL;
5283        for (i = 0; i < 256; i++) {
5284            key = value = NULL;
5285            key = PyLong_FromLong(decode[i]);
5286            value = PyLong_FromLong(i);
5287            if (!key || !value)
5288                goto failed1;
5289            if (PyDict_SetItem(result, key, value) == -1)
5290                goto failed1;
5291            Py_DECREF(key);
5292            Py_DECREF(value);
5293        }
5294        return result;
5295      failed1:
5296        Py_XDECREF(key);
5297        Py_XDECREF(value);
5298        Py_DECREF(result);
5299        return NULL;
5300    }
5301
5302    /* Create a three-level trie */
5303    result = PyObject_MALLOC(sizeof(struct encoding_map) +
5304                             16*count2 + 128*count3 - 1);
5305    if (!result)
5306        return PyErr_NoMemory();
5307    PyObject_Init(result, &EncodingMapType);
5308    mresult = (struct encoding_map*)result;
5309    mresult->count2 = count2;
5310    mresult->count3 = count3;
5311    mlevel1 = mresult->level1;
5312    mlevel2 = mresult->level23;
5313    mlevel3 = mresult->level23 + 16*count2;
5314    memcpy(mlevel1, level1, 32);
5315    memset(mlevel2, 0xFF, 16*count2);
5316    memset(mlevel3, 0, 128*count3);
5317    count3 = 0;
5318    for (i = 1; i < 256; i++) {
5319        int o1, o2, o3, i2, i3;
5320        if (decode[i] == 0xFFFE)
5321            /* unmapped character */
5322            continue;
5323        o1 = decode[i]>>11;
5324        o2 = (decode[i]>>7) & 0xF;
5325        i2 = 16*mlevel1[o1] + o2;
5326        if (mlevel2[i2] == 0xFF)
5327            mlevel2[i2] = count3++;
5328        o3 = decode[i] & 0x7F;
5329        i3 = 128*mlevel2[i2] + o3;
5330        mlevel3[i3] = i;
5331    }
5332    return result;
5333}
5334
5335static int
5336encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5337{
5338    struct encoding_map *map = (struct encoding_map*)mapping;
5339    int l1 = c>>11;
5340    int l2 = (c>>7) & 0xF;
5341    int l3 = c & 0x7F;
5342    int i;
5343
5344#ifdef Py_UNICODE_WIDE
5345    if (c > 0xFFFF) {
5346        return -1;
5347    }
5348#endif
5349    if (c == 0)
5350        return 0;
5351    /* level 1*/
5352    i = map->level1[l1];
5353    if (i == 0xFF) {
5354        return -1;
5355    }
5356    /* level 2*/
5357    i = map->level23[16*i+l2];
5358    if (i == 0xFF) {
5359        return -1;
5360    }
5361    /* level 3 */
5362    i = map->level23[16*map->count2 + 128*i + l3];
5363    if (i == 0) {
5364        return -1;
5365    }
5366    return i;
5367}
5368
5369/* Lookup the character ch in the mapping. If the character
5370   can't be found, Py_None is returned (or NULL, if another
5371   error occurred). */
5372static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
5373{
5374    PyObject *w = PyLong_FromLong((long)c);
5375    PyObject *x;
5376
5377    if (w == NULL)
5378        return NULL;
5379    x = PyObject_GetItem(mapping, w);
5380    Py_DECREF(w);
5381    if (x == NULL) {
5382        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5383            /* No mapping found means: mapping is undefined. */
5384            PyErr_Clear();
5385            x = Py_None;
5386            Py_INCREF(x);
5387            return x;
5388        } else
5389            return NULL;
5390    }
5391    else if (x == Py_None)
5392        return x;
5393    else if (PyLong_Check(x)) {
5394        long value = PyLong_AS_LONG(x);
5395        if (value < 0 || value > 255) {
5396            PyErr_SetString(PyExc_TypeError,
5397                            "character mapping must be in range(256)");
5398            Py_DECREF(x);
5399            return NULL;
5400        }
5401        return x;
5402    }
5403    else if (PyBytes_Check(x))
5404        return x;
5405    else {
5406        /* wrong return value */
5407        PyErr_Format(PyExc_TypeError,
5408                     "character mapping must return integer, bytes or None, not %.400s",
5409                     x->ob_type->tp_name);
5410        Py_DECREF(x);
5411        return NULL;
5412    }
5413}
5414
5415static int
5416charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
5417{
5418    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5419    /* exponentially overallocate to minimize reallocations */
5420    if (requiredsize < 2*outsize)
5421        requiredsize = 2*outsize;
5422    if (_PyBytes_Resize(outobj, requiredsize))
5423        return -1;
5424    return 0;
5425}
5426
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
5430/* lookup the character, put the result in the output string and adjust
5431   various state variables. Resize the output bytes object if not enough
5432   space is available. Return a new reference to the object that
5433   was put in the output buffer, or Py_None, if the mapping was undefined
5434   (in which case no character was written) or NULL, if a
5435   reallocation error occurred. The caller must decref the result */
5436static
5437charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
5438                                          PyObject **outobj, Py_ssize_t *outpos)
5439{
5440    PyObject *rep;
5441    char *outstart;
5442    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5443
5444    if (Py_TYPE(mapping) == &EncodingMapType) {
5445        int res = encoding_map_lookup(c, mapping);
5446        Py_ssize_t requiredsize = *outpos+1;
5447        if (res == -1)
5448            return enc_FAILED;
5449        if (outsize<requiredsize)
5450            if (charmapencode_resize(outobj, outpos, requiredsize))
5451                return enc_EXCEPTION;
5452        outstart = PyBytes_AS_STRING(*outobj);
5453        outstart[(*outpos)++] = (char)res;
5454        return enc_SUCCESS;
5455    }
5456
5457    rep = charmapencode_lookup(c, mapping);
5458    if (rep==NULL)
5459        return enc_EXCEPTION;
5460    else if (rep==Py_None) {
5461        Py_DECREF(rep);
5462        return enc_FAILED;
5463    } else {
5464        if (PyLong_Check(rep)) {
5465            Py_ssize_t requiredsize = *outpos+1;
5466            if (outsize<requiredsize)
5467                if (charmapencode_resize(outobj, outpos, requiredsize)) {
5468                    Py_DECREF(rep);
5469                    return enc_EXCEPTION;
5470                }
5471            outstart = PyBytes_AS_STRING(*outobj);
5472            outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
5473        }
5474        else {
5475            const char *repchars = PyBytes_AS_STRING(rep);
5476            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5477            Py_ssize_t requiredsize = *outpos+repsize;
5478            if (outsize<requiredsize)
5479                if (charmapencode_resize(outobj, outpos, requiredsize)) {
5480                    Py_DECREF(rep);
5481                    return enc_EXCEPTION;
5482                }
5483            outstart = PyBytes_AS_STRING(*outobj);
5484            memcpy(outstart + *outpos, repchars, repsize);
5485            *outpos += repsize;
5486        }
5487    }
5488    Py_DECREF(rep);
5489    return enc_SUCCESS;
5490}
5491
5492/* handle an error in PyUnicode_EncodeCharmap
5493   Return 0 on success, -1 on error */
5494static
5495int charmap_encoding_error(
5496    const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
5497    PyObject **exceptionObject,
5498    int *known_errorHandler, PyObject **errorHandler, const char *errors,
5499    PyObject **res, Py_ssize_t *respos)
5500{
5501    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5502    Py_ssize_t repsize;
5503    Py_ssize_t newpos;
5504    Py_UNICODE *uni2;
5505    /* startpos for collecting unencodable chars */
5506    Py_ssize_t collstartpos = *inpos;
5507    Py_ssize_t collendpos = *inpos+1;
5508    Py_ssize_t collpos;
5509    char *encoding = "charmap";
5510    char *reason = "character maps to <undefined>";
5511    charmapencode_result x;
5512
5513    /* find all unencodable characters */
5514    while (collendpos < size) {
5515        PyObject *rep;
5516        if (Py_TYPE(mapping) == &EncodingMapType) {
5517            int res = encoding_map_lookup(p[collendpos], mapping);
5518            if (res != -1)
5519                break;
5520            ++collendpos;
5521            continue;
5522        }
5523
5524        rep = charmapencode_lookup(p[collendpos], mapping);
5525        if (rep==NULL)
5526            return -1;
5527        else if (rep!=Py_None) {
5528            Py_DECREF(rep);
5529            break;
5530        }
5531        Py_DECREF(rep);
5532        ++collendpos;
5533    }
5534    /* cache callback name lookup
5535     * (if not done yet, i.e. it's the first error) */
5536    if (*known_errorHandler==-1) {
5537        if ((errors==NULL) || (!strcmp(errors, "strict")))
5538            *known_errorHandler = 1;
5539        else if (!strcmp(errors, "replace"))
5540            *known_errorHandler = 2;
5541        else if (!strcmp(errors, "ignore"))
5542            *known_errorHandler = 3;
5543        else if (!strcmp(errors, "xmlcharrefreplace"))
5544            *known_errorHandler = 4;
5545        else
5546            *known_errorHandler = 0;
5547    }
5548    switch (*known_errorHandler) {
5549    case 1: /* strict */
5550        raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5551        return -1;
5552    case 2: /* replace */
5553        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
5554            x = charmapencode_output('?', mapping, res, respos);
5555            if (x==enc_EXCEPTION) {
5556                return -1;
5557            }
5558            else if (x==enc_FAILED) {
5559                raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5560                return -1;
5561            }
5562        }
5563        /* fall through */
5564    case 3: /* ignore */
5565        *inpos = collendpos;
5566        break;
5567    case 4: /* xmlcharrefreplace */
5568        /* generate replacement (temporarily (mis)uses p) */
5569        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
5570            char buffer[2+29+1+1];
5571            char *cp;
5572            sprintf(buffer, "&#%d;", (int)p[collpos]);
5573            for (cp = buffer; *cp; ++cp) {
5574                x = charmapencode_output(*cp, mapping, res, respos);
5575                if (x==enc_EXCEPTION)
5576                    return -1;
5577                else if (x==enc_FAILED) {
5578                    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5579                    return -1;
5580                }
5581            }
5582        }
5583        *inpos = collendpos;
5584        break;
5585    default:
5586        repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
5587                                                      encoding, reason, p, size, exceptionObject,
5588                                                      collstartpos, collendpos, &newpos);
5589        if (repunicode == NULL)
5590            return -1;
5591        if (PyBytes_Check(repunicode)) {
5592            /* Directly copy bytes result to output. */
5593            Py_ssize_t outsize = PyBytes_Size(*res);
5594            Py_ssize_t requiredsize;
5595            repsize = PyBytes_Size(repunicode);
5596            requiredsize = *respos + repsize;
5597            if (requiredsize > outsize)
5598                /* Make room for all additional bytes. */
5599                if (charmapencode_resize(res, respos, requiredsize)) {
5600                    Py_DECREF(repunicode);
5601                    return -1;
5602                }
5603            memcpy(PyBytes_AsString(*res) + *respos,
5604                   PyBytes_AsString(repunicode),  repsize);
5605            *respos += repsize;
5606            *inpos = newpos;
5607            Py_DECREF(repunicode);
5608            break;
5609        }
5610        /* generate replacement  */
5611        repsize = PyUnicode_GET_SIZE(repunicode);
5612        for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5613            x = charmapencode_output(*uni2, mapping, res, respos);
5614            if (x==enc_EXCEPTION) {
5615                return -1;
5616            }
5617            else if (x==enc_FAILED) {
5618                Py_DECREF(repunicode);
5619                raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5620                return -1;
5621            }
5622        }
5623        *inpos = newpos;
5624        Py_DECREF(repunicode);
5625    }
5626    return 0;
5627}
5628
5629PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
5630                                  Py_ssize_t size,
5631                                  PyObject *mapping,
5632                                  const char *errors)
5633{
5634    /* output object */
5635    PyObject *res = NULL;
5636    /* current input position */
5637    Py_ssize_t inpos = 0;
5638    /* current output position */
5639    Py_ssize_t respos = 0;
5640    PyObject *errorHandler = NULL;
5641    PyObject *exc = NULL;
5642    /* the following variable is used for caching string comparisons
5643     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5644     * 3=ignore, 4=xmlcharrefreplace */
5645    int known_errorHandler = -1;
5646
5647    /* Default to Latin-1 */
5648    if (mapping == NULL)
5649        return PyUnicode_EncodeLatin1(p, size, errors);
5650
5651    /* allocate enough for a simple encoding without
5652       replacements, if we need more, we'll resize */
5653    res = PyBytes_FromStringAndSize(NULL, size);
5654    if (res == NULL)
5655        goto onError;
5656    if (size == 0)
5657        return res;
5658
5659    while (inpos<size) {
5660        /* try to encode it */
5661        charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5662        if (x==enc_EXCEPTION) /* error */
5663            goto onError;
5664        if (x==enc_FAILED) { /* unencodable character */
5665            if (charmap_encoding_error(p, size, &inpos, mapping,
5666                                       &exc,
5667                                       &known_errorHandler, &errorHandler, errors,
5668                                       &res, &respos)) {
5669                goto onError;
5670            }
5671        }
5672        else
5673            /* done with this character => adjust input position */
5674            ++inpos;
5675    }
5676
5677    /* Resize if we allocated to much */
5678    if (respos<PyBytes_GET_SIZE(res))
5679        if (_PyBytes_Resize(&res, respos) < 0)
5680            goto onError;
5681
5682    Py_XDECREF(exc);
5683    Py_XDECREF(errorHandler);
5684    return res;
5685
5686  onError:
5687    Py_XDECREF(res);
5688    Py_XDECREF(exc);
5689    Py_XDECREF(errorHandler);
5690    return NULL;
5691}
5692
5693PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
5694                                    PyObject *mapping)
5695{
5696    if (!PyUnicode_Check(unicode) || mapping == NULL) {
5697        PyErr_BadArgument();
5698        return NULL;
5699    }
5700    return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
5701                                   PyUnicode_GET_SIZE(unicode),
5702                                   mapping,
5703                                   NULL);
5704}
5705
5706/* create or adjust a UnicodeTranslateError */
5707static void make_translate_exception(PyObject **exceptionObject,
5708                                     const Py_UNICODE *unicode, Py_ssize_t size,
5709                                     Py_ssize_t startpos, Py_ssize_t endpos,
5710                                     const char *reason)
5711{
5712    if (*exceptionObject == NULL) {
5713        *exceptionObject = PyUnicodeTranslateError_Create(
5714            unicode, size, startpos, endpos, reason);
5715    }
5716    else {
5717        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5718            goto onError;
5719        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5720            goto onError;
5721        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5722            goto onError;
5723        return;
5724      onError:
5725        Py_DECREF(*exceptionObject);
5726        *exceptionObject = NULL;
5727    }
5728}
5729
5730/* raises a UnicodeTranslateError */
5731static void raise_translate_exception(PyObject **exceptionObject,
5732                                      const Py_UNICODE *unicode, Py_ssize_t size,
5733                                      Py_ssize_t startpos, Py_ssize_t endpos,
5734                                      const char *reason)
5735{
5736    make_translate_exception(exceptionObject,
5737                             unicode, size, startpos, endpos, reason);
5738    if (*exceptionObject != NULL)
5739        PyCodec_StrictErrors(*exceptionObject);
5740}
5741
5742/* error handling callback helper:
5743   build arguments, call the callback and check the arguments,
5744   put the result into newpos and return the replacement string, which
5745   has to be freed by the caller */
5746static PyObject *unicode_translate_call_errorhandler(const char *errors,
5747                                                     PyObject **errorHandler,
5748                                                     const char *reason,
5749                                                     const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5750                                                     Py_ssize_t startpos, Py_ssize_t endpos,
5751                                                     Py_ssize_t *newpos)
5752{
5753    static char *argparse = "O!n;translating error handler must return (str, int) tuple";
5754
5755    Py_ssize_t i_newpos;
5756    PyObject *restuple;
5757    PyObject *resunicode;
5758
5759    if (*errorHandler == NULL) {
5760        *errorHandler = PyCodec_LookupError(errors);
5761        if (*errorHandler == NULL)
5762            return NULL;
5763    }
5764
5765    make_translate_exception(exceptionObject,
5766                             unicode, size, startpos, endpos, reason);
5767    if (*exceptionObject == NULL)
5768        return NULL;
5769
5770    restuple = PyObject_CallFunctionObjArgs(
5771        *errorHandler, *exceptionObject, NULL);
5772    if (restuple == NULL)
5773        return NULL;
5774    if (!PyTuple_Check(restuple)) {
5775        PyErr_SetString(PyExc_TypeError, &argparse[4]);
5776        Py_DECREF(restuple);
5777        return NULL;
5778    }
5779    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
5780                          &resunicode, &i_newpos)) {
5781        Py_DECREF(restuple);
5782        return NULL;
5783    }
5784    if (i_newpos<0)
5785        *newpos = size+i_newpos;
5786    else
5787        *newpos = i_newpos;
5788    if (*newpos<0 || *newpos>size) {
5789        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5790        Py_DECREF(restuple);
5791        return NULL;
5792    }
5793    Py_INCREF(resunicode);
5794    Py_DECREF(restuple);
5795    return resunicode;
5796}
5797
5798/* Lookup the character ch in the mapping and put the result in result,
5799   which must be decrefed by the caller.
5800   Return 0 on success, -1 on error */
5801static
5802int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5803{
5804    PyObject *w = PyLong_FromLong((long)c);
5805    PyObject *x;
5806
5807    if (w == NULL)
5808        return -1;
5809    x = PyObject_GetItem(mapping, w);
5810    Py_DECREF(w);
5811    if (x == NULL) {
5812        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5813            /* No mapping found means: use 1:1 mapping. */
5814            PyErr_Clear();
5815            *result = NULL;
5816            return 0;
5817        } else
5818            return -1;
5819    }
5820    else if (x == Py_None) {
5821        *result = x;
5822        return 0;
5823    }
5824    else if (PyLong_Check(x)) {
5825        long value = PyLong_AS_LONG(x);
5826        long max = PyUnicode_GetMax();
5827        if (value < 0 || value > max) {
5828            PyErr_Format(PyExc_TypeError,
5829                         "character mapping must be in range(0x%x)", max+1);
5830            Py_DECREF(x);
5831            return -1;
5832        }
5833        *result = x;
5834        return 0;
5835    }
5836    else if (PyUnicode_Check(x)) {
5837        *result = x;
5838        return 0;
5839    }
5840    else {
5841        /* wrong return value */
5842        PyErr_SetString(PyExc_TypeError,
5843                        "character mapping must return integer, None or str");
5844        Py_DECREF(x);
5845        return -1;
5846    }
5847}
5848/* ensure that *outobj is at least requiredsize characters long,
5849   if not reallocate and adjust various state variables.
5850   Return 0 on success, -1 on error */
5851static
5852int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
5853                               Py_ssize_t requiredsize)
5854{
5855    Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
5856    if (requiredsize > oldsize) {
5857        /* remember old output position */
5858        Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5859        /* exponentially overallocate to minimize reallocations */
5860        if (requiredsize < 2 * oldsize)
5861            requiredsize = 2 * oldsize;
5862        if (PyUnicode_Resize(outobj, requiredsize) < 0)
5863            return -1;
5864        *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
5865    }
5866    return 0;
5867}
5868/* lookup the character, put the result in the output string and adjust
5869   various state variables. Return a new reference to the object that
5870   was put in the output buffer in *result, or Py_None, if the mapping was
5871   undefined (in which case no character was written).
5872   The called must decref result.
5873   Return 0 on success, -1 on error. */
5874static
5875int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
5876                            Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5877                            PyObject **res)
5878{
5879    if (charmaptranslate_lookup(*curinp, mapping, res))
5880        return -1;
5881    if (*res==NULL) {
5882        /* not found => default to 1:1 mapping */
5883        *(*outp)++ = *curinp;
5884    }
5885    else if (*res==Py_None)
5886        ;
5887    else if (PyLong_Check(*res)) {
5888        /* no overflow check, because we know that the space is enough */
5889        *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
5890    }
5891    else if (PyUnicode_Check(*res)) {
5892        Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5893        if (repsize==1) {
5894            /* no overflow check, because we know that the space is enough */
5895            *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5896        }
5897        else if (repsize!=0) {
5898            /* more than one character */
5899            Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5900                (insize - (curinp-startinp)) +
5901                repsize - 1;
5902            if (charmaptranslate_makespace(outobj, outp, requiredsize))
5903                return -1;
5904            memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5905            *outp += repsize;
5906        }
5907    }
5908    else
5909        return -1;
5910    return 0;
5911}
5912
5913PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
5914                                     Py_ssize_t size,
5915                                     PyObject *mapping,
5916                                     const char *errors)
5917{
5918    /* output object */
5919    PyObject *res = NULL;
5920    /* pointers to the beginning and end+1 of input */
5921    const Py_UNICODE *startp = p;
5922    const Py_UNICODE *endp = p + size;
5923    /* pointer into the output */
5924    Py_UNICODE *str;
5925    /* current output position */
5926    Py_ssize_t respos = 0;
5927    char *reason = "character maps to <undefined>";
5928    PyObject *errorHandler = NULL;
5929    PyObject *exc = NULL;
5930    /* the following variable is used for caching string comparisons
5931     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5932     * 3=ignore, 4=xmlcharrefreplace */
5933    int known_errorHandler = -1;
5934
5935    if (mapping == NULL) {
5936        PyErr_BadArgument();
5937        return NULL;
5938    }
5939
5940    /* allocate enough for a simple 1:1 translation without
5941       replacements, if we need more, we'll resize */
5942    res = PyUnicode_FromUnicode(NULL, size);
5943    if (res == NULL)
5944        goto onError;
5945    if (size == 0)
5946        return res;
5947    str = PyUnicode_AS_UNICODE(res);
5948
5949    while (p<endp) {
5950        /* try to encode it */
5951        PyObject *x = NULL;
5952        if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5953            Py_XDECREF(x);
5954            goto onError;
5955        }
5956        Py_XDECREF(x);
5957        if (x!=Py_None) /* it worked => adjust input pointer */
5958            ++p;
5959        else { /* untranslatable character */
5960            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5961            Py_ssize_t repsize;
5962            Py_ssize_t newpos;
5963            Py_UNICODE *uni2;
5964            /* startpos for collecting untranslatable chars */
5965            const Py_UNICODE *collstart = p;
5966            const Py_UNICODE *collend = p+1;
5967            const Py_UNICODE *coll;
5968
5969            /* find all untranslatable characters */
5970            while (collend < endp) {
5971                if (charmaptranslate_lookup(*collend, mapping, &x))
5972                    goto onError;
5973                Py_XDECREF(x);
5974                if (x!=Py_None)
5975                    break;
5976                ++collend;
5977            }
5978            /* cache callback name lookup
5979             * (if not done yet, i.e. it's the first error) */
5980            if (known_errorHandler==-1) {
5981                if ((errors==NULL) || (!strcmp(errors, "strict")))
5982                    known_errorHandler = 1;
5983                else if (!strcmp(errors, "replace"))
5984                    known_errorHandler = 2;
5985                else if (!strcmp(errors, "ignore"))
5986                    known_errorHandler = 3;
5987                else if (!strcmp(errors, "xmlcharrefreplace"))
5988                    known_errorHandler = 4;
5989                else
5990                    known_errorHandler = 0;
5991            }
5992            switch (known_errorHandler) {
5993            case 1: /* strict */
5994                raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
5995                goto onError;
5996            case 2: /* replace */
5997                /* No need to check for space, this is a 1:1 replacement */
5998                for (coll = collstart; coll<collend; ++coll)
5999                    *str++ = '?';
6000                /* fall through */
6001            case 3: /* ignore */
6002                p = collend;
6003                break;
6004            case 4: /* xmlcharrefreplace */
6005                /* generate replacement (temporarily (mis)uses p) */
6006                for (p = collstart; p < collend; ++p) {
6007                    char buffer[2+29+1+1];
6008                    char *cp;
6009                    sprintf(buffer, "&#%d;", (int)*p);
6010                    if (charmaptranslate_makespace(&res, &str,
6011                                                   (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
6012                        goto onError;
6013                    for (cp = buffer; *cp; ++cp)
6014                        *str++ = *cp;
6015                }
6016                p = collend;
6017                break;
6018            default:
6019                repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
6020                                                                 reason, startp, size, &exc,
6021                                                                 collstart-startp, collend-startp, &newpos);
6022                if (repunicode == NULL)
6023                    goto onError;
6024                /* generate replacement  */
6025                repsize = PyUnicode_GET_SIZE(repunicode);
6026                if (charmaptranslate_makespace(&res, &str,
6027                                               (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
6028                    Py_DECREF(repunicode);
6029                    goto onError;
6030                }
6031                for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
6032                    *str++ = *uni2;
6033                p = startp + newpos;
6034                Py_DECREF(repunicode);
6035            }
6036        }
6037    }
6038    /* Resize if we allocated to much */
6039    respos = str-PyUnicode_AS_UNICODE(res);
6040    if (respos<PyUnicode_GET_SIZE(res)) {
6041        if (PyUnicode_Resize(&res, respos) < 0)
6042            goto onError;
6043    }
6044    Py_XDECREF(exc);
6045    Py_XDECREF(errorHandler);
6046    return res;
6047
6048  onError:
6049    Py_XDECREF(res);
6050    Py_XDECREF(exc);
6051    Py_XDECREF(errorHandler);
6052    return NULL;
6053}
6054
6055PyObject *PyUnicode_Translate(PyObject *str,
6056                              PyObject *mapping,
6057                              const char *errors)
6058{
6059    PyObject *result;
6060
6061    str = PyUnicode_FromObject(str);
6062    if (str == NULL)
6063        goto onError;
6064    result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
6065                                        PyUnicode_GET_SIZE(str),
6066                                        mapping,
6067                                        errors);
6068    Py_DECREF(str);
6069    return result;
6070
6071  onError:
6072    Py_XDECREF(str);
6073    return NULL;
6074}
6075
6076/* --- Decimal Encoder ---------------------------------------------------- */
6077
6078int PyUnicode_EncodeDecimal(Py_UNICODE *s,
6079                            Py_ssize_t length,
6080                            char *output,
6081                            const char *errors)
6082{
6083    Py_UNICODE *p, *end;
6084    PyObject *errorHandler = NULL;
6085    PyObject *exc = NULL;
6086    const char *encoding = "decimal";
6087    const char *reason = "invalid decimal Unicode string";
6088    /* the following variable is used for caching string comparisons
6089     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6090    int known_errorHandler = -1;
6091
6092    if (output == NULL) {
6093        PyErr_BadArgument();
6094        return -1;
6095    }
6096
6097    p = s;
6098    end = s + length;
6099    while (p < end) {
6100        register Py_UNICODE ch = *p;
6101        int decimal;
6102        PyObject *repunicode;
6103        Py_ssize_t repsize;
6104        Py_ssize_t newpos;
6105        Py_UNICODE *uni2;
6106        Py_UNICODE *collstart;
6107        Py_UNICODE *collend;
6108
6109        if (Py_UNICODE_ISSPACE(ch)) {
6110            *output++ = ' ';
6111            ++p;
6112            continue;
6113        }
6114        decimal = Py_UNICODE_TODECIMAL(ch);
6115        if (decimal >= 0) {
6116            *output++ = '0' + decimal;
6117            ++p;
6118            continue;
6119        }
6120        if (0 < ch && ch < 256) {
6121            *output++ = (char)ch;
6122            ++p;
6123            continue;
6124        }
6125        /* All other characters are considered unencodable */
6126        collstart = p;
6127        collend = p+1;
6128        while (collend < end) {
6129            if ((0 < *collend && *collend < 256) ||
6130                !Py_UNICODE_ISSPACE(*collend) ||
6131                Py_UNICODE_TODECIMAL(*collend))
6132                break;
6133        }
6134        /* cache callback name lookup
6135         * (if not done yet, i.e. it's the first error) */
6136        if (known_errorHandler==-1) {
6137            if ((errors==NULL) || (!strcmp(errors, "strict")))
6138                known_errorHandler = 1;
6139            else if (!strcmp(errors, "replace"))
6140                known_errorHandler = 2;
6141            else if (!strcmp(errors, "ignore"))
6142                known_errorHandler = 3;
6143            else if (!strcmp(errors, "xmlcharrefreplace"))
6144                known_errorHandler = 4;
6145            else
6146                known_errorHandler = 0;
6147        }
6148        switch (known_errorHandler) {
6149        case 1: /* strict */
6150            raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
6151            goto onError;
6152        case 2: /* replace */
6153            for (p = collstart; p < collend; ++p)
6154                *output++ = '?';
6155            /* fall through */
6156        case 3: /* ignore */
6157            p = collend;
6158            break;
6159        case 4: /* xmlcharrefreplace */
6160            /* generate replacement (temporarily (mis)uses p) */
6161            for (p = collstart; p < collend; ++p)
6162                output += sprintf(output, "&#%d;", (int)*p);
6163            p = collend;
6164            break;
6165        default:
6166            repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6167                                                          encoding, reason, s, length, &exc,
6168                                                          collstart-s, collend-s, &newpos);
6169            if (repunicode == NULL)
6170                goto onError;
6171            if (!PyUnicode_Check(repunicode)) {
6172                /* Byte results not supported, since they have no decimal property. */
6173                PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
6174                Py_DECREF(repunicode);
6175                goto onError;
6176            }
6177            /* generate replacement  */
6178            repsize = PyUnicode_GET_SIZE(repunicode);
6179            for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6180                Py_UNICODE ch = *uni2;
6181                if (Py_UNICODE_ISSPACE(ch))
6182                    *output++ = ' ';
6183                else {
6184                    decimal = Py_UNICODE_TODECIMAL(ch);
6185                    if (decimal >= 0)
6186                        *output++ = '0' + decimal;
6187                    else if (0 < ch && ch < 256)
6188                        *output++ = (char)ch;
6189                    else {
6190                        Py_DECREF(repunicode);
6191                        raise_encode_exception(&exc, encoding,
6192                                               s, length, collstart-s, collend-s, reason);
6193                        goto onError;
6194                    }
6195                }
6196            }
6197            p = s + newpos;
6198            Py_DECREF(repunicode);
6199        }
6200    }
6201    /* 0-terminate the output string */
6202    *output++ = '\0';
6203    Py_XDECREF(exc);
6204    Py_XDECREF(errorHandler);
6205    return 0;
6206
6207  onError:
6208    Py_XDECREF(exc);
6209    Py_XDECREF(errorHandler);
6210    return -1;
6211}
6212
6213/* --- Helpers ------------------------------------------------------------ */
6214
6215#include "stringlib/unicodedefs.h"
6216#include "stringlib/fastsearch.h"
6217
6218#include "stringlib/count.h"
6219#include "stringlib/find.h"
6220#include "stringlib/partition.h"
6221#include "stringlib/split.h"
6222
6223#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
6224#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
6225#include "stringlib/localeutil.h"
6226
/* helper macro to fixup start/end slice values: end is clamped to len,
   negative values have len added once and are then clamped at 0.
   start and end must be modifiable lvalues and are evaluated more than
   once.  The body is wrapped in do { } while (0) so that the macro acts
   as a single statement, e.g. under an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
6241
6242Py_ssize_t PyUnicode_Count(PyObject *str,
6243                           PyObject *substr,
6244                           Py_ssize_t start,
6245                           Py_ssize_t end)
6246{
6247    Py_ssize_t result;
6248    PyUnicodeObject* str_obj;
6249    PyUnicodeObject* sub_obj;
6250
6251    str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6252    if (!str_obj)
6253        return -1;
6254    sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6255    if (!sub_obj) {
6256        Py_DECREF(str_obj);
6257        return -1;
6258    }
6259
6260    ADJUST_INDICES(start, end, str_obj->length);
6261    result = stringlib_count(
6262        str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6263        PY_SSIZE_T_MAX
6264        );
6265
6266    Py_DECREF(sub_obj);
6267    Py_DECREF(str_obj);
6268
6269    return result;
6270}
6271
6272Py_ssize_t PyUnicode_Find(PyObject *str,
6273                          PyObject *sub,
6274                          Py_ssize_t start,
6275                          Py_ssize_t end,
6276                          int direction)
6277{
6278    Py_ssize_t result;
6279
6280    str = PyUnicode_FromObject(str);
6281    if (!str)
6282        return -2;
6283    sub = PyUnicode_FromObject(sub);
6284    if (!sub) {
6285        Py_DECREF(str);
6286        return -2;
6287    }
6288
6289    if (direction > 0)
6290        result = stringlib_find_slice(
6291            PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6292            PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6293            start, end
6294            );
6295    else
6296        result = stringlib_rfind_slice(
6297            PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6298            PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6299            start, end
6300            );
6301
6302    Py_DECREF(str);
6303    Py_DECREF(sub);
6304
6305    return result;
6306}
6307
6308static
6309int tailmatch(PyUnicodeObject *self,
6310              PyUnicodeObject *substring,
6311              Py_ssize_t start,
6312              Py_ssize_t end,
6313              int direction)
6314{
6315    if (substring->length == 0)
6316        return 1;
6317
6318    ADJUST_INDICES(start, end, self->length);
6319    end -= substring->length;
6320    if (end < start)
6321        return 0;
6322
6323    if (direction > 0) {
6324        if (Py_UNICODE_MATCH(self, end, substring))
6325            return 1;
6326    } else {
6327        if (Py_UNICODE_MATCH(self, start, substring))
6328            return 1;
6329    }
6330
6331    return 0;
6332}
6333
6334Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
6335                               PyObject *substr,
6336                               Py_ssize_t start,
6337                               Py_ssize_t end,
6338                               int direction)
6339{
6340    Py_ssize_t result;
6341
6342    str = PyUnicode_FromObject(str);
6343    if (str == NULL)
6344        return -1;
6345    substr = PyUnicode_FromObject(substr);
6346    if (substr == NULL) {
6347        Py_DECREF(str);
6348        return -1;
6349    }
6350
6351    result = tailmatch((PyUnicodeObject *)str,
6352                       (PyUnicodeObject *)substr,
6353                       start, end, direction);
6354    Py_DECREF(str);
6355    Py_DECREF(substr);
6356    return result;
6357}
6358
6359/* Apply fixfct filter to the Unicode object self and return a
6360   reference to the modified object */
6361
6362static
6363PyObject *fixup(PyUnicodeObject *self,
6364                int (*fixfct)(PyUnicodeObject *s))
6365{
6366
6367    PyUnicodeObject *u;
6368
6369    u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6370    if (u == NULL)
6371        return NULL;
6372
6373    Py_UNICODE_COPY(u->str, self->str, self->length);
6374
6375    if (!fixfct(u) && PyUnicode_CheckExact(self)) {
6376        /* fixfct should return TRUE if it modified the buffer. If
6377           FALSE, return a reference to the original buffer instead
6378           (to save space, not time) */
6379        Py_INCREF(self);
6380        Py_DECREF(u);
6381        return (PyObject*) self;
6382    }
6383    return (PyObject*) u;
6384}
6385
6386static
6387int fixupper(PyUnicodeObject *self)
6388{
6389    Py_ssize_t len = self->length;
6390    Py_UNICODE *s = self->str;
6391    int status = 0;
6392
6393    while (len-- > 0) {
6394        register Py_UNICODE ch;
6395
6396        ch = Py_UNICODE_TOUPPER(*s);
6397        if (ch != *s) {
6398            status = 1;
6399            *s = ch;
6400        }
6401        s++;
6402    }
6403
6404    return status;
6405}
6406
6407static
6408int fixlower(PyUnicodeObject *self)
6409{
6410    Py_ssize_t len = self->length;
6411    Py_UNICODE *s = self->str;
6412    int status = 0;
6413
6414    while (len-- > 0) {
6415        register Py_UNICODE ch;
6416
6417        ch = Py_UNICODE_TOLOWER(*s);
6418        if (ch != *s) {
6419            status = 1;
6420            *s = ch;
6421        }
6422        s++;
6423    }
6424
6425    return status;
6426}
6427
6428static
6429int fixswapcase(PyUnicodeObject *self)
6430{
6431    Py_ssize_t len = self->length;
6432    Py_UNICODE *s = self->str;
6433    int status = 0;
6434
6435    while (len-- > 0) {
6436        if (Py_UNICODE_ISUPPER(*s)) {
6437            *s = Py_UNICODE_TOLOWER(*s);
6438            status = 1;
6439        } else if (Py_UNICODE_ISLOWER(*s)) {
6440            *s = Py_UNICODE_TOUPPER(*s);
6441            status = 1;
6442        }
6443        s++;
6444    }
6445
6446    return status;
6447}
6448
6449static
6450int fixcapitalize(PyUnicodeObject *self)
6451{
6452    Py_ssize_t len = self->length;
6453    Py_UNICODE *s = self->str;
6454    int status = 0;
6455
6456    if (len == 0)
6457        return 0;
6458    if (Py_UNICODE_ISLOWER(*s)) {
6459        *s = Py_UNICODE_TOUPPER(*s);
6460        status = 1;
6461    }
6462    s++;
6463    while (--len > 0) {
6464        if (Py_UNICODE_ISUPPER(*s)) {
6465            *s = Py_UNICODE_TOLOWER(*s);
6466            status = 1;
6467        }
6468        s++;
6469    }
6470    return status;
6471}
6472
6473static
6474int fixtitle(PyUnicodeObject *self)
6475{
6476    register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6477    register Py_UNICODE *e;
6478    int previous_is_cased;
6479
6480    /* Shortcut for single character strings */
6481    if (PyUnicode_GET_SIZE(self) == 1) {
6482        Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6483        if (*p != ch) {
6484            *p = ch;
6485            return 1;
6486        }
6487        else
6488            return 0;
6489    }
6490
6491    e = p + PyUnicode_GET_SIZE(self);
6492    previous_is_cased = 0;
6493    for (; p < e; p++) {
6494        register const Py_UNICODE ch = *p;
6495
6496        if (previous_is_cased)
6497            *p = Py_UNICODE_TOLOWER(ch);
6498        else
6499            *p = Py_UNICODE_TOTITLE(ch);
6500
6501        if (Py_UNICODE_ISLOWER(ch) ||
6502            Py_UNICODE_ISUPPER(ch) ||
6503            Py_UNICODE_ISTITLE(ch))
6504            previous_is_cased = 1;
6505        else
6506            previous_is_cased = 0;
6507    }
6508    return 1;
6509}
6510
6511PyObject *
6512PyUnicode_Join(PyObject *separator, PyObject *seq)
6513{
6514    const Py_UNICODE blank = ' ';
6515    const Py_UNICODE *sep = &blank;
6516    Py_ssize_t seplen = 1;
6517    PyUnicodeObject *res = NULL; /* the result */
6518    Py_UNICODE *res_p;       /* pointer to free byte in res's string area */
6519    PyObject *fseq;          /* PySequence_Fast(seq) */
6520    Py_ssize_t seqlen;       /* len(fseq) -- number of items in sequence */
6521    PyObject **items;
6522    PyObject *item;
6523    Py_ssize_t sz, i;
6524
6525    fseq = PySequence_Fast(seq, "");
6526    if (fseq == NULL) {
6527        return NULL;
6528    }
6529
6530    /* NOTE: the following code can't call back into Python code,
6531     * so we are sure that fseq won't be mutated.
6532     */
6533
6534    seqlen = PySequence_Fast_GET_SIZE(fseq);
6535    /* If empty sequence, return u"". */
6536    if (seqlen == 0) {
6537        res = _PyUnicode_New(0);  /* empty sequence; return u"" */
6538        goto Done;
6539    }
6540    items = PySequence_Fast_ITEMS(fseq);
6541    /* If singleton sequence with an exact Unicode, return that. */
6542    if (seqlen == 1) {
6543        item = items[0];
6544        if (PyUnicode_CheckExact(item)) {
6545            Py_INCREF(item);
6546            res = (PyUnicodeObject *)item;
6547            goto Done;
6548        }
6549    }
6550    else {
6551        /* Set up sep and seplen */
6552        if (separator == NULL) {
6553            sep = &blank;
6554            seplen = 1;
6555        }
6556        else {
6557            if (!PyUnicode_Check(separator)) {
6558                PyErr_Format(PyExc_TypeError,
6559                             "separator: expected str instance,"
6560                             " %.80s found",
6561                             Py_TYPE(separator)->tp_name);
6562                goto onError;
6563            }
6564            sep = PyUnicode_AS_UNICODE(separator);
6565            seplen = PyUnicode_GET_SIZE(separator);
6566        }
6567    }
6568
6569    /* There are at least two things to join, or else we have a subclass
6570     * of str in the sequence.
6571     * Do a pre-pass to figure out the total amount of space we'll
6572     * need (sz), and see whether all argument are strings.
6573     */
6574    sz = 0;
6575    for (i = 0; i < seqlen; i++) {
6576        const Py_ssize_t old_sz = sz;
6577        item = items[i];
6578        if (!PyUnicode_Check(item)) {
6579            PyErr_Format(PyExc_TypeError,
6580                         "sequence item %zd: expected str instance,"
6581                         " %.80s found",
6582                         i, Py_TYPE(item)->tp_name);
6583            goto onError;
6584        }
6585        sz += PyUnicode_GET_SIZE(item);
6586        if (i != 0)
6587            sz += seplen;
6588        if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6589            PyErr_SetString(PyExc_OverflowError,
6590                            "join() result is too long for a Python string");
6591            goto onError;
6592        }
6593    }
6594
6595    res = _PyUnicode_New(sz);
6596    if (res == NULL)
6597        goto onError;
6598
6599    /* Catenate everything. */
6600    res_p = PyUnicode_AS_UNICODE(res);
6601    for (i = 0; i < seqlen; ++i) {
6602        Py_ssize_t itemlen;
6603        item = items[i];
6604        itemlen = PyUnicode_GET_SIZE(item);
6605        /* Copy item, and maybe the separator. */
6606        if (i) {
6607            Py_UNICODE_COPY(res_p, sep, seplen);
6608            res_p += seplen;
6609        }
6610        Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6611        res_p += itemlen;
6612    }
6613
6614  Done:
6615    Py_DECREF(fseq);
6616    return (PyObject *)res;
6617
6618  onError:
6619    Py_DECREF(fseq);
6620    Py_XDECREF(res);
6621    return NULL;
6622}
6623
6624static
6625PyUnicodeObject *pad(PyUnicodeObject *self,
6626                     Py_ssize_t left,
6627                     Py_ssize_t right,
6628                     Py_UNICODE fill)
6629{
6630    PyUnicodeObject *u;
6631
6632    if (left < 0)
6633        left = 0;
6634    if (right < 0)
6635        right = 0;
6636
6637    if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
6638        Py_INCREF(self);
6639        return self;
6640    }
6641
6642    if (left > PY_SSIZE_T_MAX - self->length ||
6643        right > PY_SSIZE_T_MAX - (left + self->length)) {
6644        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6645        return NULL;
6646    }
6647    u = _PyUnicode_New(left + self->length + right);
6648    if (u) {
6649        if (left)
6650            Py_UNICODE_FILL(u->str, fill, left);
6651        Py_UNICODE_COPY(u->str + left, self->str, self->length);
6652        if (right)
6653            Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6654    }
6655
6656    return u;
6657}
6658
6659PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
6660{
6661    PyObject *list;
6662
6663    string = PyUnicode_FromObject(string);
6664    if (string == NULL)
6665        return NULL;
6666
6667    list = stringlib_splitlines(
6668        (PyObject*) string, PyUnicode_AS_UNICODE(string),
6669        PyUnicode_GET_SIZE(string), keepends);
6670
6671    Py_DECREF(string);
6672    return list;
6673}
6674
6675static
6676PyObject *split(PyUnicodeObject *self,
6677                PyUnicodeObject *substring,
6678                Py_ssize_t maxcount)
6679{
6680    if (maxcount < 0)
6681        maxcount = PY_SSIZE_T_MAX;
6682
6683    if (substring == NULL)
6684        return stringlib_split_whitespace(
6685            (PyObject*) self,  self->str, self->length, maxcount
6686            );
6687
6688    return stringlib_split(
6689        (PyObject*) self,  self->str, self->length,
6690        substring->str, substring->length,
6691        maxcount
6692        );
6693}
6694
6695static
6696PyObject *rsplit(PyUnicodeObject *self,
6697                 PyUnicodeObject *substring,
6698                 Py_ssize_t maxcount)
6699{
6700    if (maxcount < 0)
6701        maxcount = PY_SSIZE_T_MAX;
6702
6703    if (substring == NULL)
6704        return stringlib_rsplit_whitespace(
6705            (PyObject*) self,  self->str, self->length, maxcount
6706            );
6707
6708    return stringlib_rsplit(
6709        (PyObject*) self,  self->str, self->length,
6710        substring->str, substring->length,
6711        maxcount
6712        );
6713}
6714
6715static
6716PyObject *replace(PyUnicodeObject *self,
6717                  PyUnicodeObject *str1,
6718                  PyUnicodeObject *str2,
6719                  Py_ssize_t maxcount)
6720{
6721    PyUnicodeObject *u;
6722
6723    if (maxcount < 0)
6724        maxcount = PY_SSIZE_T_MAX;
6725    else if (maxcount == 0 || self->length == 0)
6726        goto nothing;
6727
6728    if (str1->length == str2->length) {
6729        Py_ssize_t i;
6730        /* same length */
6731        if (str1->length == 0)
6732            goto nothing;
6733        if (str1->length == 1) {
6734            /* replace characters */
6735            Py_UNICODE u1, u2;
6736            if (!findchar(self->str, self->length, str1->str[0]))
6737                goto nothing;
6738            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6739            if (!u)
6740                return NULL;
6741            Py_UNICODE_COPY(u->str, self->str, self->length);
6742            u1 = str1->str[0];
6743            u2 = str2->str[0];
6744            for (i = 0; i < u->length; i++)
6745                if (u->str[i] == u1) {
6746                    if (--maxcount < 0)
6747                        break;
6748                    u->str[i] = u2;
6749                }
6750        } else {
6751            i = stringlib_find(
6752                self->str, self->length, str1->str, str1->length, 0
6753                );
6754            if (i < 0)
6755                goto nothing;
6756            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6757            if (!u)
6758                return NULL;
6759            Py_UNICODE_COPY(u->str, self->str, self->length);
6760
6761            /* change everything in-place, starting with this one */
6762            Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6763            i += str1->length;
6764
6765            while ( --maxcount > 0) {
6766                i = stringlib_find(self->str+i, self->length-i,
6767                                   str1->str, str1->length,
6768                                   i);
6769                if (i == -1)
6770                    break;
6771                Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6772                i += str1->length;
6773            }
6774        }
6775    } else {
6776
6777        Py_ssize_t n, i, j, e;
6778        Py_ssize_t product, new_size, delta;
6779        Py_UNICODE *p;
6780
6781        /* replace strings */
6782        n = stringlib_count(self->str, self->length, str1->str, str1->length,
6783                            maxcount);
6784        if (n == 0)
6785            goto nothing;
6786        /* new_size = self->length + n * (str2->length - str1->length)); */
6787        delta = (str2->length - str1->length);
6788        if (delta == 0) {
6789            new_size = self->length;
6790        } else {
6791            product = n * (str2->length - str1->length);
6792            if ((product / (str2->length - str1->length)) != n) {
6793                PyErr_SetString(PyExc_OverflowError,
6794                                "replace string is too long");
6795                return NULL;
6796            }
6797            new_size = self->length + product;
6798            if (new_size < 0) {
6799                PyErr_SetString(PyExc_OverflowError,
6800                                "replace string is too long");
6801                return NULL;
6802            }
6803        }
6804        u = _PyUnicode_New(new_size);
6805        if (!u)
6806            return NULL;
6807        i = 0;
6808        p = u->str;
6809        e = self->length - str1->length;
6810        if (str1->length > 0) {
6811            while (n-- > 0) {
6812                /* look for next match */
6813                j = stringlib_find(self->str+i, self->length-i,
6814                                   str1->str, str1->length,
6815                                   i);
6816                if (j == -1)
6817                    break;
6818                else if (j > i) {
6819                    /* copy unchanged part [i:j] */
6820                    Py_UNICODE_COPY(p, self->str+i, j-i);
6821                    p += j - i;
6822                }
6823                /* copy substitution string */
6824                if (str2->length > 0) {
6825                    Py_UNICODE_COPY(p, str2->str, str2->length);
6826                    p += str2->length;
6827                }
6828                i = j + str1->length;
6829            }
6830            if (i < self->length)
6831                /* copy tail [i:] */
6832                Py_UNICODE_COPY(p, self->str+i, self->length-i);
6833        } else {
6834            /* interleave */
6835            while (n > 0) {
6836                Py_UNICODE_COPY(p, str2->str, str2->length);
6837                p += str2->length;
6838                if (--n <= 0)
6839                    break;
6840                *p++ = self->str[i++];
6841            }
6842            Py_UNICODE_COPY(p, self->str+i, self->length-i);
6843        }
6844    }
6845    return (PyObject *) u;
6846
6847  nothing:
6848    /* nothing to replace; return original string (when possible) */
6849    if (PyUnicode_CheckExact(self)) {
6850        Py_INCREF(self);
6851        return (PyObject *) self;
6852    }
6853    return PyUnicode_FromUnicode(self->str, self->length);
6854}
6855
6856/* --- Unicode Object Methods --------------------------------------------- */
6857
6858PyDoc_STRVAR(title__doc__,
6859             "S.title() -> str\n\
6860\n\
6861Return a titlecased version of S, i.e. words start with title case\n\
6862characters, all remaining cased characters have lower case.");
6863
6864static PyObject*
6865unicode_title(PyUnicodeObject *self)
6866{
6867    return fixup(self, fixtitle);
6868}
6869
6870PyDoc_STRVAR(capitalize__doc__,
6871             "S.capitalize() -> str\n\
6872\n\
6873Return a capitalized version of S, i.e. make the first character\n\
6874have upper case and the rest lower case.");
6875
6876static PyObject*
6877unicode_capitalize(PyUnicodeObject *self)
6878{
6879    return fixup(self, fixcapitalize);
6880}
6881
#if 0
/* Disabled: kept for reference only, this code is not compiled. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entries one by one */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                            fixcapitalize);
        if (capitalized == NULL)
            goto done;   /* result is still NULL */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string (NULL separator) */
    result = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return result;
}
#endif
6919
6920/* Argument converter.  Coerces to a single unicode character */
6921
6922static int
6923convert_uc(PyObject *obj, void *addr)
6924{
6925    Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6926    PyObject *uniobj;
6927    Py_UNICODE *unistr;
6928
6929    uniobj = PyUnicode_FromObject(obj);
6930    if (uniobj == NULL) {
6931        PyErr_SetString(PyExc_TypeError,
6932                        "The fill character cannot be converted to Unicode");
6933        return 0;
6934    }
6935    if (PyUnicode_GET_SIZE(uniobj) != 1) {
6936        PyErr_SetString(PyExc_TypeError,
6937                        "The fill character must be exactly one character long");
6938        Py_DECREF(uniobj);
6939        return 0;
6940    }
6941    unistr = PyUnicode_AS_UNICODE(uniobj);
6942    *fillcharloc = unistr[0];
6943    Py_DECREF(uniobj);
6944    return 1;
6945}
6946
6947PyDoc_STRVAR(center__doc__,
6948             "S.center(width[, fillchar]) -> str\n\
6949\n\
6950Return S centered in a string of length width. Padding is\n\
6951done using the specified fill character (default is a space)");
6952
6953static PyObject *
6954unicode_center(PyUnicodeObject *self, PyObject *args)
6955{
6956    Py_ssize_t marg, left;
6957    Py_ssize_t width;
6958    Py_UNICODE fillchar = ' ';
6959
6960    if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
6961        return NULL;
6962
6963    if (self->length >= width && PyUnicode_CheckExact(self)) {
6964        Py_INCREF(self);
6965        return (PyObject*) self;
6966    }
6967
6968    marg = width - self->length;
6969    left = marg / 2 + (marg & width & 1);
6970
6971    return (PyObject*) pad(self, left, marg - left, fillchar);
6972}
6973
6974#if 0
6975
6976/* This code should go into some future Unicode collation support
6977   module. The basic comparison should compare ordinals on a naive
6978   basis (this is what Java does and thus Jython too). */
6979
6980/* speedy UTF-16 code point order comparison */
6981/* gleaned from: */
6982/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6983
6984static short utf16Fixup[32] =
6985{
6986    0, 0, 0, 0, 0, 0, 0, 0,
6987    0, 0, 0, 0, 0, 0, 0, 0,
6988    0, 0, 0, 0, 0, 0, 0, 0,
6989    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
6990};
6991
6992static int
6993unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6994{
6995    Py_ssize_t len1, len2;
6996
6997    Py_UNICODE *s1 = str1->str;
6998    Py_UNICODE *s2 = str2->str;
6999
7000    len1 = str1->length;
7001    len2 = str2->length;
7002
7003    while (len1 > 0 && len2 > 0) {
7004        Py_UNICODE c1, c2;
7005
7006        c1 = *s1++;
7007        c2 = *s2++;
7008
7009        if (c1 > (1<<11) * 26)
7010            c1 += utf16Fixup[c1>>11];
7011        if (c2 > (1<<11) * 26)
7012            c2 += utf16Fixup[c2>>11];
7013        /* now c1 and c2 are in UTF-32-compatible order */
7014
7015        if (c1 != c2)
7016            return (c1 < c2) ? -1 : 1;
7017
7018        len1--; len2--;
7019    }
7020
7021    return (len1 < len2) ? -1 : (len1 != len2);
7022}
7023
7024#else
7025
7026static int
7027unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7028{
7029    register Py_ssize_t len1, len2;
7030
7031    Py_UNICODE *s1 = str1->str;
7032    Py_UNICODE *s2 = str2->str;
7033
7034    len1 = str1->length;
7035    len2 = str2->length;
7036
7037    while (len1 > 0 && len2 > 0) {
7038        Py_UNICODE c1, c2;
7039
7040        c1 = *s1++;
7041        c2 = *s2++;
7042
7043        if (c1 != c2)
7044            return (c1 < c2) ? -1 : 1;
7045
7046        len1--; len2--;
7047    }
7048
7049    return (len1 < len2) ? -1 : (len1 != len2);
7050}
7051
7052#endif
7053
7054int PyUnicode_Compare(PyObject *left,
7055                      PyObject *right)
7056{
7057    if (PyUnicode_Check(left) && PyUnicode_Check(right))
7058        return unicode_compare((PyUnicodeObject *)left,
7059                               (PyUnicodeObject *)right);
7060    PyErr_Format(PyExc_TypeError,
7061                 "Can't compare %.100s and %.100s",
7062                 left->ob_type->tp_name,
7063                 right->ob_type->tp_name);
7064    return -1;
7065}
7066
7067int
7068PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
7069{
7070    int i;
7071    Py_UNICODE *id;
7072    assert(PyUnicode_Check(uni));
7073    id = PyUnicode_AS_UNICODE(uni);
7074    /* Compare Unicode string and source character set string */
7075    for (i = 0; id[i] && str[i]; i++)
7076        if (id[i] != str[i])
7077            return ((int)id[i] < (int)str[i]) ? -1 : 1;
7078    /* This check keeps Python strings that end in '\0' from comparing equal
7079     to C strings identical up to that point. */
7080    if (PyUnicode_GET_SIZE(uni) != i || id[i])
7081        return 1; /* uni is longer */
7082    if (str[i])
7083        return -1; /* str is longer */
7084    return 0;
7085}
7086
7087
7088#define TEST_COND(cond)                         \
7089    ((cond) ? Py_True : Py_False)
7090
7091PyObject *PyUnicode_RichCompare(PyObject *left,
7092                                PyObject *right,
7093                                int op)
7094{
7095    int result;
7096
7097    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
7098        PyObject *v;
7099        if (((PyUnicodeObject *) left)->length !=
7100            ((PyUnicodeObject *) right)->length) {
7101            if (op == Py_EQ) {
7102                Py_INCREF(Py_False);
7103                return Py_False;
7104            }
7105            if (op == Py_NE) {
7106                Py_INCREF(Py_True);
7107                return Py_True;
7108            }
7109        }
7110        if (left == right)
7111            result = 0;
7112        else
7113            result = unicode_compare((PyUnicodeObject *)left,
7114                                     (PyUnicodeObject *)right);
7115
7116        /* Convert the return value to a Boolean */
7117        switch (op) {
7118        case Py_EQ:
7119            v = TEST_COND(result == 0);
7120            break;
7121        case Py_NE:
7122            v = TEST_COND(result != 0);
7123            break;
7124        case Py_LE:
7125            v = TEST_COND(result <= 0);
7126            break;
7127        case Py_GE:
7128            v = TEST_COND(result >= 0);
7129            break;
7130        case Py_LT:
7131            v = TEST_COND(result == -1);
7132            break;
7133        case Py_GT:
7134            v = TEST_COND(result == 1);
7135            break;
7136        default:
7137            PyErr_BadArgument();
7138            return NULL;
7139        }
7140        Py_INCREF(v);
7141        return v;
7142    }
7143
7144    Py_INCREF(Py_NotImplemented);
7145    return Py_NotImplemented;
7146}
7147
7148int PyUnicode_Contains(PyObject *container,
7149                       PyObject *element)
7150{
7151    PyObject *str, *sub;
7152    int result;
7153
7154    /* Coerce the two arguments */
7155    sub = PyUnicode_FromObject(element);
7156    if (!sub) {
7157        PyErr_Format(PyExc_TypeError,
7158                     "'in <string>' requires string as left operand, not %s",
7159                     element->ob_type->tp_name);
7160        return -1;
7161    }
7162
7163    str = PyUnicode_FromObject(container);
7164    if (!str) {
7165        Py_DECREF(sub);
7166        return -1;
7167    }
7168
7169    result = stringlib_contains_obj(str, sub);
7170
7171    Py_DECREF(str);
7172    Py_DECREF(sub);
7173
7174    return result;
7175}
7176
7177/* Concat to string or Unicode object giving a new Unicode object. */
7178
7179PyObject *PyUnicode_Concat(PyObject *left,
7180                           PyObject *right)
7181{
7182    PyUnicodeObject *u = NULL, *v = NULL, *w;
7183
7184    /* Coerce the two arguments */
7185    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7186    if (u == NULL)
7187        goto onError;
7188    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7189    if (v == NULL)
7190        goto onError;
7191
7192    /* Shortcuts */
7193    if (v == unicode_empty) {
7194        Py_DECREF(v);
7195        return (PyObject *)u;
7196    }
7197    if (u == unicode_empty) {
7198        Py_DECREF(u);
7199        return (PyObject *)v;
7200    }
7201
7202    /* Concat the two Unicode strings */
7203    w = _PyUnicode_New(u->length + v->length);
7204    if (w == NULL)
7205        goto onError;
7206    Py_UNICODE_COPY(w->str, u->str, u->length);
7207    Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7208
7209    Py_DECREF(u);
7210    Py_DECREF(v);
7211    return (PyObject *)w;
7212
7213  onError:
7214    Py_XDECREF(u);
7215    Py_XDECREF(v);
7216    return NULL;
7217}
7218
7219void
7220PyUnicode_Append(PyObject **pleft, PyObject *right)
7221{
7222    PyObject *new;
7223    if (*pleft == NULL)
7224        return;
7225    if (right == NULL || !PyUnicode_Check(*pleft)) {
7226        Py_DECREF(*pleft);
7227        *pleft = NULL;
7228        return;
7229    }
7230    new = PyUnicode_Concat(*pleft, right);
7231    Py_DECREF(*pleft);
7232    *pleft = new;
7233}
7234
7235void
7236PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7237{
7238    PyUnicode_Append(pleft, right);
7239    Py_XDECREF(right);
7240}
7241
7242PyDoc_STRVAR(count__doc__,
7243             "S.count(sub[, start[, end]]) -> int\n\
7244\n\
7245Return the number of non-overlapping occurrences of substring sub in\n\
7246string S[start:end].  Optional arguments start and end are\n\
7247interpreted as in slice notation.");
7248
7249static PyObject *
7250unicode_count(PyUnicodeObject *self, PyObject *args)
7251{
7252    PyUnicodeObject *substring;
7253    Py_ssize_t start = 0;
7254    Py_ssize_t end = PY_SSIZE_T_MAX;
7255    PyObject *result;
7256
7257    if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
7258                          _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7259        return NULL;
7260
7261    substring = (PyUnicodeObject *)PyUnicode_FromObject(
7262        (PyObject *)substring);
7263    if (substring == NULL)
7264        return NULL;
7265
7266    ADJUST_INDICES(start, end, self->length);
7267    result = PyLong_FromSsize_t(
7268        stringlib_count(self->str + start, end - start,
7269                        substring->str, substring->length,
7270                        PY_SSIZE_T_MAX)
7271        );
7272
7273    Py_DECREF(substring);
7274
7275    return result;
7276}
7277
7278PyDoc_STRVAR(encode__doc__,
7279             "S.encode([encoding[, errors]]) -> bytes\n\
7280\n\
7281Encode S using the codec registered for encoding. encoding defaults\n\
7282to the default encoding. errors may be given to set a different error\n\
7283handling scheme. Default is 'strict' meaning that encoding errors raise\n\
7284a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7285'xmlcharrefreplace' as well as any other name registered with\n\
7286codecs.register_error that can handle UnicodeEncodeErrors.");
7287
7288static PyObject *
7289unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
7290{
7291    static char *kwlist[] = {"encoding", "errors", 0};
7292    char *encoding = NULL;
7293    char *errors = NULL;
7294    PyObject *v;
7295
7296    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7297                                     kwlist, &encoding, &errors))
7298        return NULL;
7299    v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
7300    if (v == NULL)
7301        goto onError;
7302    if (!PyBytes_Check(v)) {
7303        PyErr_Format(PyExc_TypeError,
7304                     "encoder did not return a bytes object "
7305                     "(type=%.400s)",
7306                     Py_TYPE(v)->tp_name);
7307        Py_DECREF(v);
7308        return NULL;
7309    }
7310    return v;
7311
7312  onError:
7313    return NULL;
7314}
7315
7316PyDoc_STRVAR(expandtabs__doc__,
7317             "S.expandtabs([tabsize]) -> str\n\
7318\n\
7319Return a copy of S where all tab characters are expanded using spaces.\n\
7320If tabsize is not given, a tab size of 8 characters is assumed.");
7321
7322static PyObject*
7323unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7324{
7325    Py_UNICODE *e;
7326    Py_UNICODE *p;
7327    Py_UNICODE *q;
7328    Py_UNICODE *qe;
7329    Py_ssize_t i, j, incr;
7330    PyUnicodeObject *u;
7331    int tabsize = 8;
7332
7333    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
7334        return NULL;
7335
7336    /* First pass: determine size of output string */
7337    i = 0; /* chars up to and including most recent \n or \r */
7338    j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7339    e = self->str + self->length; /* end of input */
7340    for (p = self->str; p < e; p++)
7341        if (*p == '\t') {
7342            if (tabsize > 0) {
7343                incr = tabsize - (j % tabsize); /* cannot overflow */
7344                if (j > PY_SSIZE_T_MAX - incr)
7345                    goto overflow1;
7346                j += incr;
7347            }
7348        }
7349        else {
7350            if (j > PY_SSIZE_T_MAX - 1)
7351                goto overflow1;
7352            j++;
7353            if (*p == '\n' || *p == '\r') {
7354                if (i > PY_SSIZE_T_MAX - j)
7355                    goto overflow1;
7356                i += j;
7357                j = 0;
7358            }
7359        }
7360
7361    if (i > PY_SSIZE_T_MAX - j)
7362        goto overflow1;
7363
7364    /* Second pass: create output string and fill it */
7365    u = _PyUnicode_New(i + j);
7366    if (!u)
7367        return NULL;
7368
7369    j = 0; /* same as in first pass */
7370    q = u->str; /* next output char */
7371    qe = u->str + u->length; /* end of output */
7372
7373    for (p = self->str; p < e; p++)
7374        if (*p == '\t') {
7375            if (tabsize > 0) {
7376                i = tabsize - (j % tabsize);
7377                j += i;
7378                while (i--) {
7379                    if (q >= qe)
7380                        goto overflow2;
7381                    *q++ = ' ';
7382                }
7383            }
7384        }
7385        else {
7386            if (q >= qe)
7387                goto overflow2;
7388            *q++ = *p;
7389            j++;
7390            if (*p == '\n' || *p == '\r')
7391                j = 0;
7392        }
7393
7394    return (PyObject*) u;
7395
7396  overflow2:
7397    Py_DECREF(u);
7398  overflow1:
7399    PyErr_SetString(PyExc_OverflowError, "new string is too long");
7400    return NULL;
7401}
7402
7403PyDoc_STRVAR(find__doc__,
7404             "S.find(sub[, start[, end]]) -> int\n\
7405\n\
7406Return the lowest index in S where substring sub is found,\n\
7407such that sub is contained within s[start:end].  Optional\n\
7408arguments start and end are interpreted as in slice notation.\n\
7409\n\
7410Return -1 on failure.");
7411
7412static PyObject *
7413unicode_find(PyUnicodeObject *self, PyObject *args)
7414{
7415    PyObject *substring;
7416    Py_ssize_t start;
7417    Py_ssize_t end;
7418    Py_ssize_t result;
7419
7420    if (!_ParseTupleFinds(args, &substring, &start, &end))
7421        return NULL;
7422
7423    result = stringlib_find_slice(
7424        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7425        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7426        start, end
7427        );
7428
7429    Py_DECREF(substring);
7430
7431    return PyLong_FromSsize_t(result);
7432}
7433
7434static PyObject *
7435unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
7436{
7437    if (index < 0 || index >= self->length) {
7438        PyErr_SetString(PyExc_IndexError, "string index out of range");
7439        return NULL;
7440    }
7441
7442    return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7443}
7444
7445/* Believe it or not, this produces the same value for ASCII strings
7446   as string_hash(). */
7447static long
7448unicode_hash(PyUnicodeObject *self)
7449{
7450    Py_ssize_t len;
7451    Py_UNICODE *p;
7452    long x;
7453
7454    if (self->hash != -1)
7455        return self->hash;
7456    len = Py_SIZE(self);
7457    p = self->str;
7458    x = *p << 7;
7459    while (--len >= 0)
7460        x = (1000003*x) ^ *p++;
7461    x ^= Py_SIZE(self);
7462    if (x == -1)
7463        x = -2;
7464    self->hash = x;
7465    return x;
7466}
7467
7468PyDoc_STRVAR(index__doc__,
7469             "S.index(sub[, start[, end]]) -> int\n\
7470\n\
7471Like S.find() but raise ValueError when the substring is not found.");
7472
7473static PyObject *
7474unicode_index(PyUnicodeObject *self, PyObject *args)
7475{
7476    Py_ssize_t result;
7477    PyObject *substring;
7478    Py_ssize_t start;
7479    Py_ssize_t end;
7480
7481    if (!_ParseTupleFinds(args, &substring, &start, &end))
7482        return NULL;
7483
7484    result = stringlib_find_slice(
7485        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7486        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7487        start, end
7488        );
7489
7490    Py_DECREF(substring);
7491
7492    if (result < 0) {
7493        PyErr_SetString(PyExc_ValueError, "substring not found");
7494        return NULL;
7495    }
7496
7497    return PyLong_FromSsize_t(result);
7498}
7499
7500PyDoc_STRVAR(islower__doc__,
7501             "S.islower() -> bool\n\
7502\n\
7503Return True if all cased characters in S are lowercase and there is\n\
7504at least one cased character in S, False otherwise.");
7505
7506static PyObject*
7507unicode_islower(PyUnicodeObject *self)
7508{
7509    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7510    register const Py_UNICODE *e;
7511    int cased;
7512
7513    /* Shortcut for single character strings */
7514    if (PyUnicode_GET_SIZE(self) == 1)
7515        return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
7516
7517    /* Special case for empty strings */
7518    if (PyUnicode_GET_SIZE(self) == 0)
7519        return PyBool_FromLong(0);
7520
7521    e = p + PyUnicode_GET_SIZE(self);
7522    cased = 0;
7523    for (; p < e; p++) {
7524        register const Py_UNICODE ch = *p;
7525
7526        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7527            return PyBool_FromLong(0);
7528        else if (!cased && Py_UNICODE_ISLOWER(ch))
7529            cased = 1;
7530    }
7531    return PyBool_FromLong(cased);
7532}
7533
7534PyDoc_STRVAR(isupper__doc__,
7535             "S.isupper() -> bool\n\
7536\n\
7537Return True if all cased characters in S are uppercase and there is\n\
7538at least one cased character in S, False otherwise.");
7539
7540static PyObject*
7541unicode_isupper(PyUnicodeObject *self)
7542{
7543    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7544    register const Py_UNICODE *e;
7545    int cased;
7546
7547    /* Shortcut for single character strings */
7548    if (PyUnicode_GET_SIZE(self) == 1)
7549        return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
7550
7551    /* Special case for empty strings */
7552    if (PyUnicode_GET_SIZE(self) == 0)
7553        return PyBool_FromLong(0);
7554
7555    e = p + PyUnicode_GET_SIZE(self);
7556    cased = 0;
7557    for (; p < e; p++) {
7558        register const Py_UNICODE ch = *p;
7559
7560        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7561            return PyBool_FromLong(0);
7562        else if (!cased && Py_UNICODE_ISUPPER(ch))
7563            cased = 1;
7564    }
7565    return PyBool_FromLong(cased);
7566}
7567
7568PyDoc_STRVAR(istitle__doc__,
7569             "S.istitle() -> bool\n\
7570\n\
7571Return True if S is a titlecased string and there is at least one\n\
7572character in S, i.e. upper- and titlecase characters may only\n\
7573follow uncased characters and lowercase characters only cased ones.\n\
7574Return False otherwise.");
7575
7576static PyObject*
7577unicode_istitle(PyUnicodeObject *self)
7578{
7579    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7580    register const Py_UNICODE *e;
7581    int cased, previous_is_cased;
7582
7583    /* Shortcut for single character strings */
7584    if (PyUnicode_GET_SIZE(self) == 1)
7585        return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7586                               (Py_UNICODE_ISUPPER(*p) != 0));
7587
7588    /* Special case for empty strings */
7589    if (PyUnicode_GET_SIZE(self) == 0)
7590        return PyBool_FromLong(0);
7591
7592    e = p + PyUnicode_GET_SIZE(self);
7593    cased = 0;
7594    previous_is_cased = 0;
7595    for (; p < e; p++) {
7596        register const Py_UNICODE ch = *p;
7597
7598        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7599            if (previous_is_cased)
7600                return PyBool_FromLong(0);
7601            previous_is_cased = 1;
7602            cased = 1;
7603        }
7604        else if (Py_UNICODE_ISLOWER(ch)) {
7605            if (!previous_is_cased)
7606                return PyBool_FromLong(0);
7607            previous_is_cased = 1;
7608            cased = 1;
7609        }
7610        else
7611            previous_is_cased = 0;
7612    }
7613    return PyBool_FromLong(cased);
7614}
7615
7616PyDoc_STRVAR(isspace__doc__,
7617             "S.isspace() -> bool\n\
7618\n\
7619Return True if all characters in S are whitespace\n\
7620and there is at least one character in S, False otherwise.");
7621
7622static PyObject*
7623unicode_isspace(PyUnicodeObject *self)
7624{
7625    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7626    register const Py_UNICODE *e;
7627
7628    /* Shortcut for single character strings */
7629    if (PyUnicode_GET_SIZE(self) == 1 &&
7630        Py_UNICODE_ISSPACE(*p))
7631        return PyBool_FromLong(1);
7632
7633    /* Special case for empty strings */
7634    if (PyUnicode_GET_SIZE(self) == 0)
7635        return PyBool_FromLong(0);
7636
7637    e = p + PyUnicode_GET_SIZE(self);
7638    for (; p < e; p++) {
7639        if (!Py_UNICODE_ISSPACE(*p))
7640            return PyBool_FromLong(0);
7641    }
7642    return PyBool_FromLong(1);
7643}
7644
7645PyDoc_STRVAR(isalpha__doc__,
7646             "S.isalpha() -> bool\n\
7647\n\
7648Return True if all characters in S are alphabetic\n\
7649and there is at least one character in S, False otherwise.");
7650
7651static PyObject*
7652unicode_isalpha(PyUnicodeObject *self)
7653{
7654    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7655    register const Py_UNICODE *e;
7656
7657    /* Shortcut for single character strings */
7658    if (PyUnicode_GET_SIZE(self) == 1 &&
7659        Py_UNICODE_ISALPHA(*p))
7660        return PyBool_FromLong(1);
7661
7662    /* Special case for empty strings */
7663    if (PyUnicode_GET_SIZE(self) == 0)
7664        return PyBool_FromLong(0);
7665
7666    e = p + PyUnicode_GET_SIZE(self);
7667    for (; p < e; p++) {
7668        if (!Py_UNICODE_ISALPHA(*p))
7669            return PyBool_FromLong(0);
7670    }
7671    return PyBool_FromLong(1);
7672}
7673
7674PyDoc_STRVAR(isalnum__doc__,
7675             "S.isalnum() -> bool\n\
7676\n\
7677Return True if all characters in S are alphanumeric\n\
7678and there is at least one character in S, False otherwise.");
7679
7680static PyObject*
7681unicode_isalnum(PyUnicodeObject *self)
7682{
7683    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7684    register const Py_UNICODE *e;
7685
7686    /* Shortcut for single character strings */
7687    if (PyUnicode_GET_SIZE(self) == 1 &&
7688        Py_UNICODE_ISALNUM(*p))
7689        return PyBool_FromLong(1);
7690
7691    /* Special case for empty strings */
7692    if (PyUnicode_GET_SIZE(self) == 0)
7693        return PyBool_FromLong(0);
7694
7695    e = p + PyUnicode_GET_SIZE(self);
7696    for (; p < e; p++) {
7697        if (!Py_UNICODE_ISALNUM(*p))
7698            return PyBool_FromLong(0);
7699    }
7700    return PyBool_FromLong(1);
7701}
7702
7703PyDoc_STRVAR(isdecimal__doc__,
7704             "S.isdecimal() -> bool\n\
7705\n\
7706Return True if there are only decimal characters in S,\n\
7707False otherwise.");
7708
7709static PyObject*
7710unicode_isdecimal(PyUnicodeObject *self)
7711{
7712    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7713    register const Py_UNICODE *e;
7714
7715    /* Shortcut for single character strings */
7716    if (PyUnicode_GET_SIZE(self) == 1 &&
7717        Py_UNICODE_ISDECIMAL(*p))
7718        return PyBool_FromLong(1);
7719
7720    /* Special case for empty strings */
7721    if (PyUnicode_GET_SIZE(self) == 0)
7722        return PyBool_FromLong(0);
7723
7724    e = p + PyUnicode_GET_SIZE(self);
7725    for (; p < e; p++) {
7726        if (!Py_UNICODE_ISDECIMAL(*p))
7727            return PyBool_FromLong(0);
7728    }
7729    return PyBool_FromLong(1);
7730}
7731
7732PyDoc_STRVAR(isdigit__doc__,
7733             "S.isdigit() -> bool\n\
7734\n\
7735Return True if all characters in S are digits\n\
7736and there is at least one character in S, False otherwise.");
7737
7738static PyObject*
7739unicode_isdigit(PyUnicodeObject *self)
7740{
7741    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7742    register const Py_UNICODE *e;
7743
7744    /* Shortcut for single character strings */
7745    if (PyUnicode_GET_SIZE(self) == 1 &&
7746        Py_UNICODE_ISDIGIT(*p))
7747        return PyBool_FromLong(1);
7748
7749    /* Special case for empty strings */
7750    if (PyUnicode_GET_SIZE(self) == 0)
7751        return PyBool_FromLong(0);
7752
7753    e = p + PyUnicode_GET_SIZE(self);
7754    for (; p < e; p++) {
7755        if (!Py_UNICODE_ISDIGIT(*p))
7756            return PyBool_FromLong(0);
7757    }
7758    return PyBool_FromLong(1);
7759}
7760
7761PyDoc_STRVAR(isnumeric__doc__,
7762             "S.isnumeric() -> bool\n\
7763\n\
7764Return True if there are only numeric characters in S,\n\
7765False otherwise.");
7766
7767static PyObject*
7768unicode_isnumeric(PyUnicodeObject *self)
7769{
7770    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7771    register const Py_UNICODE *e;
7772
7773    /* Shortcut for single character strings */
7774    if (PyUnicode_GET_SIZE(self) == 1 &&
7775        Py_UNICODE_ISNUMERIC(*p))
7776        return PyBool_FromLong(1);
7777
7778    /* Special case for empty strings */
7779    if (PyUnicode_GET_SIZE(self) == 0)
7780        return PyBool_FromLong(0);
7781
7782    e = p + PyUnicode_GET_SIZE(self);
7783    for (; p < e; p++) {
7784        if (!Py_UNICODE_ISNUMERIC(*p))
7785            return PyBool_FromLong(0);
7786    }
7787    return PyBool_FromLong(1);
7788}
7789
7790int
7791PyUnicode_IsIdentifier(PyObject *self)
7792{
7793    register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7794    register const Py_UNICODE *e;
7795
7796    /* Special case for empty strings */
7797    if (PyUnicode_GET_SIZE(self) == 0)
7798        return 0;
7799
7800    /* PEP 3131 says that the first character must be in
7801       XID_Start and subsequent characters in XID_Continue,
7802       and for the ASCII range, the 2.x rules apply (i.e
7803       start with letters and underscore, continue with
7804       letters, digits, underscore). However, given the current
7805       definition of XID_Start and XID_Continue, it is sufficient
7806       to check just for these, except that _ must be allowed
7807       as starting an identifier.  */
7808    if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7809        return 0;
7810
7811    e = p + PyUnicode_GET_SIZE(self);
7812    for (p++; p < e; p++) {
7813        if (!_PyUnicode_IsXidContinue(*p))
7814            return 0;
7815    }
7816    return 1;
7817}
7818
7819PyDoc_STRVAR(isidentifier__doc__,
7820             "S.isidentifier() -> bool\n\
7821\n\
7822Return True if S is a valid identifier according\n\
7823to the language definition.");
7824
7825static PyObject*
7826unicode_isidentifier(PyObject *self)
7827{
7828    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7829}
7830
7831PyDoc_STRVAR(isprintable__doc__,
7832             "S.isprintable() -> bool\n\
7833\n\
7834Return True if all characters in S are considered\n\
7835printable in repr() or S is empty, False otherwise.");
7836
7837static PyObject*
7838unicode_isprintable(PyObject *self)
7839{
7840    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7841    register const Py_UNICODE *e;
7842
7843    /* Shortcut for single character strings */
7844    if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7845        Py_RETURN_TRUE;
7846    }
7847
7848    e = p + PyUnicode_GET_SIZE(self);
7849    for (; p < e; p++) {
7850        if (!Py_UNICODE_ISPRINTABLE(*p)) {
7851            Py_RETURN_FALSE;
7852        }
7853    }
7854    Py_RETURN_TRUE;
7855}
7856
7857PyDoc_STRVAR(join__doc__,
7858             "S.join(iterable) -> str\n\
7859\n\
7860Return a string which is the concatenation of the strings in the\n\
7861iterable.  The separator between elements is S.");
7862
7863static PyObject*
7864unicode_join(PyObject *self, PyObject *data)
7865{
7866    return PyUnicode_Join(self, data);
7867}
7868
7869static Py_ssize_t
7870unicode_length(PyUnicodeObject *self)
7871{
7872    return self->length;
7873}
7874
7875PyDoc_STRVAR(ljust__doc__,
7876             "S.ljust(width[, fillchar]) -> str\n\
7877\n\
7878Return S left-justified in a Unicode string of length width. Padding is\n\
7879done using the specified fill character (default is a space).");
7880
7881static PyObject *
7882unicode_ljust(PyUnicodeObject *self, PyObject *args)
7883{
7884    Py_ssize_t width;
7885    Py_UNICODE fillchar = ' ';
7886
7887    if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
7888        return NULL;
7889
7890    if (self->length >= width && PyUnicode_CheckExact(self)) {
7891        Py_INCREF(self);
7892        return (PyObject*) self;
7893    }
7894
7895    return (PyObject*) pad(self, 0, width - self->length, fillchar);
7896}
7897
7898PyDoc_STRVAR(lower__doc__,
7899             "S.lower() -> str\n\
7900\n\
7901Return a copy of the string S converted to lowercase.");
7902
7903static PyObject*
7904unicode_lower(PyUnicodeObject *self)
7905{
7906    return fixup(self, fixlower);
7907}
7908
/* Selectors telling the strip helpers which end(s) of the string
   to trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the selectors above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: the format string with its three-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
7917
7918/* externally visible for str.strip(unicode) */
7919PyObject *
7920_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7921{
7922    Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7923    Py_ssize_t len = PyUnicode_GET_SIZE(self);
7924    Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7925    Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7926    Py_ssize_t i, j;
7927
7928    BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7929
7930    i = 0;
7931    if (striptype != RIGHTSTRIP) {
7932        while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7933            i++;
7934        }
7935    }
7936
7937    j = len;
7938    if (striptype != LEFTSTRIP) {
7939        do {
7940            j--;
7941        } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7942        j++;
7943    }
7944
7945    if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7946        Py_INCREF(self);
7947        return (PyObject*)self;
7948    }
7949    else
7950        return PyUnicode_FromUnicode(s+i, j-i);
7951}
7952
7953
7954static PyObject *
7955do_strip(PyUnicodeObject *self, int striptype)
7956{
7957    Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7958    Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
7959
7960    i = 0;
7961    if (striptype != RIGHTSTRIP) {
7962        while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7963            i++;
7964        }
7965    }
7966
7967    j = len;
7968    if (striptype != LEFTSTRIP) {
7969        do {
7970            j--;
7971        } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7972        j++;
7973    }
7974
7975    if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7976        Py_INCREF(self);
7977        return (PyObject*)self;
7978    }
7979    else
7980        return PyUnicode_FromUnicode(s+i, j-i);
7981}
7982
7983
7984static PyObject *
7985do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7986{
7987    PyObject *sep = NULL;
7988
7989    if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7990        return NULL;
7991
7992    if (sep != NULL && sep != Py_None) {
7993        if (PyUnicode_Check(sep))
7994            return _PyUnicode_XStrip(self, striptype, sep);
7995        else {
7996            PyErr_Format(PyExc_TypeError,
7997                         "%s arg must be None or str",
7998                         STRIPNAME(striptype));
7999            return NULL;
8000        }
8001    }
8002
8003    return do_strip(self, striptype);
8004}
8005
8006
8007PyDoc_STRVAR(strip__doc__,
8008             "S.strip([chars]) -> str\n\
8009\n\
8010Return a copy of the string S with leading and trailing\n\
8011whitespace removed.\n\
8012If chars is given and not None, remove characters in chars instead.");
8013
8014static PyObject *
8015unicode_strip(PyUnicodeObject *self, PyObject *args)
8016{
8017    if (PyTuple_GET_SIZE(args) == 0)
8018        return do_strip(self, BOTHSTRIP); /* Common case */
8019    else
8020        return do_argstrip(self, BOTHSTRIP, args);
8021}
8022
8023
8024PyDoc_STRVAR(lstrip__doc__,
8025             "S.lstrip([chars]) -> str\n\
8026\n\
8027Return a copy of the string S with leading whitespace removed.\n\
8028If chars is given and not None, remove characters in chars instead.");
8029
8030static PyObject *
8031unicode_lstrip(PyUnicodeObject *self, PyObject *args)
8032{
8033    if (PyTuple_GET_SIZE(args) == 0)
8034        return do_strip(self, LEFTSTRIP); /* Common case */
8035    else
8036        return do_argstrip(self, LEFTSTRIP, args);
8037}
8038
8039
8040PyDoc_STRVAR(rstrip__doc__,
8041             "S.rstrip([chars]) -> str\n\
8042\n\
8043Return a copy of the string S with trailing whitespace removed.\n\
8044If chars is given and not None, remove characters in chars instead.");
8045
8046static PyObject *
8047unicode_rstrip(PyUnicodeObject *self, PyObject *args)
8048{
8049    if (PyTuple_GET_SIZE(args) == 0)
8050        return do_strip(self, RIGHTSTRIP); /* Common case */
8051    else
8052        return do_argstrip(self, RIGHTSTRIP, args);
8053}
8054
8055
8056static PyObject*
8057unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
8058{
8059    PyUnicodeObject *u;
8060    Py_UNICODE *p;
8061    Py_ssize_t nchars;
8062    size_t nbytes;
8063
8064    if (len < 1) {
8065        Py_INCREF(unicode_empty);
8066        return (PyObject *)unicode_empty;
8067    }
8068
8069    if (len == 1 && PyUnicode_CheckExact(str)) {
8070        /* no repeat, return original string */
8071        Py_INCREF(str);
8072        return (PyObject*) str;
8073    }
8074
8075    /* ensure # of chars needed doesn't overflow int and # of bytes
8076     * needed doesn't overflow size_t
8077     */
8078    nchars = len * str->length;
8079    if (nchars / len != str->length) {
8080        PyErr_SetString(PyExc_OverflowError,
8081                        "repeated string is too long");
8082        return NULL;
8083    }
8084    nbytes = (nchars + 1) * sizeof(Py_UNICODE);
8085    if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
8086        PyErr_SetString(PyExc_OverflowError,
8087                        "repeated string is too long");
8088        return NULL;
8089    }
8090    u = _PyUnicode_New(nchars);
8091    if (!u)
8092        return NULL;
8093
8094    p = u->str;
8095
8096    if (str->length == 1) {
8097        Py_UNICODE_FILL(p, str->str[0], len);
8098    } else {
8099        Py_ssize_t done = str->length; /* number of characters copied this far */
8100        Py_UNICODE_COPY(p, str->str, str->length);
8101        while (done < nchars) {
8102            Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
8103            Py_UNICODE_COPY(p+done, p, n);
8104            done += n;
8105        }
8106    }
8107
8108    return (PyObject*) u;
8109}
8110
8111PyObject *PyUnicode_Replace(PyObject *obj,
8112                            PyObject *subobj,
8113                            PyObject *replobj,
8114                            Py_ssize_t maxcount)
8115{
8116    PyObject *self;
8117    PyObject *str1;
8118    PyObject *str2;
8119    PyObject *result;
8120
8121    self = PyUnicode_FromObject(obj);
8122    if (self == NULL)
8123        return NULL;
8124    str1 = PyUnicode_FromObject(subobj);
8125    if (str1 == NULL) {
8126        Py_DECREF(self);
8127        return NULL;
8128    }
8129    str2 = PyUnicode_FromObject(replobj);
8130    if (str2 == NULL) {
8131        Py_DECREF(self);
8132        Py_DECREF(str1);
8133        return NULL;
8134    }
8135    result = replace((PyUnicodeObject *)self,
8136                     (PyUnicodeObject *)str1,
8137                     (PyUnicodeObject *)str2,
8138                     maxcount);
8139    Py_DECREF(self);
8140    Py_DECREF(str1);
8141    Py_DECREF(str2);
8142    return result;
8143}
8144
8145PyDoc_STRVAR(replace__doc__,
8146             "S.replace(old, new[, count]) -> str\n\
8147\n\
8148Return a copy of S with all occurrences of substring\n\
8149old replaced by new.  If the optional argument count is\n\
8150given, only the first count occurrences are replaced.");
8151
8152static PyObject*
8153unicode_replace(PyUnicodeObject *self, PyObject *args)
8154{
8155    PyUnicodeObject *str1;
8156    PyUnicodeObject *str2;
8157    Py_ssize_t maxcount = -1;
8158    PyObject *result;
8159
8160    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
8161        return NULL;
8162    str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8163    if (str1 == NULL)
8164        return NULL;
8165    str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
8166    if (str2 == NULL) {
8167        Py_DECREF(str1);
8168        return NULL;
8169    }
8170
8171    result = replace(self, str1, str2, maxcount);
8172
8173    Py_DECREF(str1);
8174    Py_DECREF(str2);
8175    return result;
8176}
8177
8178static
8179PyObject *unicode_repr(PyObject *unicode)
8180{
8181    PyObject *repr;
8182    Py_UNICODE *p;
8183    Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8184    Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8185
8186    /* XXX(nnorwitz): rather than over-allocating, it would be
8187       better to choose a different scheme.  Perhaps scan the
8188       first N-chars of the string and allocate based on that size.
8189    */
8190    /* Initial allocation is based on the longest-possible unichr
8191       escape.
8192
8193       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8194       unichr, so in this case it's the longest unichr escape. In
8195       narrow (UTF-16) builds this is five chars per source unichr
8196       since there are two unichrs in the surrogate pair, so in narrow
8197       (UTF-16) builds it's not the longest unichr escape.
8198
8199       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8200       so in the narrow (UTF-16) build case it's the longest unichr
8201       escape.
8202    */
8203
8204    repr = PyUnicode_FromUnicode(NULL,
8205                                 2 /* quotes */
8206#ifdef Py_UNICODE_WIDE
8207                                 + 10*size
8208#else
8209                                 + 6*size
8210#endif
8211                                 + 1);
8212    if (repr == NULL)
8213        return NULL;
8214
8215    p = PyUnicode_AS_UNICODE(repr);
8216
8217    /* Add quote */
8218    *p++ = (findchar(s, size, '\'') &&
8219            !findchar(s, size, '"')) ? '"' : '\'';
8220    while (size-- > 0) {
8221        Py_UNICODE ch = *s++;
8222
8223        /* Escape quotes and backslashes */
8224        if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
8225            *p++ = '\\';
8226            *p++ = ch;
8227            continue;
8228        }
8229
8230        /* Map special whitespace to '\t', \n', '\r' */
8231        if (ch == '\t') {
8232            *p++ = '\\';
8233            *p++ = 't';
8234        }
8235        else if (ch == '\n') {
8236            *p++ = '\\';
8237            *p++ = 'n';
8238        }
8239        else if (ch == '\r') {
8240            *p++ = '\\';
8241            *p++ = 'r';
8242        }
8243
8244        /* Map non-printable US ASCII to '\xhh' */
8245        else if (ch < ' ' || ch == 0x7F) {
8246            *p++ = '\\';
8247            *p++ = 'x';
8248            *p++ = hexdigits[(ch >> 4) & 0x000F];
8249            *p++ = hexdigits[ch & 0x000F];
8250        }
8251
8252        /* Copy ASCII characters as-is */
8253        else if (ch < 0x7F) {
8254            *p++ = ch;
8255        }
8256
8257        /* Non-ASCII characters */
8258        else {
8259            Py_UCS4 ucs = ch;
8260
8261#ifndef Py_UNICODE_WIDE
8262            Py_UNICODE ch2 = 0;
8263            /* Get code point from surrogate pair */
8264            if (size > 0) {
8265                ch2 = *s;
8266                if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
8267                    && ch2 <= 0xDFFF) {
8268                    ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
8269                        + 0x00010000;
8270                    s++;
8271                    size--;
8272                }
8273            }
8274#endif
8275            /* Map Unicode whitespace and control characters
8276               (categories Z* and C* except ASCII space)
8277            */
8278            if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8279                /* Map 8-bit characters to '\xhh' */
8280                if (ucs <= 0xff) {
8281                    *p++ = '\\';
8282                    *p++ = 'x';
8283                    *p++ = hexdigits[(ch >> 4) & 0x000F];
8284                    *p++ = hexdigits[ch & 0x000F];
8285                }
8286                /* Map 21-bit characters to '\U00xxxxxx' */
8287                else if (ucs >= 0x10000) {
8288                    *p++ = '\\';
8289                    *p++ = 'U';
8290                    *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8291                    *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8292                    *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8293                    *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8294                    *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8295                    *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8296                    *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8297                    *p++ = hexdigits[ucs & 0x0000000F];
8298                }
8299                /* Map 16-bit characters to '\uxxxx' */
8300                else {
8301                    *p++ = '\\';
8302                    *p++ = 'u';
8303                    *p++ = hexdigits[(ucs >> 12) & 0x000F];
8304                    *p++ = hexdigits[(ucs >> 8) & 0x000F];
8305                    *p++ = hexdigits[(ucs >> 4) & 0x000F];
8306                    *p++ = hexdigits[ucs & 0x000F];
8307                }
8308            }
8309            /* Copy characters as-is */
8310            else {
8311                *p++ = ch;
8312#ifndef Py_UNICODE_WIDE
8313                if (ucs >= 0x10000)
8314                    *p++ = ch2;
8315#endif
8316            }
8317        }
8318    }
8319    /* Add quote */
8320    *p++ = PyUnicode_AS_UNICODE(repr)[0];
8321
8322    *p = '\0';
8323    PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
8324    return repr;
8325}
8326
8327PyDoc_STRVAR(rfind__doc__,
8328             "S.rfind(sub[, start[, end]]) -> int\n\
8329\n\
8330Return the highest index in S where substring sub is found,\n\
8331such that sub is contained within s[start:end].  Optional\n\
8332arguments start and end are interpreted as in slice notation.\n\
8333\n\
8334Return -1 on failure.");
8335
8336static PyObject *
8337unicode_rfind(PyUnicodeObject *self, PyObject *args)
8338{
8339    PyObject *substring;
8340    Py_ssize_t start;
8341    Py_ssize_t end;
8342    Py_ssize_t result;
8343
8344    if (!_ParseTupleFinds(args, &substring, &start, &end))
8345        return NULL;
8346
8347    result = stringlib_rfind_slice(
8348        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8349        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8350        start, end
8351        );
8352
8353    Py_DECREF(substring);
8354
8355    return PyLong_FromSsize_t(result);
8356}
8357
8358PyDoc_STRVAR(rindex__doc__,
8359             "S.rindex(sub[, start[, end]]) -> int\n\
8360\n\
8361Like S.rfind() but raise ValueError when the substring is not found.");
8362
8363static PyObject *
8364unicode_rindex(PyUnicodeObject *self, PyObject *args)
8365{
8366    PyObject *substring;
8367    Py_ssize_t start;
8368    Py_ssize_t end;
8369    Py_ssize_t result;
8370
8371    if (!_ParseTupleFinds(args, &substring, &start, &end))
8372        return NULL;
8373
8374    result = stringlib_rfind_slice(
8375        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8376        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8377        start, end
8378        );
8379
8380    Py_DECREF(substring);
8381
8382    if (result < 0) {
8383        PyErr_SetString(PyExc_ValueError, "substring not found");
8384        return NULL;
8385    }
8386    return PyLong_FromSsize_t(result);
8387}
8388
8389PyDoc_STRVAR(rjust__doc__,
8390             "S.rjust(width[, fillchar]) -> str\n\
8391\n\
8392Return S right-justified in a string of length width. Padding is\n\
8393done using the specified fill character (default is a space).");
8394
8395static PyObject *
8396unicode_rjust(PyUnicodeObject *self, PyObject *args)
8397{
8398    Py_ssize_t width;
8399    Py_UNICODE fillchar = ' ';
8400
8401    if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
8402        return NULL;
8403
8404    if (self->length >= width && PyUnicode_CheckExact(self)) {
8405        Py_INCREF(self);
8406        return (PyObject*) self;
8407    }
8408
8409    return (PyObject*) pad(self, width - self->length, 0, fillchar);
8410}
8411
8412PyObject *PyUnicode_Split(PyObject *s,
8413                          PyObject *sep,
8414                          Py_ssize_t maxsplit)
8415{
8416    PyObject *result;
8417
8418    s = PyUnicode_FromObject(s);
8419    if (s == NULL)
8420        return NULL;
8421    if (sep != NULL) {
8422        sep = PyUnicode_FromObject(sep);
8423        if (sep == NULL) {
8424            Py_DECREF(s);
8425            return NULL;
8426        }
8427    }
8428
8429    result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8430
8431    Py_DECREF(s);
8432    Py_XDECREF(sep);
8433    return result;
8434}
8435
8436PyDoc_STRVAR(split__doc__,
8437             "S.split([sep[, maxsplit]]) -> list of strings\n\
8438\n\
8439Return a list of the words in S, using sep as the\n\
8440delimiter string.  If maxsplit is given, at most maxsplit\n\
8441splits are done. If sep is not specified or is None, any\n\
8442whitespace string is a separator and empty strings are\n\
8443removed from the result.");
8444
8445static PyObject*
8446unicode_split(PyUnicodeObject *self, PyObject *args)
8447{
8448    PyObject *substring = Py_None;
8449    Py_ssize_t maxcount = -1;
8450
8451    if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
8452        return NULL;
8453
8454    if (substring == Py_None)
8455        return split(self, NULL, maxcount);
8456    else if (PyUnicode_Check(substring))
8457        return split(self, (PyUnicodeObject *)substring, maxcount);
8458    else
8459        return PyUnicode_Split((PyObject *)self, substring, maxcount);
8460}
8461
8462PyObject *
8463PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8464{
8465    PyObject* str_obj;
8466    PyObject* sep_obj;
8467    PyObject* out;
8468
8469    str_obj = PyUnicode_FromObject(str_in);
8470    if (!str_obj)
8471        return NULL;
8472    sep_obj = PyUnicode_FromObject(sep_in);
8473    if (!sep_obj) {
8474        Py_DECREF(str_obj);
8475        return NULL;
8476    }
8477
8478    out = stringlib_partition(
8479        str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8480        sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8481        );
8482
8483    Py_DECREF(sep_obj);
8484    Py_DECREF(str_obj);
8485
8486    return out;
8487}
8488
8489
8490PyObject *
8491PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8492{
8493    PyObject* str_obj;
8494    PyObject* sep_obj;
8495    PyObject* out;
8496
8497    str_obj = PyUnicode_FromObject(str_in);
8498    if (!str_obj)
8499        return NULL;
8500    sep_obj = PyUnicode_FromObject(sep_in);
8501    if (!sep_obj) {
8502        Py_DECREF(str_obj);
8503        return NULL;
8504    }
8505
8506    out = stringlib_rpartition(
8507        str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8508        sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8509        );
8510
8511    Py_DECREF(sep_obj);
8512    Py_DECREF(str_obj);
8513
8514    return out;
8515}
8516
8517PyDoc_STRVAR(partition__doc__,
8518             "S.partition(sep) -> (head, sep, tail)\n\
8519\n\
8520Search for the separator sep in S, and return the part before it,\n\
8521the separator itself, and the part after it.  If the separator is not\n\
8522found, return S and two empty strings.");
8523
8524static PyObject*
8525unicode_partition(PyUnicodeObject *self, PyObject *separator)
8526{
8527    return PyUnicode_Partition((PyObject *)self, separator);
8528}
8529
8530PyDoc_STRVAR(rpartition__doc__,
8531             "S.rpartition(sep) -> (head, sep, tail)\n\
8532\n\
8533Search for the separator sep in S, starting at the end of S, and return\n\
8534the part before it, the separator itself, and the part after it.  If the\n\
8535separator is not found, return two empty strings and S.");
8536
8537static PyObject*
8538unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8539{
8540    return PyUnicode_RPartition((PyObject *)self, separator);
8541}
8542
8543PyObject *PyUnicode_RSplit(PyObject *s,
8544                           PyObject *sep,
8545                           Py_ssize_t maxsplit)
8546{
8547    PyObject *result;
8548
8549    s = PyUnicode_FromObject(s);
8550    if (s == NULL)
8551        return NULL;
8552    if (sep != NULL) {
8553        sep = PyUnicode_FromObject(sep);
8554        if (sep == NULL) {
8555            Py_DECREF(s);
8556            return NULL;
8557        }
8558    }
8559
8560    result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8561
8562    Py_DECREF(s);
8563    Py_XDECREF(sep);
8564    return result;
8565}
8566
8567PyDoc_STRVAR(rsplit__doc__,
8568             "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
8569\n\
8570Return a list of the words in S, using sep as the\n\
8571delimiter string, starting at the end of the string and\n\
8572working to the front.  If maxsplit is given, at most maxsplit\n\
8573splits are done. If sep is not specified, any whitespace string\n\
8574is a separator.");
8575
8576static PyObject*
8577unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8578{
8579    PyObject *substring = Py_None;
8580    Py_ssize_t maxcount = -1;
8581
8582    if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
8583        return NULL;
8584
8585    if (substring == Py_None)
8586        return rsplit(self, NULL, maxcount);
8587    else if (PyUnicode_Check(substring))
8588        return rsplit(self, (PyUnicodeObject *)substring, maxcount);
8589    else
8590        return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
8591}
8592
8593PyDoc_STRVAR(splitlines__doc__,
8594             "S.splitlines([keepends]) -> list of strings\n\
8595\n\
8596Return a list of the lines in S, breaking at line boundaries.\n\
8597Line breaks are not included in the resulting list unless keepends\n\
8598is given and true.");
8599
8600static PyObject*
8601unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8602{
8603    int keepends = 0;
8604
8605    if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
8606        return NULL;
8607
8608    return PyUnicode_Splitlines((PyObject *)self, keepends);
8609}
8610
8611static
8612PyObject *unicode_str(PyObject *self)
8613{
8614    if (PyUnicode_CheckExact(self)) {
8615        Py_INCREF(self);
8616        return self;
8617    } else
8618        /* Subtype -- return genuine unicode string with the same value. */
8619        return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8620                                     PyUnicode_GET_SIZE(self));
8621}
8622
8623PyDoc_STRVAR(swapcase__doc__,
8624             "S.swapcase() -> str\n\
8625\n\
8626Return a copy of S with uppercase characters converted to lowercase\n\
8627and vice versa.");
8628
8629static PyObject*
8630unicode_swapcase(PyUnicodeObject *self)
8631{
8632    return fixup(self, fixswapcase);
8633}
8634
8635PyDoc_STRVAR(maketrans__doc__,
8636             "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
8637\n\
8638Return a translation table usable for str.translate().\n\
8639If there is only one argument, it must be a dictionary mapping Unicode\n\
8640ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
8641Character keys will be then converted to ordinals.\n\
8642If there are two arguments, they must be strings of equal length, and\n\
8643in the resulting dictionary, each character in x will be mapped to the\n\
8644character at the same position in y. If there is a third argument, it\n\
8645must be a string, whose characters will be mapped to None in the result.");
8646
8647static PyObject*
8648unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8649{
8650    PyObject *x, *y = NULL, *z = NULL;
8651    PyObject *new = NULL, *key, *value;
8652    Py_ssize_t i = 0;
8653    int res;
8654
8655    if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8656        return NULL;
8657    new = PyDict_New();
8658    if (!new)
8659        return NULL;
8660    if (y != NULL) {
8661        /* x must be a string too, of equal length */
8662        Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8663        if (!PyUnicode_Check(x)) {
8664            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8665                            "be a string if there is a second argument");
8666            goto err;
8667        }
8668        if (PyUnicode_GET_SIZE(x) != ylen) {
8669            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8670                            "arguments must have equal length");
8671            goto err;
8672        }
8673        /* create entries for translating chars in x to those in y */
8674        for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
8675            key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8676            value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
8677            if (!key || !value)
8678                goto err;
8679            res = PyDict_SetItem(new, key, value);
8680            Py_DECREF(key);
8681            Py_DECREF(value);
8682            if (res < 0)
8683                goto err;
8684        }
8685        /* create entries for deleting chars in z */
8686        if (z != NULL) {
8687            for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
8688                key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
8689                if (!key)
8690                    goto err;
8691                res = PyDict_SetItem(new, key, Py_None);
8692                Py_DECREF(key);
8693                if (res < 0)
8694                    goto err;
8695            }
8696        }
8697    } else {
8698        /* x must be a dict */
8699        if (!PyDict_CheckExact(x)) {
8700            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8701                            "to maketrans it must be a dict");
8702            goto err;
8703        }
8704        /* copy entries into the new dict, converting string keys to int keys */
8705        while (PyDict_Next(x, &i, &key, &value)) {
8706            if (PyUnicode_Check(key)) {
8707                /* convert string keys to integer keys */
8708                PyObject *newkey;
8709                if (PyUnicode_GET_SIZE(key) != 1) {
8710                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
8711                                    "table must be of length 1");
8712                    goto err;
8713                }
8714                newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
8715                if (!newkey)
8716                    goto err;
8717                res = PyDict_SetItem(new, newkey, value);
8718                Py_DECREF(newkey);
8719                if (res < 0)
8720                    goto err;
8721            } else if (PyLong_Check(key)) {
8722                /* just keep integer keys */
8723                if (PyDict_SetItem(new, key, value) < 0)
8724                    goto err;
8725            } else {
8726                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8727                                "be strings or integers");
8728                goto err;
8729            }
8730        }
8731    }
8732    return new;
8733  err:
8734    Py_DECREF(new);
8735    return NULL;
8736}
8737
8738PyDoc_STRVAR(translate__doc__,
8739             "S.translate(table) -> str\n\
8740\n\
8741Return a copy of the string S, where all characters have been mapped\n\
8742through the given translation table, which must be a mapping of\n\
8743Unicode ordinals to Unicode ordinals, strings, or None.\n\
8744Unmapped characters are left untouched. Characters mapped to None\n\
8745are deleted.");
8746
8747static PyObject*
8748unicode_translate(PyUnicodeObject *self, PyObject *table)
8749{
8750    return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
8751}
8752
8753PyDoc_STRVAR(upper__doc__,
8754             "S.upper() -> str\n\
8755\n\
8756Return a copy of S converted to uppercase.");
8757
8758static PyObject*
8759unicode_upper(PyUnicodeObject *self)
8760{
8761    return fixup(self, fixupper);
8762}
8763
8764PyDoc_STRVAR(zfill__doc__,
8765             "S.zfill(width) -> str\n\
8766\n\
8767Pad a numeric string S with zeros on the left, to fill a field\n\
8768of the specified width. The string S is never truncated.");
8769
8770static PyObject *
8771unicode_zfill(PyUnicodeObject *self, PyObject *args)
8772{
8773    Py_ssize_t fill;
8774    PyUnicodeObject *u;
8775
8776    Py_ssize_t width;
8777    if (!PyArg_ParseTuple(args, "n:zfill", &width))
8778        return NULL;
8779
8780    if (self->length >= width) {
8781        if (PyUnicode_CheckExact(self)) {
8782            Py_INCREF(self);
8783            return (PyObject*) self;
8784        }
8785        else
8786            return PyUnicode_FromUnicode(
8787                PyUnicode_AS_UNICODE(self),
8788                PyUnicode_GET_SIZE(self)
8789                );
8790    }
8791
8792    fill = width - self->length;
8793
8794    u = pad(self, fill, 0, '0');
8795
8796    if (u == NULL)
8797        return NULL;
8798
8799    if (u->str[fill] == '+' || u->str[fill] == '-') {
8800        /* move sign to beginning of string */
8801        u->str[0] = u->str[fill];
8802        u->str[fill] = '0';
8803    }
8804
8805    return (PyObject*) u;
8806}
8807
#if 0
/* Compiled out: reports the current value of numfree as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyLong_FromLong(count);
}
#endif
8815
8816PyDoc_STRVAR(startswith__doc__,
8817             "S.startswith(prefix[, start[, end]]) -> bool\n\
8818\n\
8819Return True if S starts with the specified prefix, False otherwise.\n\
8820With optional start, test S beginning at that position.\n\
8821With optional end, stop comparing S at that position.\n\
8822prefix can also be a tuple of strings to try.");
8823
8824static PyObject *
8825unicode_startswith(PyUnicodeObject *self,
8826                   PyObject *args)
8827{
8828    PyObject *subobj;
8829    PyUnicodeObject *substring;
8830    Py_ssize_t start = 0;
8831    Py_ssize_t end = PY_SSIZE_T_MAX;
8832    int result;
8833
8834    if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
8835                          _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8836        return NULL;
8837    if (PyTuple_Check(subobj)) {
8838        Py_ssize_t i;
8839        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8840            substring = (PyUnicodeObject *)PyUnicode_FromObject(
8841                PyTuple_GET_ITEM(subobj, i));
8842            if (substring == NULL)
8843                return NULL;
8844            result = tailmatch(self, substring, start, end, -1);
8845            Py_DECREF(substring);
8846            if (result) {
8847                Py_RETURN_TRUE;
8848            }
8849        }
8850        /* nothing matched */
8851        Py_RETURN_FALSE;
8852    }
8853    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
8854    if (substring == NULL)
8855        return NULL;
8856    result = tailmatch(self, substring, start, end, -1);
8857    Py_DECREF(substring);
8858    return PyBool_FromLong(result);
8859}
8860
8861
8862PyDoc_STRVAR(endswith__doc__,
8863             "S.endswith(suffix[, start[, end]]) -> bool\n\
8864\n\
8865Return True if S ends with the specified suffix, False otherwise.\n\
8866With optional start, test S beginning at that position.\n\
8867With optional end, stop comparing S at that position.\n\
8868suffix can also be a tuple of strings to try.");
8869
8870static PyObject *
8871unicode_endswith(PyUnicodeObject *self,
8872                 PyObject *args)
8873{
8874    PyObject *subobj;
8875    PyUnicodeObject *substring;
8876    Py_ssize_t start = 0;
8877    Py_ssize_t end = PY_SSIZE_T_MAX;
8878    int result;
8879
8880    if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
8881                          _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8882        return NULL;
8883    if (PyTuple_Check(subobj)) {
8884        Py_ssize_t i;
8885        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8886            substring = (PyUnicodeObject *)PyUnicode_FromObject(
8887                PyTuple_GET_ITEM(subobj, i));
8888            if (substring == NULL)
8889                return NULL;
8890            result = tailmatch(self, substring, start, end, +1);
8891            Py_DECREF(substring);
8892            if (result) {
8893                Py_RETURN_TRUE;
8894            }
8895        }
8896        Py_RETURN_FALSE;
8897    }
8898    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
8899    if (substring == NULL)
8900        return NULL;
8901
8902    result = tailmatch(self, substring, start, end, +1);
8903    Py_DECREF(substring);
8904    return PyBool_FromLong(result);
8905}
8906
8907#include "stringlib/string_format.h"
8908
8909PyDoc_STRVAR(format__doc__,
8910             "S.format(*args, **kwargs) -> str\n\
8911\n\
8912");
8913
8914static PyObject *
8915unicode__format__(PyObject* self, PyObject* args)
8916{
8917    PyObject *format_spec;
8918
8919    if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8920        return NULL;
8921
8922    return _PyUnicode_FormatAdvanced(self,
8923                                     PyUnicode_AS_UNICODE(format_spec),
8924                                     PyUnicode_GET_SIZE(format_spec));
8925}
8926
8927PyDoc_STRVAR(p_format__doc__,
8928             "S.__format__(format_spec) -> str\n\
8929\n\
8930");
8931
8932static PyObject *
8933unicode__sizeof__(PyUnicodeObject *v)
8934{
8935    return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8936                              sizeof(Py_UNICODE) * (v->length + 1));
8937}
8938
8939PyDoc_STRVAR(sizeof__doc__,
8940             "S.__sizeof__() -> size of S in memory, in bytes");
8941
8942static PyObject *
8943unicode_getnewargs(PyUnicodeObject *v)
8944{
8945    return Py_BuildValue("(u#)", v->str, v->length);
8946}
8947
8948
8949static PyMethodDef unicode_methods[] = {
8950
8951    /* Order is according to common usage: often used methods should
8952       appear first, since lookup is done sequentially. */
8953
8954    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
8955    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8956    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
8957    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
8958    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8959    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8960    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8961    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8962    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8963    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8964    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
8965    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
8966    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8967    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8968    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
8969    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
8970    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8971    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8972    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
8973    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
8974    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
8975    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
8976    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
8977    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8978    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8979    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8980    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8981    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8982    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8983    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8984    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8985    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8986    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8987    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8988    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8989    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8990    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
8991    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
8992    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
8993    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
8994    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
8995    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
8996    {"maketrans", (PyCFunction) unicode_maketrans,
8997     METH_VARARGS | METH_STATIC, maketrans__doc__},
8998    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
8999#if 0
9000    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
9001#endif
9002
9003#if 0
9004    /* This one is just used for debugging the implementation. */
9005    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
9006#endif
9007
9008    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
9009    {NULL, NULL}
9010};
9011
9012static PyObject *
9013unicode_mod(PyObject *v, PyObject *w)
9014{
9015    if (!PyUnicode_Check(v)) {
9016        Py_INCREF(Py_NotImplemented);
9017        return Py_NotImplemented;
9018    }
9019    return PyUnicode_Format(v, w);
9020}
9021
9022static PyNumberMethods unicode_as_number = {
9023    0,              /*nb_add*/
9024    0,              /*nb_subtract*/
9025    0,              /*nb_multiply*/
9026    unicode_mod,            /*nb_remainder*/
9027};
9028
9029static PySequenceMethods unicode_as_sequence = {
9030    (lenfunc) unicode_length,       /* sq_length */
9031    PyUnicode_Concat,           /* sq_concat */
9032    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
9033    (ssizeargfunc) unicode_getitem,     /* sq_item */
9034    0,                  /* sq_slice */
9035    0,                  /* sq_ass_item */
9036    0,                  /* sq_ass_slice */
9037    PyUnicode_Contains,         /* sq_contains */
9038};
9039
9040static PyObject*
9041unicode_subscript(PyUnicodeObject* self, PyObject* item)
9042{
9043    if (PyIndex_Check(item)) {
9044        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
9045        if (i == -1 && PyErr_Occurred())
9046            return NULL;
9047        if (i < 0)
9048            i += PyUnicode_GET_SIZE(self);
9049        return unicode_getitem(self, i);
9050    } else if (PySlice_Check(item)) {
9051        Py_ssize_t start, stop, step, slicelength, cur, i;
9052        Py_UNICODE* source_buf;
9053        Py_UNICODE* result_buf;
9054        PyObject* result;
9055
9056        if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
9057                                 &start, &stop, &step, &slicelength) < 0) {
9058            return NULL;
9059        }
9060
9061        if (slicelength <= 0) {
9062            return PyUnicode_FromUnicode(NULL, 0);
9063        } else if (start == 0 && step == 1 && slicelength == self->length &&
9064                   PyUnicode_CheckExact(self)) {
9065            Py_INCREF(self);
9066            return (PyObject *)self;
9067        } else if (step == 1) {
9068            return PyUnicode_FromUnicode(self->str + start, slicelength);
9069        } else {
9070            source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
9071            result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
9072                                                       sizeof(Py_UNICODE));
9073
9074            if (result_buf == NULL)
9075                return PyErr_NoMemory();
9076
9077            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
9078                result_buf[i] = source_buf[cur];
9079            }
9080
9081            result = PyUnicode_FromUnicode(result_buf, slicelength);
9082            PyObject_FREE(result_buf);
9083            return result;
9084        }
9085    } else {
9086        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
9087        return NULL;
9088    }
9089}
9090
9091static PyMappingMethods unicode_as_mapping = {
9092    (lenfunc)unicode_length,        /* mp_length */
9093    (binaryfunc)unicode_subscript,  /* mp_subscript */
9094    (objobjargproc)0,           /* mp_ass_subscript */
9095};
9096
9097
9098/* Helpers for PyUnicode_Format() */
9099
9100static PyObject *
9101getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
9102{
9103    Py_ssize_t argidx = *p_argidx;
9104    if (argidx < arglen) {
9105        (*p_argidx)++;
9106        if (arglen < 0)
9107            return args;
9108        else
9109            return PyTuple_GetItem(args, argidx);
9110    }
9111    PyErr_SetString(PyExc_TypeError,
9112                    "not enough arguments for format string");
9113    return NULL;
9114}
9115
9116/* Returns a new reference to a PyUnicode object, or NULL on failure. */
9117
9118static PyObject *
9119formatfloat(PyObject *v, int flags, int prec, int type)
9120{
9121    char *p;
9122    PyObject *result;
9123    double x;
9124
9125    x = PyFloat_AsDouble(v);
9126    if (x == -1.0 && PyErr_Occurred())
9127        return NULL;
9128
9129    if (prec < 0)
9130        prec = 6;
9131
9132    p = PyOS_double_to_string(x, type, prec,
9133                              (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
9134    if (p == NULL)
9135        return NULL;
9136    result = PyUnicode_FromStringAndSize(p, strlen(p));
9137    PyMem_Free(p);
9138    return result;
9139}
9140
9141static PyObject*
9142formatlong(PyObject *val, int flags, int prec, int type)
9143{
9144    char *buf;
9145    int len;
9146    PyObject *str; /* temporary string object. */
9147    PyObject *result;
9148
9149    str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9150    if (!str)
9151        return NULL;
9152    result = PyUnicode_FromStringAndSize(buf, len);
9153    Py_DECREF(str);
9154    return result;
9155}
9156
9157static int
9158formatchar(Py_UNICODE *buf,
9159           size_t buflen,
9160           PyObject *v)
9161{
9162    /* presume that the buffer is at least 3 characters long */
9163    if (PyUnicode_Check(v)) {
9164        if (PyUnicode_GET_SIZE(v) == 1) {
9165            buf[0] = PyUnicode_AS_UNICODE(v)[0];
9166            buf[1] = '\0';
9167            return 1;
9168        }
9169#ifndef Py_UNICODE_WIDE
9170        if (PyUnicode_GET_SIZE(v) == 2) {
9171            /* Decode a valid surrogate pair */
9172            int c0 = PyUnicode_AS_UNICODE(v)[0];
9173            int c1 = PyUnicode_AS_UNICODE(v)[1];
9174            if (0xD800 <= c0 && c0 <= 0xDBFF &&
9175                0xDC00 <= c1 && c1 <= 0xDFFF) {
9176                buf[0] = c0;
9177                buf[1] = c1;
9178                buf[2] = '\0';
9179                return 2;
9180            }
9181        }
9182#endif
9183        goto onError;
9184    }
9185    else {
9186        /* Integer input truncated to a character */
9187        long x;
9188        x = PyLong_AsLong(v);
9189        if (x == -1 && PyErr_Occurred())
9190            goto onError;
9191
9192        if (x < 0 || x > 0x10ffff) {
9193            PyErr_SetString(PyExc_OverflowError,
9194                            "%c arg not in range(0x110000)");
9195            return -1;
9196        }
9197
9198#ifndef Py_UNICODE_WIDE
9199        if (x > 0xffff) {
9200            x -= 0x10000;
9201            buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9202            buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9203            return 2;
9204        }
9205#endif
9206        buf[0] = (Py_UNICODE) x;
9207        buf[1] = '\0';
9208        return 1;
9209    }
9210
9211  onError:
9212    PyErr_SetString(PyExc_TypeError,
9213                    "%c requires int or char");
9214    return -1;
9215}
9216
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
   FORMATBUFLEN is the length of the buffer in which chars are formatted.
   The replacement list is fully parenthesized so that the macro behaves
   as a single operand wherever it is expanded.
*/
#define FORMATBUFLEN ((size_t)10)
9221
9222PyObject *PyUnicode_Format(PyObject *format,
9223                           PyObject *args)
9224{
9225    Py_UNICODE *fmt, *res;
9226    Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
9227    int args_owned = 0;
9228    PyUnicodeObject *result = NULL;
9229    PyObject *dict = NULL;
9230    PyObject *uformat;
9231
9232    if (format == NULL || args == NULL) {
9233        PyErr_BadInternalCall();
9234        return NULL;
9235    }
9236    uformat = PyUnicode_FromObject(format);
9237    if (uformat == NULL)
9238        return NULL;
9239    fmt = PyUnicode_AS_UNICODE(uformat);
9240    fmtcnt = PyUnicode_GET_SIZE(uformat);
9241
9242    reslen = rescnt = fmtcnt + 100;
9243    result = _PyUnicode_New(reslen);
9244    if (result == NULL)
9245        goto onError;
9246    res = PyUnicode_AS_UNICODE(result);
9247
9248    if (PyTuple_Check(args)) {
9249        arglen = PyTuple_Size(args);
9250        argidx = 0;
9251    }
9252    else {
9253        arglen = -1;
9254        argidx = -2;
9255    }
9256    if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
9257        !PyUnicode_Check(args))
9258        dict = args;
9259
9260    while (--fmtcnt >= 0) {
9261        if (*fmt != '%') {
9262            if (--rescnt < 0) {
9263                rescnt = fmtcnt + 100;
9264                reslen += rescnt;
9265                if (_PyUnicode_Resize(&result, reslen) < 0)
9266                    goto onError;
9267                res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9268                --rescnt;
9269            }
9270            *res++ = *fmt++;
9271        }
9272        else {
9273            /* Got a format specifier */
9274            int flags = 0;
9275            Py_ssize_t width = -1;
9276            int prec = -1;
9277            Py_UNICODE c = '\0';
9278            Py_UNICODE fill;
9279            int isnumok;
9280            PyObject *v = NULL;
9281            PyObject *temp = NULL;
9282            Py_UNICODE *pbuf;
9283            Py_UNICODE sign;
9284            Py_ssize_t len;
9285            Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
9286
9287            fmt++;
9288            if (*fmt == '(') {
9289                Py_UNICODE *keystart;
9290                Py_ssize_t keylen;
9291                PyObject *key;
9292                int pcount = 1;
9293
9294                if (dict == NULL) {
9295                    PyErr_SetString(PyExc_TypeError,
9296                                    "format requires a mapping");
9297                    goto onError;
9298                }
9299                ++fmt;
9300                --fmtcnt;
9301                keystart = fmt;
9302                /* Skip over balanced parentheses */
9303                while (pcount > 0 && --fmtcnt >= 0) {
9304                    if (*fmt == ')')
9305                        --pcount;
9306                    else if (*fmt == '(')
9307                        ++pcount;
9308                    fmt++;
9309                }
9310                keylen = fmt - keystart - 1;
9311                if (fmtcnt < 0 || pcount > 0) {
9312                    PyErr_SetString(PyExc_ValueError,
9313                                    "incomplete format key");
9314                    goto onError;
9315                }
9316#if 0
9317                /* keys are converted to strings using UTF-8 and
9318                   then looked up since Python uses strings to hold
9319                   variables names etc. in its namespaces and we
9320                   wouldn't want to break common idioms. */
9321                key = PyUnicode_EncodeUTF8(keystart,
9322                                           keylen,
9323                                           NULL);
9324#else
9325                key = PyUnicode_FromUnicode(keystart, keylen);
9326#endif
9327                if (key == NULL)
9328                    goto onError;
9329                if (args_owned) {
9330                    Py_DECREF(args);
9331                    args_owned = 0;
9332                }
9333                args = PyObject_GetItem(dict, key);
9334                Py_DECREF(key);
9335                if (args == NULL) {
9336                    goto onError;
9337                }
9338                args_owned = 1;
9339                arglen = -1;
9340                argidx = -2;
9341            }
9342            while (--fmtcnt >= 0) {
9343                switch (c = *fmt++) {
9344                case '-': flags |= F_LJUST; continue;
9345                case '+': flags |= F_SIGN; continue;
9346                case ' ': flags |= F_BLANK; continue;
9347                case '#': flags |= F_ALT; continue;
9348                case '0': flags |= F_ZERO; continue;
9349                }
9350                break;
9351            }
9352            if (c == '*') {
9353                v = getnextarg(args, arglen, &argidx);
9354                if (v == NULL)
9355                    goto onError;
9356                if (!PyLong_Check(v)) {
9357                    PyErr_SetString(PyExc_TypeError,
9358                                    "* wants int");
9359                    goto onError;
9360                }
9361                width = PyLong_AsLong(v);
9362                if (width == -1 && PyErr_Occurred())
9363                    goto onError;
9364                if (width < 0) {
9365                    flags |= F_LJUST;
9366                    width = -width;
9367                }
9368                if (--fmtcnt >= 0)
9369                    c = *fmt++;
9370            }
9371            else if (c >= '0' && c <= '9') {
9372                width = c - '0';
9373                while (--fmtcnt >= 0) {
9374                    c = *fmt++;
9375                    if (c < '0' || c > '9')
9376                        break;
9377                    if ((width*10) / 10 != width) {
9378                        PyErr_SetString(PyExc_ValueError,
9379                                        "width too big");
9380                        goto onError;
9381                    }
9382                    width = width*10 + (c - '0');
9383                }
9384            }
9385            if (c == '.') {
9386                prec = 0;
9387                if (--fmtcnt >= 0)
9388                    c = *fmt++;
9389                if (c == '*') {
9390                    v = getnextarg(args, arglen, &argidx);
9391                    if (v == NULL)
9392                        goto onError;
9393                    if (!PyLong_Check(v)) {
9394                        PyErr_SetString(PyExc_TypeError,
9395                                        "* wants int");
9396                        goto onError;
9397                    }
9398                    prec = PyLong_AsLong(v);
9399                    if (prec == -1 && PyErr_Occurred())
9400                        goto onError;
9401                    if (prec < 0)
9402                        prec = 0;
9403                    if (--fmtcnt >= 0)
9404                        c = *fmt++;
9405                }
9406                else if (c >= '0' && c <= '9') {
9407                    prec = c - '0';
9408                    while (--fmtcnt >= 0) {
9409                        c = *fmt++;
9410                        if (c < '0' || c > '9')
9411                            break;
9412                        if ((prec*10) / 10 != prec) {
9413                            PyErr_SetString(PyExc_ValueError,
9414                                            "prec too big");
9415                            goto onError;
9416                        }
9417                        prec = prec*10 + (c - '0');
9418                    }
9419                }
9420            } /* prec */
9421            if (fmtcnt >= 0) {
9422                if (c == 'h' || c == 'l' || c == 'L') {
9423                    if (--fmtcnt >= 0)
9424                        c = *fmt++;
9425                }
9426            }
9427            if (fmtcnt < 0) {
9428                PyErr_SetString(PyExc_ValueError,
9429                                "incomplete format");
9430                goto onError;
9431            }
9432            if (c != '%') {
9433                v = getnextarg(args, arglen, &argidx);
9434                if (v == NULL)
9435                    goto onError;
9436            }
9437            sign = 0;
9438            fill = ' ';
9439            switch (c) {
9440
9441            case '%':
9442                pbuf = formatbuf;
9443                /* presume that buffer length is at least 1 */
9444                pbuf[0] = '%';
9445                len = 1;
9446                break;
9447
9448            case 's':
9449            case 'r':
9450            case 'a':
9451                if (PyUnicode_CheckExact(v) && c == 's') {
9452                    temp = v;
9453                    Py_INCREF(temp);
9454                }
9455                else {
9456                    if (c == 's')
9457                        temp = PyObject_Str(v);
9458                    else if (c == 'r')
9459                        temp = PyObject_Repr(v);
9460                    else
9461                        temp = PyObject_ASCII(v);
9462                    if (temp == NULL)
9463                        goto onError;
9464                    if (PyUnicode_Check(temp))
9465                        /* nothing to do */;
9466                    else {
9467                        Py_DECREF(temp);
9468                        PyErr_SetString(PyExc_TypeError,
9469                                        "%s argument has non-string str()");
9470                        goto onError;
9471                    }
9472                }
9473                pbuf = PyUnicode_AS_UNICODE(temp);
9474                len = PyUnicode_GET_SIZE(temp);
9475                if (prec >= 0 && len > prec)
9476                    len = prec;
9477                break;
9478
9479            case 'i':
9480            case 'd':
9481            case 'u':
9482            case 'o':
9483            case 'x':
9484            case 'X':
9485                if (c == 'i')
9486                    c = 'd';
9487                isnumok = 0;
9488                if (PyNumber_Check(v)) {
9489                    PyObject *iobj=NULL;
9490
9491                    if (PyLong_Check(v)) {
9492                        iobj = v;
9493                        Py_INCREF(iobj);
9494                    }
9495                    else {
9496                        iobj = PyNumber_Long(v);
9497                    }
9498                    if (iobj!=NULL) {
9499                        if (PyLong_Check(iobj)) {
9500                            isnumok = 1;
9501                            temp = formatlong(iobj, flags, prec, c);
9502                            Py_DECREF(iobj);
9503                            if (!temp)
9504                                goto onError;
9505                            pbuf = PyUnicode_AS_UNICODE(temp);
9506                            len = PyUnicode_GET_SIZE(temp);
9507                            sign = 1;
9508                        }
9509                        else {
9510                            Py_DECREF(iobj);
9511                        }
9512                    }
9513                }
9514                if (!isnumok) {
9515                    PyErr_Format(PyExc_TypeError,
9516                                 "%%%c format: a number is required, "
9517                                 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9518                    goto onError;
9519                }
9520                if (flags & F_ZERO)
9521                    fill = '0';
9522                break;
9523
9524            case 'e':
9525            case 'E':
9526            case 'f':
9527            case 'F':
9528            case 'g':
9529            case 'G':
9530                temp = formatfloat(v, flags, prec, c);
9531                if (!temp)
9532                    goto onError;
9533                pbuf = PyUnicode_AS_UNICODE(temp);
9534                len = PyUnicode_GET_SIZE(temp);
9535                sign = 1;
9536                if (flags & F_ZERO)
9537                    fill = '0';
9538                break;
9539
9540            case 'c':
9541                pbuf = formatbuf;
9542                len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9543                if (len < 0)
9544                    goto onError;
9545                break;
9546
9547            default:
9548                PyErr_Format(PyExc_ValueError,
9549                             "unsupported format character '%c' (0x%x) "
9550                             "at index %zd",
9551                             (31<=c && c<=126) ? (char)c : '?',
9552                             (int)c,
9553                             (Py_ssize_t)(fmt - 1 -
9554                                          PyUnicode_AS_UNICODE(uformat)));
9555                goto onError;
9556            }
9557            if (sign) {
9558                if (*pbuf == '-' || *pbuf == '+') {
9559                    sign = *pbuf++;
9560                    len--;
9561                }
9562                else if (flags & F_SIGN)
9563                    sign = '+';
9564                else if (flags & F_BLANK)
9565                    sign = ' ';
9566                else
9567                    sign = 0;
9568            }
9569            if (width < len)
9570                width = len;
9571            if (rescnt - (sign != 0) < width) {
9572                reslen -= rescnt;
9573                rescnt = width + fmtcnt + 100;
9574                reslen += rescnt;
9575                if (reslen < 0) {
9576                    Py_XDECREF(temp);
9577                    PyErr_NoMemory();
9578                    goto onError;
9579                }
9580                if (_PyUnicode_Resize(&result, reslen) < 0) {
9581                    Py_XDECREF(temp);
9582                    goto onError;
9583                }
9584                res = PyUnicode_AS_UNICODE(result)
9585                    + reslen - rescnt;
9586            }
9587            if (sign) {
9588                if (fill != ' ')
9589                    *res++ = sign;
9590                rescnt--;
9591                if (width > len)
9592                    width--;
9593            }
9594            if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9595                assert(pbuf[0] == '0');
9596                assert(pbuf[1] == c);
9597                if (fill != ' ') {
9598                    *res++ = *pbuf++;
9599                    *res++ = *pbuf++;
9600                }
9601                rescnt -= 2;
9602                width -= 2;
9603                if (width < 0)
9604                    width = 0;
9605                len -= 2;
9606            }
9607            if (width > len && !(flags & F_LJUST)) {
9608                do {
9609                    --rescnt;
9610                    *res++ = fill;
9611                } while (--width > len);
9612            }
9613            if (fill == ' ') {
9614                if (sign)
9615                    *res++ = sign;
9616                if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9617                    assert(pbuf[0] == '0');
9618                    assert(pbuf[1] == c);
9619                    *res++ = *pbuf++;
9620                    *res++ = *pbuf++;
9621                }
9622            }
9623            Py_UNICODE_COPY(res, pbuf, len);
9624            res += len;
9625            rescnt -= len;
9626            while (--width >= len) {
9627                --rescnt;
9628                *res++ = ' ';
9629            }
9630            if (dict && (argidx < arglen) && c != '%') {
9631                PyErr_SetString(PyExc_TypeError,
9632                                "not all arguments converted during string formatting");
9633                Py_XDECREF(temp);
9634                goto onError;
9635            }
9636            Py_XDECREF(temp);
9637        } /* '%' */
9638    } /* until end */
9639    if (argidx < arglen && !dict) {
9640        PyErr_SetString(PyExc_TypeError,
9641                        "not all arguments converted during string formatting");
9642        goto onError;
9643    }
9644
9645    if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
9646        goto onError;
9647    if (args_owned) {
9648        Py_DECREF(args);
9649    }
9650    Py_DECREF(uformat);
9651    return (PyObject *)result;
9652
9653  onError:
9654    Py_XDECREF(result);
9655    Py_DECREF(uformat);
9656    if (args_owned) {
9657        Py_DECREF(args);
9658    }
9659    return NULL;
9660}
9661
9662static PyObject *
9663unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9664
9665static PyObject *
9666unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9667{
9668    PyObject *x = NULL;
9669    static char *kwlist[] = {"object", "encoding", "errors", 0};
9670    char *encoding = NULL;
9671    char *errors = NULL;
9672
9673    if (type != &PyUnicode_Type)
9674        return unicode_subtype_new(type, args, kwds);
9675    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
9676                                     kwlist, &x, &encoding, &errors))
9677        return NULL;
9678    if (x == NULL)
9679        return (PyObject *)_PyUnicode_New(0);
9680    if (encoding == NULL && errors == NULL)
9681        return PyObject_Str(x);
9682    else
9683        return PyUnicode_FromEncodedObject(x, encoding, errors);
9684}
9685
9686static PyObject *
9687unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9688{
9689    PyUnicodeObject *tmp, *pnew;
9690    Py_ssize_t n;
9691
9692    assert(PyType_IsSubtype(type, &PyUnicode_Type));
9693    tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9694    if (tmp == NULL)
9695        return NULL;
9696    assert(PyUnicode_Check(tmp));
9697    pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9698    if (pnew == NULL) {
9699        Py_DECREF(tmp);
9700        return NULL;
9701    }
9702    pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9703    if (pnew->str == NULL) {
9704        _Py_ForgetReference((PyObject *)pnew);
9705        PyObject_Del(pnew);
9706        Py_DECREF(tmp);
9707        return PyErr_NoMemory();
9708    }
9709    Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9710    pnew->length = n;
9711    pnew->hash = tmp->hash;
9712    Py_DECREF(tmp);
9713    return (PyObject *)pnew;
9714}
9715
9716PyDoc_STRVAR(unicode_doc,
9717             "str(string[, encoding[, errors]]) -> str\n\
9718\n\
9719Create a new string object from the given encoded string.\n\
9720encoding defaults to the current default string encoding.\n\
9721errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
9722
9723static PyObject *unicode_iter(PyObject *seq);
9724
9725PyTypeObject PyUnicode_Type = {
9726    PyVarObject_HEAD_INIT(&PyType_Type, 0)
9727    "str",              /* tp_name */
9728    sizeof(PyUnicodeObject),        /* tp_size */
9729    0,                  /* tp_itemsize */
9730    /* Slots */
9731    (destructor)unicode_dealloc,    /* tp_dealloc */
9732    0,                  /* tp_print */
9733    0,                  /* tp_getattr */
9734    0,                  /* tp_setattr */
9735    0,                  /* tp_reserved */
9736    unicode_repr,           /* tp_repr */
9737    &unicode_as_number,         /* tp_as_number */
9738    &unicode_as_sequence,       /* tp_as_sequence */
9739    &unicode_as_mapping,        /* tp_as_mapping */
9740    (hashfunc) unicode_hash,        /* tp_hash*/
9741    0,                  /* tp_call*/
9742    (reprfunc) unicode_str,     /* tp_str */
9743    PyObject_GenericGetAttr,        /* tp_getattro */
9744    0,                  /* tp_setattro */
9745    0,                  /* tp_as_buffer */
9746    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
9747    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
9748    unicode_doc,            /* tp_doc */
9749    0,                  /* tp_traverse */
9750    0,                  /* tp_clear */
9751    PyUnicode_RichCompare,      /* tp_richcompare */
9752    0,                  /* tp_weaklistoffset */
9753    unicode_iter,           /* tp_iter */
9754    0,                  /* tp_iternext */
9755    unicode_methods,            /* tp_methods */
9756    0,                  /* tp_members */
9757    0,                  /* tp_getset */
9758    &PyBaseObject_Type,         /* tp_base */
9759    0,                  /* tp_dict */
9760    0,                  /* tp_descr_get */
9761    0,                  /* tp_descr_set */
9762    0,                  /* tp_dictoffset */
9763    0,                  /* tp_init */
9764    0,                  /* tp_alloc */
9765    unicode_new,            /* tp_new */
9766    PyObject_Del,           /* tp_free */
9767};
9768
9769/* Initialize the Unicode implementation */
9770
9771void _PyUnicode_Init(void)
9772{
9773    int i;
9774
9775    /* XXX - move this array to unicodectype.c ? */
9776    Py_UNICODE linebreak[] = {
9777        0x000A, /* LINE FEED */
9778        0x000D, /* CARRIAGE RETURN */
9779        0x001C, /* FILE SEPARATOR */
9780        0x001D, /* GROUP SEPARATOR */
9781        0x001E, /* RECORD SEPARATOR */
9782        0x0085, /* NEXT LINE */
9783        0x2028, /* LINE SEPARATOR */
9784        0x2029, /* PARAGRAPH SEPARATOR */
9785    };
9786
9787    /* Init the implementation */
9788    free_list = NULL;
9789    numfree = 0;
9790    unicode_empty = _PyUnicode_New(0);
9791    if (!unicode_empty)
9792        return;
9793
9794    for (i = 0; i < 256; i++)
9795        unicode_latin1[i] = NULL;
9796    if (PyType_Ready(&PyUnicode_Type) < 0)
9797        Py_FatalError("Can't initialize 'unicode'");
9798
9799    /* initialize the linebreak bloom filter */
9800    bloom_linebreak = make_bloom_mask(
9801        linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9802        );
9803
9804    PyType_Ready(&EncodingMapType);
9805}
9806
9807/* Finalize the Unicode implementation */
9808
9809int
9810PyUnicode_ClearFreeList(void)
9811{
9812    int freelist_size = numfree;
9813    PyUnicodeObject *u;
9814
9815    for (u = free_list; u != NULL;) {
9816        PyUnicodeObject *v = u;
9817        u = *(PyUnicodeObject **)u;
9818        if (v->str)
9819            PyObject_DEL(v->str);
9820        Py_XDECREF(v->defenc);
9821        PyObject_Del(v);
9822        numfree--;
9823    }
9824    free_list = NULL;
9825    assert(numfree == 0);
9826    return freelist_size;
9827}
9828
9829void
9830_PyUnicode_Fini(void)
9831{
9832    int i;
9833
9834    Py_XDECREF(unicode_empty);
9835    unicode_empty = NULL;
9836
9837    for (i = 0; i < 256; i++) {
9838        if (unicode_latin1[i]) {
9839            Py_DECREF(unicode_latin1[i]);
9840            unicode_latin1[i] = NULL;
9841        }
9842    }
9843    (void)PyUnicode_ClearFreeList();
9844}
9845
9846void
9847PyUnicode_InternInPlace(PyObject **p)
9848{
9849    register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9850    PyObject *t;
9851    if (s == NULL || !PyUnicode_Check(s))
9852        Py_FatalError(
9853            "PyUnicode_InternInPlace: unicode strings only please!");
9854    /* If it's a subclass, we don't really know what putting
9855       it in the interned dict might do. */
9856    if (!PyUnicode_CheckExact(s))
9857        return;
9858    if (PyUnicode_CHECK_INTERNED(s))
9859        return;
9860    if (interned == NULL) {
9861        interned = PyDict_New();
9862        if (interned == NULL) {
9863            PyErr_Clear(); /* Don't leave an exception */
9864            return;
9865        }
9866    }
9867    /* It might be that the GetItem call fails even
9868       though the key is present in the dictionary,
9869       namely when this happens during a stack overflow. */
9870    Py_ALLOW_RECURSION
9871        t = PyDict_GetItem(interned, (PyObject *)s);
9872    Py_END_ALLOW_RECURSION
9873
9874        if (t) {
9875            Py_INCREF(t);
9876            Py_DECREF(*p);
9877            *p = t;
9878            return;
9879        }
9880
9881    PyThreadState_GET()->recursion_critical = 1;
9882    if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9883        PyErr_Clear();
9884        PyThreadState_GET()->recursion_critical = 0;
9885        return;
9886    }
9887    PyThreadState_GET()->recursion_critical = 0;
9888    /* The two references in interned are not counted by refcnt.
9889       The deallocator will take care of this */
9890    Py_REFCNT(s) -= 2;
9891    PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
9892}
9893
9894void
9895PyUnicode_InternImmortal(PyObject **p)
9896{
9897    PyUnicode_InternInPlace(p);
9898    if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9899        PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9900        Py_INCREF(*p);
9901    }
9902}
9903
9904PyObject *
9905PyUnicode_InternFromString(const char *cp)
9906{
9907    PyObject *s = PyUnicode_FromString(cp);
9908    if (s == NULL)
9909        return NULL;
9910    PyUnicode_InternInPlace(&s);
9911    return s;
9912}
9913
9914void _Py_ReleaseInternedUnicodeStrings(void)
9915{
9916    PyObject *keys;
9917    PyUnicodeObject *s;
9918    Py_ssize_t i, n;
9919    Py_ssize_t immortal_size = 0, mortal_size = 0;
9920
9921    if (interned == NULL || !PyDict_Check(interned))
9922        return;
9923    keys = PyDict_Keys(interned);
9924    if (keys == NULL || !PyList_Check(keys)) {
9925        PyErr_Clear();
9926        return;
9927    }
9928
9929    /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9930       detector, interned unicode strings are not forcibly deallocated;
9931       rather, we give them their stolen references back, and then clear
9932       and DECREF the interned dict. */
9933
9934    n = PyList_GET_SIZE(keys);
9935    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9936            n);
9937    for (i = 0; i < n; i++) {
9938        s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9939        switch (s->state) {
9940        case SSTATE_NOT_INTERNED:
9941            /* XXX Shouldn't happen */
9942            break;
9943        case SSTATE_INTERNED_IMMORTAL:
9944            Py_REFCNT(s) += 1;
9945            immortal_size += s->length;
9946            break;
9947        case SSTATE_INTERNED_MORTAL:
9948            Py_REFCNT(s) += 2;
9949            mortal_size += s->length;
9950            break;
9951        default:
9952            Py_FatalError("Inconsistent interned string state.");
9953        }
9954        s->state = SSTATE_NOT_INTERNED;
9955    }
9956    fprintf(stderr, "total size of all interned strings: "
9957            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9958            "mortal/immortal\n", mortal_size, immortal_size);
9959    Py_DECREF(keys);
9960    PyDict_Clear(interned);
9961    Py_DECREF(interned);
9962    interned = NULL;
9963}
9964
9965
9966/********************* Unicode Iterator **************************/
9967
9968typedef struct {
9969    PyObject_HEAD
9970    Py_ssize_t it_index;
9971    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
9972} unicodeiterobject;
9973
9974static void
9975unicodeiter_dealloc(unicodeiterobject *it)
9976{
9977    _PyObject_GC_UNTRACK(it);
9978    Py_XDECREF(it->it_seq);
9979    PyObject_GC_Del(it);
9980}
9981
9982static int
9983unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9984{
9985    Py_VISIT(it->it_seq);
9986    return 0;
9987}
9988
9989static PyObject *
9990unicodeiter_next(unicodeiterobject *it)
9991{
9992    PyUnicodeObject *seq;
9993    PyObject *item;
9994
9995    assert(it != NULL);
9996    seq = it->it_seq;
9997    if (seq == NULL)
9998        return NULL;
9999    assert(PyUnicode_Check(seq));
10000
10001    if (it->it_index < PyUnicode_GET_SIZE(seq)) {
10002        item = PyUnicode_FromUnicode(
10003            PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
10004        if (item != NULL)
10005            ++it->it_index;
10006        return item;
10007    }
10008
10009    Py_DECREF(seq);
10010    it->it_seq = NULL;
10011    return NULL;
10012}
10013
10014static PyObject *
10015unicodeiter_len(unicodeiterobject *it)
10016{
10017    Py_ssize_t len = 0;
10018    if (it->it_seq)
10019        len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
10020    return PyLong_FromSsize_t(len);
10021}
10022
10023PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
10024
10025static PyMethodDef unicodeiter_methods[] = {
10026    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
10027     length_hint_doc},
10028    {NULL,      NULL}       /* sentinel */
10029};
10030
10031PyTypeObject PyUnicodeIter_Type = {
10032    PyVarObject_HEAD_INIT(&PyType_Type, 0)
10033    "str_iterator",         /* tp_name */
10034    sizeof(unicodeiterobject),      /* tp_basicsize */
10035    0,                  /* tp_itemsize */
10036    /* methods */
10037    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
10038    0,                  /* tp_print */
10039    0,                  /* tp_getattr */
10040    0,                  /* tp_setattr */
10041    0,                  /* tp_reserved */
10042    0,                  /* tp_repr */
10043    0,                  /* tp_as_number */
10044    0,                  /* tp_as_sequence */
10045    0,                  /* tp_as_mapping */
10046    0,                  /* tp_hash */
10047    0,                  /* tp_call */
10048    0,                  /* tp_str */
10049    PyObject_GenericGetAttr,        /* tp_getattro */
10050    0,                  /* tp_setattro */
10051    0,                  /* tp_as_buffer */
10052    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
10053    0,                  /* tp_doc */
10054    (traverseproc)unicodeiter_traverse, /* tp_traverse */
10055    0,                  /* tp_clear */
10056    0,                  /* tp_richcompare */
10057    0,                  /* tp_weaklistoffset */
10058    PyObject_SelfIter,          /* tp_iter */
10059    (iternextfunc)unicodeiter_next,     /* tp_iternext */
10060    unicodeiter_methods,            /* tp_methods */
10061    0,
10062};
10063
10064static PyObject *
10065unicode_iter(PyObject *seq)
10066{
10067    unicodeiterobject *it;
10068
10069    if (!PyUnicode_Check(seq)) {
10070        PyErr_BadInternalCall();
10071        return NULL;
10072    }
10073    it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
10074    if (it == NULL)
10075        return NULL;
10076    it->it_index = 0;
10077    Py_INCREF(seq);
10078    it->it_seq = (PyUnicodeObject *)seq;
10079    _PyObject_GC_TRACK(it);
10080    return (PyObject *)it;
10081}
10082
10083size_t
10084Py_UNICODE_strlen(const Py_UNICODE *u)
10085{
10086    int res = 0;
10087    while(*u++)
10088        res++;
10089    return res;
10090}
10091
10092Py_UNICODE*
10093Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
10094{
10095    Py_UNICODE *u = s1;
10096    while ((*u++ = *s2++));
10097    return s1;
10098}
10099
10100Py_UNICODE*
10101Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10102{
10103    Py_UNICODE *u = s1;
10104    while ((*u++ = *s2++))
10105        if (n-- == 0)
10106            break;
10107    return s1;
10108}
10109
10110Py_UNICODE*
10111Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
10112{
10113    Py_UNICODE *u1 = s1;
10114    u1 += Py_UNICODE_strlen(u1);
10115    Py_UNICODE_strcpy(u1, s2);
10116    return s1;
10117}
10118
10119int
10120Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
10121{
10122    while (*s1 && *s2 && *s1 == *s2)
10123        s1++, s2++;
10124    if (*s1 && *s2)
10125        return (*s1 < *s2) ? -1 : +1;
10126    if (*s1)
10127        return 1;
10128    if (*s2)
10129        return -1;
10130    return 0;
10131}
10132
10133int
10134Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10135{
10136    register Py_UNICODE u1, u2;
10137    for (; n != 0; n--) {
10138        u1 = *s1;
10139        u2 = *s2;
10140        if (u1 != u2)
10141            return (u1 < u2) ? -1 : +1;
10142        if (u1 == '\0')
10143            return 0;
10144        s1++;
10145        s2++;
10146    }
10147    return 0;
10148}
10149
10150Py_UNICODE*
10151Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
10152{
10153    const Py_UNICODE *p;
10154    for (p = s; *p; p++)
10155        if (*p == c)
10156            return (Py_UNICODE*)p;
10157    return NULL;
10158}
10159
10160Py_UNICODE*
10161Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
10162{
10163    const Py_UNICODE *p;
10164    p = s + Py_UNICODE_strlen(s);
10165    while (p != s) {
10166        p--;
10167        if (*p == c)
10168            return (Py_UNICODE*)p;
10169    }
10170    return NULL;
10171}
10172
10173Py_UNICODE*
10174PyUnicode_AsUnicodeCopy(PyObject *object)
10175{
10176    PyUnicodeObject *unicode = (PyUnicodeObject *)object;
10177    Py_UNICODE *copy;
10178    Py_ssize_t size;
10179
10180    /* Ensure we won't overflow the size. */
10181    if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
10182        PyErr_NoMemory();
10183        return NULL;
10184    }
10185    size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
10186    size *= sizeof(Py_UNICODE);
10187    copy = PyMem_Malloc(size);
10188    if (copy == NULL) {
10189        PyErr_NoMemory();
10190        return NULL;
10191    }
10192    memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
10193    return copy;
10194}
10195
10196/* A _string module, to export formatter_parser and formatter_field_name_split
10197   to the string.Formatter class implemented in Python. */
10198
10199static PyMethodDef _string_methods[] = {
10200    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
10201     METH_O, PyDoc_STR("split the argument as a field name")},
10202    {"formatter_parser", (PyCFunction) formatter_parser,
10203     METH_O, PyDoc_STR("parse the argument as a format string")},
10204    {NULL, NULL}
10205};
10206
10207static struct PyModuleDef _string_module = {
10208    PyModuleDef_HEAD_INIT,
10209    "_string",
10210    PyDoc_STR("string helper module"),
10211    0,
10212    _string_methods,
10213    NULL,
10214    NULL,
10215    NULL,
10216    NULL
10217};
10218
10219PyMODINIT_FUNC
10220PyInit__string(void)
10221{
10222    return PyModule_Create(&_string_module);
10223}
10224
10225
10226#ifdef __cplusplus
10227}
10228#endif
10229