unicodeobject.c revision 51d2fd983bcc85342b631e27a33e214c691e53be
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5Unicode Integration Proposal (see file Misc/unicode.txt).
6
7Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
10Copyright (c) Corporation for National Research Initiatives.
11
12--------------------------------------------------------------------
13The original string type implementation is:
14
15  Copyright (c) 1999 by Secret Labs AB
16  Copyright (c) 1999 by Fredrik Lundh
17
18By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
41
42#define PY_SSIZE_T_CLEAN
43#include "Python.h"
44#include "bytes_methods.h"
45
46#include "unicodeobject.h"
47#include "ucnhash.h"
48
49#ifdef MS_WINDOWS
50#include <windows.h>
51#endif
52
53/* Limit for the Unicode object free list */
54
55#define PyUnicode_MAXFREELIST       1024
56
57/* Limit for the Unicode object free list stay alive optimization.
58
59   The implementation will keep allocated Unicode memory intact for
60   all objects on the free list having a size less than this
61   limit. This reduces malloc() overhead for small Unicode objects.
62
63   At worst this will result in PyUnicode_MAXFREELIST *
64   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
65   malloc()-overhead) bytes of unused garbage.
66
67   Setting the limit to 0 effectively turns the feature off.
68
69   Note: This is an experimental feature ! If you get core dumps when
70   using Unicode objects, turn this feature off.
71
72*/
73
74#define KEEPALIVE_SIZE_LIMIT       9
75
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
84/* --- Globals ------------------------------------------------------------
85
86   The globals are initialized by the _PyUnicode_Init() API and should
87   not be used before calling that API.
88
89*/
90
91
92#ifdef __cplusplus
93extern "C" {
94#endif
95
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
104static PyObject *interned;
105
106/* Free list for Unicode objects */
107static PyUnicodeObject *free_list;
108static int numfree;
109
110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114   shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
/* Lookup table covering code points 0..127: entry c is 1 when c is one of
   the whitespace characters named in the row comments, 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
147
148static PyObject *unicode_encode_call_errorhandler(const char *errors,
149       PyObject **errorHandler,const char *encoding, const char *reason,
150       const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
151       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
152
153static void raise_encode_exception(PyObject **exceptionObject,
154                                   const char *encoding,
155                                   const Py_UNICODE *unicode, Py_ssize_t size,
156                                   Py_ssize_t startpos, Py_ssize_t endpos,
157                                   const char *reason);
158
/* Lookup table covering code points 0..127: entry c is 1 when c is one of
   the line-break characters named in the row comments, 0 otherwise. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
186
187
188Py_UNICODE
189PyUnicode_GetMax(void)
190{
191#ifdef Py_UNICODE_WIDE
192    return 0x10FFFF;
193#else
194    /* This is actually an illegal character, so it should
195       not be passed to unichr. */
196    return 0xFFFF;
197#endif
198}
199
200/* --- Bloom Filters ----------------------------------------------------- */
201
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits of each unicode character as the bit index. */
205
206/* the linebreak mask is set up by Unicode_Init below */
207
208#if LONG_BIT >= 128
209#define BLOOM_WIDTH 128
210#elif LONG_BIT >= 64
211#define BLOOM_WIDTH 64
212#elif LONG_BIT >= 32
213#define BLOOM_WIDTH 32
214#else
215#error "LONG_BIT is smaller than 32"
216#endif
217
218#define BLOOM_MASK unsigned long
219
220static BLOOM_MASK bloom_linebreak;
221
222#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
223#define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
224
225#define BLOOM_LINEBREAK(ch)                                             \
226    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
227     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
228
229Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
230{
231    /* calculate simple bloom-style bitmask for a given unicode string */
232
233    BLOOM_MASK mask;
234    Py_ssize_t i;
235
236    mask = 0;
237    for (i = 0; i < len; i++)
238        BLOOM_ADD(mask, ptr[i]);
239
240    return mask;
241}
242
243Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
244{
245    Py_ssize_t i;
246
247    for (i = 0; i < setlen; i++)
248        if (set[i] == chr)
249            return 1;
250
251    return 0;
252}
253
/* Membership test: the cheap bloom-mask check runs first and the linear
   scan of set only on a mask hit.  The whole expansion is parenthesized so
   that the macro can be negated or combined with other operators. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
256
257/* --- Unicode Object ----------------------------------------------------- */
258
259static
260int unicode_resize(register PyUnicodeObject *unicode,
261                   Py_ssize_t length)
262{
263    void *oldstr;
264
265    /* Shortcut if there's nothing much to do. */
266    if (unicode->length == length)
267        goto reset;
268
269    /* Resizing shared object (unicode_empty or single character
270       objects) in-place is not allowed. Use PyUnicode_Resize()
271       instead ! */
272
273    if (unicode == unicode_empty ||
274        (unicode->length == 1 &&
275         unicode->str[0] < 256U &&
276         unicode_latin1[unicode->str[0]] == unicode)) {
277        PyErr_SetString(PyExc_SystemError,
278                        "can't resize shared str objects");
279        return -1;
280    }
281
282    /* We allocate one more byte to make sure the string is Ux0000 terminated.
283       The overallocation is also used by fastsearch, which assumes that it's
284       safe to look at str[length] (without making any assumptions about what
285       it contains). */
286
287    oldstr = unicode->str;
288    unicode->str = PyObject_REALLOC(unicode->str,
289                                    sizeof(Py_UNICODE) * (length + 1));
290    if (!unicode->str) {
291        unicode->str = (Py_UNICODE *)oldstr;
292        PyErr_NoMemory();
293        return -1;
294    }
295    unicode->str[length] = 0;
296    unicode->length = length;
297
298  reset:
299    /* Reset the object caches */
300    if (unicode->defenc) {
301        Py_CLEAR(unicode->defenc);
302    }
303    unicode->hash = -1;
304
305    return 0;
306}
307
/* We allocate one more Py_UNICODE element than requested to make sure
   the string is Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
316
317static
318PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
319{
320    register PyUnicodeObject *unicode;
321
322    /* Optimization for empty strings */
323    if (length == 0 && unicode_empty != NULL) {
324        Py_INCREF(unicode_empty);
325        return unicode_empty;
326    }
327
328    /* Ensure we won't overflow the size. */
329    if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
330        return (PyUnicodeObject *)PyErr_NoMemory();
331    }
332
333    /* Unicode freelist & memory allocation */
334    if (free_list) {
335        unicode = free_list;
336        free_list = *(PyUnicodeObject **)unicode;
337        numfree--;
338        if (unicode->str) {
339            /* Keep-Alive optimization: we only upsize the buffer,
340               never downsize it. */
341            if ((unicode->length < length) &&
342                unicode_resize(unicode, length) < 0) {
343                PyObject_DEL(unicode->str);
344                unicode->str = NULL;
345            }
346        }
347        else {
348            size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
349            unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
350        }
351        PyObject_INIT(unicode, &PyUnicode_Type);
352    }
353    else {
354        size_t new_size;
355        unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
356        if (unicode == NULL)
357            return NULL;
358        new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
359        unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
360    }
361
362    if (!unicode->str) {
363        PyErr_NoMemory();
364        goto onError;
365    }
366    /* Initialize the first element to guard against cases where
367     * the caller fails before initializing str -- unicode_resize()
368     * reads str[0], and the Keep-Alive optimization can keep memory
369     * allocated for str alive across a call to unicode_dealloc(unicode).
370     * We don't want unicode_resize to read uninitialized memory in
371     * that case.
372     */
373    unicode->str[0] = 0;
374    unicode->str[length] = 0;
375    unicode->length = length;
376    unicode->hash = -1;
377    unicode->state = 0;
378    unicode->defenc = NULL;
379    return unicode;
380
381  onError:
382    /* XXX UNREF/NEWREF interface should be more symmetrical */
383    _Py_DEC_REFTOTAL;
384    _Py_ForgetReference((PyObject *)unicode);
385    PyObject_Del(unicode);
386    return NULL;
387}
388
389static
390void unicode_dealloc(register PyUnicodeObject *unicode)
391{
392    switch (PyUnicode_CHECK_INTERNED(unicode)) {
393    case SSTATE_NOT_INTERNED:
394        break;
395
396    case SSTATE_INTERNED_MORTAL:
397        /* revive dead object temporarily for DelItem */
398        Py_REFCNT(unicode) = 3;
399        if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
400            Py_FatalError(
401                "deletion of interned string failed");
402        break;
403
404    case SSTATE_INTERNED_IMMORTAL:
405        Py_FatalError("Immortal interned string died.");
406
407    default:
408        Py_FatalError("Inconsistent interned string state.");
409    }
410
411    if (PyUnicode_CheckExact(unicode) &&
412        numfree < PyUnicode_MAXFREELIST) {
413        /* Keep-Alive optimization */
414        if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
415            PyObject_DEL(unicode->str);
416            unicode->str = NULL;
417            unicode->length = 0;
418        }
419        if (unicode->defenc) {
420            Py_CLEAR(unicode->defenc);
421        }
422        /* Add to free list */
423        *(PyUnicodeObject **)unicode = free_list;
424        free_list = unicode;
425        numfree++;
426    }
427    else {
428        PyObject_DEL(unicode->str);
429        Py_XDECREF(unicode->defenc);
430        Py_TYPE(unicode)->tp_free((PyObject *)unicode);
431    }
432}
433
434static
435int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
436{
437    register PyUnicodeObject *v;
438
439    /* Argument checks */
440    if (unicode == NULL) {
441        PyErr_BadInternalCall();
442        return -1;
443    }
444    v = *unicode;
445    if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
446        PyErr_BadInternalCall();
447        return -1;
448    }
449
450    /* Resizing unicode_empty and single character objects is not
451       possible since these are being shared. We simply return a fresh
452       copy with the same Unicode content. */
453    if (v->length != length &&
454        (v == unicode_empty || v->length == 1)) {
455        PyUnicodeObject *w = _PyUnicode_New(length);
456        if (w == NULL)
457            return -1;
458        Py_UNICODE_COPY(w->str, v->str,
459                        length < v->length ? length : v->length);
460        Py_DECREF(*unicode);
461        *unicode = w;
462        return 0;
463    }
464
465    /* Note that we don't have to modify *unicode for unshared Unicode
466       objects, since we can modify them in-place. */
467    return unicode_resize(v, length);
468}
469
470int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
471{
472    return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
473}
474
475PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
476                                Py_ssize_t size)
477{
478    PyUnicodeObject *unicode;
479
480    /* If the Unicode data is known at construction time, we can apply
481       some optimizations which share commonly used objects. */
482    if (u != NULL) {
483
484        /* Optimization for empty strings */
485        if (size == 0 && unicode_empty != NULL) {
486            Py_INCREF(unicode_empty);
487            return (PyObject *)unicode_empty;
488        }
489
490        /* Single character Unicode objects in the Latin-1 range are
491           shared when using this constructor */
492        if (size == 1 && *u < 256) {
493            unicode = unicode_latin1[*u];
494            if (!unicode) {
495                unicode = _PyUnicode_New(1);
496                if (!unicode)
497                    return NULL;
498                unicode->str[0] = *u;
499                unicode_latin1[*u] = unicode;
500            }
501            Py_INCREF(unicode);
502            return (PyObject *)unicode;
503        }
504    }
505
506    unicode = _PyUnicode_New(size);
507    if (!unicode)
508        return NULL;
509
510    /* Copy the Unicode data into the new object */
511    if (u != NULL)
512        Py_UNICODE_COPY(unicode->str, u, size);
513
514    return (PyObject *)unicode;
515}
516
517PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
518{
519    PyUnicodeObject *unicode;
520
521    if (size < 0) {
522        PyErr_SetString(PyExc_SystemError,
523                        "Negative size passed to PyUnicode_FromStringAndSize");
524        return NULL;
525    }
526
527    /* If the Unicode data is known at construction time, we can apply
528       some optimizations which share commonly used objects.
529       Also, this means the input must be UTF-8, so fall back to the
530       UTF-8 decoder at the end. */
531    if (u != NULL) {
532
533        /* Optimization for empty strings */
534        if (size == 0 && unicode_empty != NULL) {
535            Py_INCREF(unicode_empty);
536            return (PyObject *)unicode_empty;
537        }
538
539        /* Single characters are shared when using this constructor.
540           Restrict to ASCII, since the input must be UTF-8. */
541        if (size == 1 && Py_CHARMASK(*u) < 128) {
542            unicode = unicode_latin1[Py_CHARMASK(*u)];
543            if (!unicode) {
544                unicode = _PyUnicode_New(1);
545                if (!unicode)
546                    return NULL;
547                unicode->str[0] = Py_CHARMASK(*u);
548                unicode_latin1[Py_CHARMASK(*u)] = unicode;
549            }
550            Py_INCREF(unicode);
551            return (PyObject *)unicode;
552        }
553
554        return PyUnicode_DecodeUTF8(u, size, NULL);
555    }
556
557    unicode = _PyUnicode_New(size);
558    if (!unicode)
559        return NULL;
560
561    return (PyObject *)unicode;
562}
563
564PyObject *PyUnicode_FromString(const char *u)
565{
566    size_t size = strlen(u);
567    if (size > PY_SSIZE_T_MAX) {
568        PyErr_SetString(PyExc_OverflowError, "input too long");
569        return NULL;
570    }
571
572    return PyUnicode_FromStringAndSize(u, size);
573}
574
575#ifdef HAVE_WCHAR_H
576
577#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
578# define CONVERT_WCHAR_TO_SURROGATES
579#endif
580
581#ifdef CONVERT_WCHAR_TO_SURROGATES
582
583/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
584   to convert from UTF32 to UTF16. */
585
586PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
587                                 Py_ssize_t size)
588{
589    PyUnicodeObject *unicode;
590    register Py_ssize_t i;
591    Py_ssize_t alloc;
592    const wchar_t *orig_w;
593
594    if (w == NULL) {
595        if (size == 0)
596            return PyUnicode_FromStringAndSize(NULL, 0);
597        PyErr_BadInternalCall();
598        return NULL;
599    }
600
601    if (size == -1) {
602        size = wcslen(w);
603    }
604
605    alloc = size;
606    orig_w = w;
607    for (i = size; i > 0; i--) {
608        if (*w > 0xFFFF)
609            alloc++;
610        w++;
611    }
612    w = orig_w;
613    unicode = _PyUnicode_New(alloc);
614    if (!unicode)
615        return NULL;
616
617    /* Copy the wchar_t data into the new object */
618    {
619        register Py_UNICODE *u;
620        u = PyUnicode_AS_UNICODE(unicode);
621        for (i = size; i > 0; i--) {
622            if (*w > 0xFFFF) {
623                wchar_t ordinal = *w++;
624                ordinal -= 0x10000;
625                *u++ = 0xD800 | (ordinal >> 10);
626                *u++ = 0xDC00 | (ordinal & 0x3FF);
627            }
628            else
629                *u++ = *w++;
630        }
631    }
632    return (PyObject *)unicode;
633}
634
635#else
636
637PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
638                                 Py_ssize_t size)
639{
640    PyUnicodeObject *unicode;
641
642    if (w == NULL) {
643        if (size == 0)
644            return PyUnicode_FromStringAndSize(NULL, 0);
645        PyErr_BadInternalCall();
646        return NULL;
647    }
648
649    if (size == -1) {
650        size = wcslen(w);
651    }
652
653    unicode = _PyUnicode_New(size);
654    if (!unicode)
655        return NULL;
656
657    /* Copy the wchar_t data into the new object */
658#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
659    memcpy(unicode->str, w, size * sizeof(wchar_t));
660#else
661    {
662        register Py_UNICODE *u;
663        register Py_ssize_t i;
664        u = PyUnicode_AS_UNICODE(unicode);
665        for (i = size; i > 0; i--)
666            *u++ = *w++;
667    }
668#endif
669
670    return (PyObject *)unicode;
671}
672
673#endif /* CONVERT_WCHAR_TO_SURROGATES */
674
675#undef CONVERT_WCHAR_TO_SURROGATES
676
677static void
678makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
679        int zeropad, int width, int precision, char c)
680{
681    *fmt++ = '%';
682    if (width) {
683        if (zeropad)
684            *fmt++ = '0';
685        fmt += sprintf(fmt, "%d", width);
686    }
687    if (precision)
688        fmt += sprintf(fmt, ".%d", precision);
689    if (longflag)
690        *fmt++ = 'l';
691    else if (longlongflag) {
692        /* longlongflag should only ever be nonzero on machines with
693           HAVE_LONG_LONG defined */
694#ifdef HAVE_LONG_LONG
695        char *f = PY_FORMAT_LONG_LONG;
696        while (*f)
697            *fmt++ = *f++;
698#else
699        /* we shouldn't ever get here */
700        assert(0);
701        *fmt++ = 'l';
702#endif
703    }
704    else if (size_tflag) {
705        char *f = PY_FORMAT_SIZE_T;
706        while (*f)
707            *fmt++ = *f++;
708    }
709    *fmt++ = c;
710    *fmt = '\0';
711}
712
/* Append the NUL-terminated string `string` at the output cursor `s`,
   advancing `s` past the copied characters (no terminator is written).
   The expansion uses the names `s` and `copy`, which must both be
   declared where the macro is used. */
#define appendstring(string) {copy = string; while (*copy) *s++ = *copy++;}

/* size of fixed-size buffer for formatting single arguments */
#define ITEM_BUFFER_LEN 21
/* maximum number of characters required for output of %ld.  21 characters
   allows for 64-bit integers (in decimal) and an optional sign. */
#define MAX_LONG_CHARS 21
/* maximum number of characters required for output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
724
725PyObject *
726PyUnicode_FromFormatV(const char *format, va_list vargs)
727{
728    va_list count;
729    Py_ssize_t callcount = 0;
730    PyObject **callresults = NULL;
731    PyObject **callresult = NULL;
732    Py_ssize_t n = 0;
733    int width = 0;
734    int precision = 0;
735    int zeropad;
736    const char* f;
737    Py_UNICODE *s;
738    PyObject *string;
739    /* used by sprintf */
740    char buffer[ITEM_BUFFER_LEN+1];
741    /* use abuffer instead of buffer, if we need more space
742     * (which can happen if there's a format specifier with width). */
743    char *abuffer = NULL;
744    char *realbuffer;
745    Py_ssize_t abuffersize = 0;
746    char fmt[61]; /* should be enough for %0width.precisionlld */
747    const char *copy;
748
749    Py_VA_COPY(count, vargs);
750    /* step 1: count the number of %S/%R/%A/%s format specifications
751     * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
752     * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
753     * result in an array) */
754    for (f = format; *f; f++) {
755         if (*f == '%') {
756             if (*(f+1)=='%')
757                 continue;
758             if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
759                 ++callcount;
760             while (Py_ISDIGIT((unsigned)*f))
761                 width = (width*10) + *f++ - '0';
762             while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
763                 ;
764             if (*f == 's')
765                 ++callcount;
766         }
767         else if (128 <= (unsigned char)*f) {
768             PyErr_Format(PyExc_ValueError,
769                "PyUnicode_FromFormatV() expects an ASCII-encoded format "
770                "string, got a non-ASCII byte: 0x%02x",
771                (unsigned char)*f);
772             return NULL;
773         }
774    }
775    /* step 2: allocate memory for the results of
776     * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
777    if (callcount) {
778        callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
779        if (!callresults) {
780            PyErr_NoMemory();
781            return NULL;
782        }
783        callresult = callresults;
784    }
785    /* step 3: figure out how large a buffer we need */
786    for (f = format; *f; f++) {
787        if (*f == '%') {
788#ifdef HAVE_LONG_LONG
789            int longlongflag = 0;
790#endif
791            const char* p = f;
792            width = 0;
793            while (Py_ISDIGIT((unsigned)*f))
794                width = (width*10) + *f++ - '0';
795            while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
796                ;
797
798            /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
799             * they don't affect the amount of space we reserve.
800             */
801            if (*f == 'l') {
802                if (f[1] == 'd' || f[1] == 'u') {
803                    ++f;
804                }
805#ifdef HAVE_LONG_LONG
806                else if (f[1] == 'l' &&
807                         (f[2] == 'd' || f[2] == 'u')) {
808                    longlongflag = 1;
809                    f += 2;
810                }
811#endif
812            }
813            else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
814                ++f;
815            }
816
817            switch (*f) {
818            case 'c':
819                (void)va_arg(count, int);
820                /* fall through... */
821            case '%':
822                n++;
823                break;
824            case 'd': case 'u': case 'i': case 'x':
825                (void) va_arg(count, int);
826#ifdef HAVE_LONG_LONG
827                if (longlongflag) {
828                    if (width < MAX_LONG_LONG_CHARS)
829                        width = MAX_LONG_LONG_CHARS;
830                }
831                else
832#endif
833                    /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
834                       including sign.  Decimal takes the most space.  This
835                       isn't enough for octal.  If a width is specified we
836                       need more (which we allocate later). */
837                    if (width < MAX_LONG_CHARS)
838                        width = MAX_LONG_CHARS;
839                n += width;
840                /* XXX should allow for large precision here too. */
841                if (abuffersize < width)
842                    abuffersize = width;
843                break;
844            case 's':
845            {
846                /* UTF-8 */
847                const char *s = va_arg(count, const char*);
848                PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
849                if (!str)
850                    goto fail;
851                n += PyUnicode_GET_SIZE(str);
852                /* Remember the str and switch to the next slot */
853                *callresult++ = str;
854                break;
855            }
856            case 'U':
857            {
858                PyObject *obj = va_arg(count, PyObject *);
859                assert(obj && PyUnicode_Check(obj));
860                n += PyUnicode_GET_SIZE(obj);
861                break;
862            }
863            case 'V':
864            {
865                PyObject *obj = va_arg(count, PyObject *);
866                const char *str = va_arg(count, const char *);
867                assert(obj || str);
868                assert(!obj || PyUnicode_Check(obj));
869                if (obj)
870                    n += PyUnicode_GET_SIZE(obj);
871                else
872                    n += strlen(str);
873                break;
874            }
875            case 'S':
876            {
877                PyObject *obj = va_arg(count, PyObject *);
878                PyObject *str;
879                assert(obj);
880                str = PyObject_Str(obj);
881                if (!str)
882                    goto fail;
883                n += PyUnicode_GET_SIZE(str);
884                /* Remember the str and switch to the next slot */
885                *callresult++ = str;
886                break;
887            }
888            case 'R':
889            {
890                PyObject *obj = va_arg(count, PyObject *);
891                PyObject *repr;
892                assert(obj);
893                repr = PyObject_Repr(obj);
894                if (!repr)
895                    goto fail;
896                n += PyUnicode_GET_SIZE(repr);
897                /* Remember the repr and switch to the next slot */
898                *callresult++ = repr;
899                break;
900            }
901            case 'A':
902            {
903                PyObject *obj = va_arg(count, PyObject *);
904                PyObject *ascii;
905                assert(obj);
906                ascii = PyObject_ASCII(obj);
907                if (!ascii)
908                    goto fail;
909                n += PyUnicode_GET_SIZE(ascii);
910                /* Remember the repr and switch to the next slot */
911                *callresult++ = ascii;
912                break;
913            }
914            case 'p':
915                (void) va_arg(count, int);
916                /* maximum 64-bit pointer representation:
917                 * 0xffffffffffffffff
918                 * so 19 characters is enough.
919                 * XXX I count 18 -- what's the extra for?
920                 */
921                n += 19;
922                break;
923            default:
924                /* if we stumble upon an unknown
925                   formatting code, copy the rest of
926                   the format string to the output
927                   string. (we cannot just skip the
928                   code, since there's no way to know
929                   what's in the argument list) */
930                n += strlen(p);
931                goto expand;
932            }
933        } else
934            n++;
935    }
936  expand:
937    if (abuffersize > ITEM_BUFFER_LEN) {
938        /* add 1 for sprintf's trailing null byte */
939        abuffer = PyObject_Malloc(abuffersize + 1);
940        if (!abuffer) {
941            PyErr_NoMemory();
942            goto fail;
943        }
944        realbuffer = abuffer;
945    }
946    else
947        realbuffer = buffer;
948    /* step 4: fill the buffer */
949    /* Since we've analyzed how much space we need for the worst case,
950       we don't have to resize the string.
951       There can be no errors beyond this point. */
952    string = PyUnicode_FromUnicode(NULL, n);
953    if (!string)
954        goto fail;
955
956    s = PyUnicode_AS_UNICODE(string);
957    callresult = callresults;
958
959    for (f = format; *f; f++) {
960        if (*f == '%') {
961            const char* p = f++;
962            int longflag = 0;
963            int longlongflag = 0;
964            int size_tflag = 0;
965            zeropad = (*f == '0');
966            /* parse the width.precision part */
967            width = 0;
968            while (Py_ISDIGIT((unsigned)*f))
969                width = (width*10) + *f++ - '0';
970            precision = 0;
971            if (*f == '.') {
972                f++;
973                while (Py_ISDIGIT((unsigned)*f))
974                    precision = (precision*10) + *f++ - '0';
975            }
976            /* Handle %ld, %lu, %lld and %llu. */
977            if (*f == 'l') {
978                if (f[1] == 'd' || f[1] == 'u') {
979                    longflag = 1;
980                    ++f;
981                }
982#ifdef HAVE_LONG_LONG
983                else if (f[1] == 'l' &&
984                         (f[2] == 'd' || f[2] == 'u')) {
985                    longlongflag = 1;
986                    f += 2;
987                }
988#endif
989            }
990            /* handle the size_t flag. */
991            if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
992                size_tflag = 1;
993                ++f;
994            }
995
996            switch (*f) {
997            case 'c':
998                *s++ = va_arg(vargs, int);
999                break;
1000            case 'd':
1001                makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1002                        width, precision, 'd');
1003                if (longflag)
1004                    sprintf(realbuffer, fmt, va_arg(vargs, long));
1005#ifdef HAVE_LONG_LONG
1006                else if (longlongflag)
1007                    sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1008#endif
1009                else if (size_tflag)
1010                    sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1011                else
1012                    sprintf(realbuffer, fmt, va_arg(vargs, int));
1013                appendstring(realbuffer);
1014                break;
1015            case 'u':
1016                makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1017                        width, precision, 'u');
1018                if (longflag)
1019                    sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
1020#ifdef HAVE_LONG_LONG
1021                else if (longlongflag)
1022                    sprintf(realbuffer, fmt, va_arg(vargs,
1023                                                    unsigned PY_LONG_LONG));
1024#endif
1025                else if (size_tflag)
1026                    sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1027                else
1028                    sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1029                appendstring(realbuffer);
1030                break;
1031            case 'i':
1032                makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
1033                sprintf(realbuffer, fmt, va_arg(vargs, int));
1034                appendstring(realbuffer);
1035                break;
1036            case 'x':
1037                makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
1038                sprintf(realbuffer, fmt, va_arg(vargs, int));
1039                appendstring(realbuffer);
1040                break;
1041            case 's':
1042            {
1043                /* unused, since we already have the result */
1044                (void) va_arg(vargs, char *);
1045                Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1046                                PyUnicode_GET_SIZE(*callresult));
1047                s += PyUnicode_GET_SIZE(*callresult);
1048                /* We're done with the unicode()/repr() => forget it */
1049                Py_DECREF(*callresult);
1050                /* switch to next unicode()/repr() result */
1051                ++callresult;
1052                break;
1053            }
1054            case 'U':
1055            {
1056                PyObject *obj = va_arg(vargs, PyObject *);
1057                Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1058                Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1059                s += size;
1060                break;
1061            }
1062            case 'V':
1063            {
1064                PyObject *obj = va_arg(vargs, PyObject *);
1065                const char *str = va_arg(vargs, const char *);
1066                if (obj) {
1067                    Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1068                    Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1069                    s += size;
1070                } else {
1071                    appendstring(str);
1072                }
1073                break;
1074            }
1075            case 'S':
1076            case 'R':
1077            case 'A':
1078            {
1079                Py_UNICODE *ucopy;
1080                Py_ssize_t usize;
1081                Py_ssize_t upos;
1082                /* unused, since we already have the result */
1083                (void) va_arg(vargs, PyObject *);
1084                ucopy = PyUnicode_AS_UNICODE(*callresult);
1085                usize = PyUnicode_GET_SIZE(*callresult);
1086                for (upos = 0; upos<usize;)
1087                    *s++ = ucopy[upos++];
1088                /* We're done with the unicode()/repr() => forget it */
1089                Py_DECREF(*callresult);
1090                /* switch to next unicode()/repr() result */
1091                ++callresult;
1092                break;
1093            }
1094            case 'p':
1095                sprintf(buffer, "%p", va_arg(vargs, void*));
1096                /* %p is ill-defined:  ensure leading 0x. */
1097                if (buffer[1] == 'X')
1098                    buffer[1] = 'x';
1099                else if (buffer[1] != 'x') {
1100                    memmove(buffer+2, buffer, strlen(buffer)+1);
1101                    buffer[0] = '0';
1102                    buffer[1] = 'x';
1103                }
1104                appendstring(buffer);
1105                break;
1106            case '%':
1107                *s++ = '%';
1108                break;
1109            default:
1110                appendstring(p);
1111                goto end;
1112            }
1113        }
1114        else
1115            *s++ = *f;
1116    }
1117
1118  end:
1119    if (callresults)
1120        PyObject_Free(callresults);
1121    if (abuffer)
1122        PyObject_Free(abuffer);
1123    PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1124    return string;
1125  fail:
1126    if (callresults) {
1127        PyObject **callresult2 = callresults;
1128        while (callresult2 < callresult) {
1129            Py_DECREF(*callresult2);
1130            ++callresult2;
1131        }
1132        PyObject_Free(callresults);
1133    }
1134    if (abuffer)
1135        PyObject_Free(abuffer);
1136    return NULL;
1137}
1138
1139#undef appendstring
1140
1141PyObject *
1142PyUnicode_FromFormat(const char *format, ...)
1143{
1144    PyObject* ret;
1145    va_list vargs;
1146
1147#ifdef HAVE_STDARG_PROTOTYPES
1148    va_start(vargs, format);
1149#else
1150    va_start(vargs);
1151#endif
1152    ret = PyUnicode_FromFormatV(format, vargs);
1153    va_end(vargs);
1154    return ret;
1155}
1156
1157/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1158   convert a Unicode object to a wide character string.
1159
1160   - If w is NULL: return the number of wide characters (including the nul
1161     character) required to convert the unicode object. Ignore size argument.
1162
1163   - Otherwise: return the number of wide characters (excluding the nul
1164     character) written into w. Write at most size wide characters (including
1165     the nul character). */
1166static Py_ssize_t
1167unicode_aswidechar(PyUnicodeObject *unicode,
1168                   wchar_t *w,
1169                   Py_ssize_t size)
1170{
1171#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
1172    Py_ssize_t res;
1173    if (w != NULL) {
1174        res = PyUnicode_GET_SIZE(unicode);
1175        if (size > res)
1176            size = res + 1;
1177        else
1178            res = size;
1179        memcpy(w, unicode->str, size * sizeof(wchar_t));
1180        return res;
1181    }
1182    else
1183        return PyUnicode_GET_SIZE(unicode) + 1;
1184#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4
1185    register const Py_UNICODE *u;
1186    const Py_UNICODE *uend;
1187    const wchar_t *worig, *wend;
1188    Py_ssize_t nchar;
1189
1190    u = PyUnicode_AS_UNICODE(unicode);
1191    uend = u + PyUnicode_GET_SIZE(unicode);
1192    if (w != NULL) {
1193        worig = w;
1194        wend = w + size;
1195        while (u != uend && w != wend) {
1196            if (0xD800 <= u[0] && u[0] <= 0xDBFF
1197                && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1198            {
1199                *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000;
1200                u += 2;
1201            }
1202            else {
1203                *w = *u;
1204                u++;
1205            }
1206            w++;
1207        }
1208        if (w != wend)
1209            *w = L'\0';
1210        return w - worig;
1211    }
1212    else {
1213        nchar = 1; /* nul character at the end */
1214        while (u != uend) {
1215            if (0xD800 <= u[0] && u[0] <= 0xDBFF
1216                && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1217                u += 2;
1218            else
1219                u++;
1220            nchar++;
1221        }
1222    }
1223    return nchar;
1224#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2
1225    register Py_UNICODE *u, *uend, ordinal;
1226    register Py_ssize_t i;
1227    wchar_t *worig, *wend;
1228    Py_ssize_t nchar;
1229
1230    u = PyUnicode_AS_UNICODE(unicode);
1231    uend = u + PyUnicode_GET_SIZE(u);
1232    if (w != NULL) {
1233        worig = w;
1234        wend = w + size;
1235        while (u != uend && w != wend) {
1236            ordinal = *u;
1237            if (ordinal > 0xffff) {
1238                ordinal -= 0x10000;
1239                *w++ = 0xD800 | (ordinal >> 10);
1240                *w++ = 0xDC00 | (ordinal & 0x3FF);
1241            }
1242            else
1243                *w++ = ordinal;
1244            u++;
1245        }
1246        if (w != wend)
1247            *w = 0;
1248        return w - worig;
1249    }
1250    else {
1251        nchar = 1; /* nul character */
1252        while (u != uend) {
1253            if (*u > 0xffff)
1254                nchar += 2;
1255            else
1256                nchar++;
1257            u++;
1258        }
1259        return nchar;
1260    }
1261#else
1262#  error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670"
1263#endif
1264}
1265
1266Py_ssize_t
1267PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1268                     wchar_t *w,
1269                     Py_ssize_t size)
1270{
1271    if (unicode == NULL) {
1272        PyErr_BadInternalCall();
1273        return -1;
1274    }
1275    return unicode_aswidechar(unicode, w, size);
1276}
1277
1278wchar_t*
1279PyUnicode_AsWideCharString(PyObject *unicode,
1280                           Py_ssize_t *size)
1281{
1282    wchar_t* buffer;
1283    Py_ssize_t buflen;
1284
1285    if (unicode == NULL) {
1286        PyErr_BadInternalCall();
1287        return NULL;
1288    }
1289
1290    buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
1291    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
1292        PyErr_NoMemory();
1293        return NULL;
1294    }
1295
1296    buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1297    if (buffer == NULL) {
1298        PyErr_NoMemory();
1299        return NULL;
1300    }
1301    buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
1302    if (size != NULL)
1303        *size = buflen;
1304    return buffer;
1305}
1306
1307#endif
1308
1309PyObject *PyUnicode_FromOrdinal(int ordinal)
1310{
1311    Py_UNICODE s[2];
1312
1313    if (ordinal < 0 || ordinal > 0x10ffff) {
1314        PyErr_SetString(PyExc_ValueError,
1315                        "chr() arg not in range(0x110000)");
1316        return NULL;
1317    }
1318
1319#ifndef Py_UNICODE_WIDE
1320    if (ordinal > 0xffff) {
1321        ordinal -= 0x10000;
1322        s[0] = 0xD800 | (ordinal >> 10);
1323        s[1] = 0xDC00 | (ordinal & 0x3FF);
1324        return PyUnicode_FromUnicode(s, 2);
1325    }
1326#endif
1327
1328    s[0] = (Py_UNICODE)ordinal;
1329    return PyUnicode_FromUnicode(s, 1);
1330}
1331
1332PyObject *PyUnicode_FromObject(register PyObject *obj)
1333{
1334    /* XXX Perhaps we should make this API an alias of
1335       PyObject_Str() instead ?! */
1336    if (PyUnicode_CheckExact(obj)) {
1337        Py_INCREF(obj);
1338        return obj;
1339    }
1340    if (PyUnicode_Check(obj)) {
1341        /* For a Unicode subtype that's not a Unicode object,
1342           return a true Unicode object with the same data. */
1343        return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1344                                     PyUnicode_GET_SIZE(obj));
1345    }
1346    PyErr_Format(PyExc_TypeError,
1347                 "Can't convert '%.100s' object to str implicitly",
1348                 Py_TYPE(obj)->tp_name);
1349    return NULL;
1350}
1351
1352PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1353                                      const char *encoding,
1354                                      const char *errors)
1355{
1356    Py_buffer buffer;
1357    PyObject *v;
1358
1359    if (obj == NULL) {
1360        PyErr_BadInternalCall();
1361        return NULL;
1362    }
1363
1364    /* Decoding bytes objects is the most common case and should be fast */
1365    if (PyBytes_Check(obj)) {
1366        if (PyBytes_GET_SIZE(obj) == 0) {
1367            Py_INCREF(unicode_empty);
1368            v = (PyObject *) unicode_empty;
1369        }
1370        else {
1371            v = PyUnicode_Decode(
1372                    PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1373                    encoding, errors);
1374        }
1375        return v;
1376    }
1377
1378    if (PyUnicode_Check(obj)) {
1379        PyErr_SetString(PyExc_TypeError,
1380                        "decoding str is not supported");
1381        return NULL;
1382    }
1383
1384    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1385    if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1386        PyErr_Format(PyExc_TypeError,
1387                     "coercing to str: need bytes, bytearray "
1388                     "or buffer-like object, %.80s found",
1389                     Py_TYPE(obj)->tp_name);
1390        return NULL;
1391    }
1392
1393    if (buffer.len == 0) {
1394        Py_INCREF(unicode_empty);
1395        v = (PyObject *) unicode_empty;
1396    }
1397    else
1398        v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
1399
1400    PyBuffer_Release(&buffer);
1401    return v;
1402}
1403
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.

   encoding:  nul-terminated input name.
   lower:     output buffer receiving the normalized, nul-terminated name.
   lower_len: capacity of `lower` in bytes, terminator included.

   Return 0 on error (encoding is longer than lower_len-1, or lower_len is 0
   so that not even the terminator fits), 1 on success.  On error the
   contents of `lower` are not nul-terminated. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* lower_len is unsigned: without this guard, lower_len - 1 would wrap
       around and the bounds check below would never trigger. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    l_end = &lower[lower_len - 1];  /* slot reserved for the terminator */
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
1436
1437PyObject *PyUnicode_Decode(const char *s,
1438                           Py_ssize_t size,
1439                           const char *encoding,
1440                           const char *errors)
1441{
1442    PyObject *buffer = NULL, *unicode;
1443    Py_buffer info;
1444    char lower[11];  /* Enough for any encoding shortcut */
1445
1446    if (encoding == NULL)
1447        encoding = PyUnicode_GetDefaultEncoding();
1448
1449    /* Shortcuts for common default encodings */
1450    if (normalize_encoding(encoding, lower, sizeof(lower))) {
1451        if (strcmp(lower, "utf-8") == 0)
1452            return PyUnicode_DecodeUTF8(s, size, errors);
1453        else if ((strcmp(lower, "latin-1") == 0) ||
1454                 (strcmp(lower, "iso-8859-1") == 0))
1455            return PyUnicode_DecodeLatin1(s, size, errors);
1456#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1457        else if (strcmp(lower, "mbcs") == 0)
1458            return PyUnicode_DecodeMBCS(s, size, errors);
1459#endif
1460        else if (strcmp(lower, "ascii") == 0)
1461            return PyUnicode_DecodeASCII(s, size, errors);
1462        else if (strcmp(lower, "utf-16") == 0)
1463            return PyUnicode_DecodeUTF16(s, size, errors, 0);
1464        else if (strcmp(lower, "utf-32") == 0)
1465            return PyUnicode_DecodeUTF32(s, size, errors, 0);
1466    }
1467
1468    /* Decode via the codec registry */
1469    buffer = NULL;
1470    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
1471        goto onError;
1472    buffer = PyMemoryView_FromBuffer(&info);
1473    if (buffer == NULL)
1474        goto onError;
1475    unicode = PyCodec_Decode(buffer, encoding, errors);
1476    if (unicode == NULL)
1477        goto onError;
1478    if (!PyUnicode_Check(unicode)) {
1479        PyErr_Format(PyExc_TypeError,
1480                     "decoder did not return a str object (type=%.400s)",
1481                     Py_TYPE(unicode)->tp_name);
1482        Py_DECREF(unicode);
1483        goto onError;
1484    }
1485    Py_DECREF(buffer);
1486    return unicode;
1487
1488  onError:
1489    Py_XDECREF(buffer);
1490    return NULL;
1491}
1492
1493PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1494                                    const char *encoding,
1495                                    const char *errors)
1496{
1497    PyObject *v;
1498
1499    if (!PyUnicode_Check(unicode)) {
1500        PyErr_BadArgument();
1501        goto onError;
1502    }
1503
1504    if (encoding == NULL)
1505        encoding = PyUnicode_GetDefaultEncoding();
1506
1507    /* Decode via the codec registry */
1508    v = PyCodec_Decode(unicode, encoding, errors);
1509    if (v == NULL)
1510        goto onError;
1511    return v;
1512
1513  onError:
1514    return NULL;
1515}
1516
1517PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1518                                     const char *encoding,
1519                                     const char *errors)
1520{
1521    PyObject *v;
1522
1523    if (!PyUnicode_Check(unicode)) {
1524        PyErr_BadArgument();
1525        goto onError;
1526    }
1527
1528    if (encoding == NULL)
1529        encoding = PyUnicode_GetDefaultEncoding();
1530
1531    /* Decode via the codec registry */
1532    v = PyCodec_Decode(unicode, encoding, errors);
1533    if (v == NULL)
1534        goto onError;
1535    if (!PyUnicode_Check(v)) {
1536        PyErr_Format(PyExc_TypeError,
1537                     "decoder did not return a str object (type=%.400s)",
1538                     Py_TYPE(v)->tp_name);
1539        Py_DECREF(v);
1540        goto onError;
1541    }
1542    return v;
1543
1544  onError:
1545    return NULL;
1546}
1547
1548PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1549                           Py_ssize_t size,
1550                           const char *encoding,
1551                           const char *errors)
1552{
1553    PyObject *v, *unicode;
1554
1555    unicode = PyUnicode_FromUnicode(s, size);
1556    if (unicode == NULL)
1557        return NULL;
1558    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1559    Py_DECREF(unicode);
1560    return v;
1561}
1562
1563PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1564                                    const char *encoding,
1565                                    const char *errors)
1566{
1567    PyObject *v;
1568
1569    if (!PyUnicode_Check(unicode)) {
1570        PyErr_BadArgument();
1571        goto onError;
1572    }
1573
1574    if (encoding == NULL)
1575        encoding = PyUnicode_GetDefaultEncoding();
1576
1577    /* Encode via the codec registry */
1578    v = PyCodec_Encode(unicode, encoding, errors);
1579    if (v == NULL)
1580        goto onError;
1581    return v;
1582
1583  onError:
1584    return NULL;
1585}
1586
1587PyObject *
1588PyUnicode_EncodeFSDefault(PyObject *unicode)
1589{
1590#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1591    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1592                                PyUnicode_GET_SIZE(unicode),
1593                                NULL);
1594#elif defined(__APPLE__)
1595    return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1596                                PyUnicode_GET_SIZE(unicode),
1597                                "surrogateescape");
1598#else
1599    if (Py_FileSystemDefaultEncoding) {
1600        return PyUnicode_AsEncodedString(unicode,
1601                                         Py_FileSystemDefaultEncoding,
1602                                         "surrogateescape");
1603    }
1604    else {
1605        /* locale encoding with surrogateescape */
1606        wchar_t *wchar;
1607        char *bytes;
1608        PyObject *bytes_obj;
1609
1610        wchar = PyUnicode_AsWideCharString(unicode, NULL);
1611        if (wchar == NULL)
1612            return NULL;
1613        bytes = _Py_wchar2char(wchar);
1614        PyMem_Free(wchar);
1615        if (bytes == NULL)
1616            return NULL;
1617
1618        bytes_obj = PyBytes_FromString(bytes);
1619        PyMem_Free(bytes);
1620        return bytes_obj;
1621    }
1622#endif
1623}
1624
1625PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1626                                    const char *encoding,
1627                                    const char *errors)
1628{
1629    PyObject *v;
1630    char lower[11];  /* Enough for any encoding shortcut */
1631
1632    if (!PyUnicode_Check(unicode)) {
1633        PyErr_BadArgument();
1634        return NULL;
1635    }
1636
1637    if (encoding == NULL)
1638        encoding = PyUnicode_GetDefaultEncoding();
1639
1640    /* Shortcuts for common default encodings */
1641    if (normalize_encoding(encoding, lower, sizeof(lower))) {
1642        if (strcmp(lower, "utf-8") == 0)
1643            return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1644                                        PyUnicode_GET_SIZE(unicode),
1645                                        errors);
1646        else if ((strcmp(lower, "latin-1") == 0) ||
1647                 (strcmp(lower, "iso-8859-1") == 0))
1648            return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1649                                          PyUnicode_GET_SIZE(unicode),
1650                                          errors);
1651#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1652        else if (strcmp(lower, "mbcs") == 0)
1653            return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1654                                        PyUnicode_GET_SIZE(unicode),
1655                                        errors);
1656#endif
1657        else if (strcmp(lower, "ascii") == 0)
1658            return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1659                                         PyUnicode_GET_SIZE(unicode),
1660                                         errors);
1661    }
1662    /* During bootstrap, we may need to find the encodings
1663       package, to load the file system encoding, and require the
1664       file system encoding in order to load the encodings
1665       package.
1666
1667       Break out of this dependency by assuming that the path to
1668       the encodings module is ASCII-only.  XXX could try wcstombs
1669       instead, if the file system encoding is the locale's
1670       encoding. */
1671    if (Py_FileSystemDefaultEncoding &&
1672             strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1673             !PyThreadState_GET()->interp->codecs_initialized)
1674        return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1675                                     PyUnicode_GET_SIZE(unicode),
1676                                     errors);
1677
1678    /* Encode via the codec registry */
1679    v = PyCodec_Encode(unicode, encoding, errors);
1680    if (v == NULL)
1681        return NULL;
1682
1683    /* The normal path */
1684    if (PyBytes_Check(v))
1685        return v;
1686
1687    /* If the codec returns a buffer, raise a warning and convert to bytes */
1688    if (PyByteArray_Check(v)) {
1689        int error;
1690        PyObject *b;
1691
1692        error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
1693            "encoder %s returned bytearray instead of bytes",
1694            encoding);
1695        if (error) {
1696            Py_DECREF(v);
1697            return NULL;
1698        }
1699
1700        b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1701        Py_DECREF(v);
1702        return b;
1703    }
1704
1705    PyErr_Format(PyExc_TypeError,
1706                 "encoder did not return a bytes object (type=%.400s)",
1707                 Py_TYPE(v)->tp_name);
1708    Py_DECREF(v);
1709    return NULL;
1710}
1711
1712PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1713                                     const char *encoding,
1714                                     const char *errors)
1715{
1716    PyObject *v;
1717
1718    if (!PyUnicode_Check(unicode)) {
1719        PyErr_BadArgument();
1720        goto onError;
1721    }
1722
1723    if (encoding == NULL)
1724        encoding = PyUnicode_GetDefaultEncoding();
1725
1726    /* Encode via the codec registry */
1727    v = PyCodec_Encode(unicode, encoding, errors);
1728    if (v == NULL)
1729        goto onError;
1730    if (!PyUnicode_Check(v)) {
1731        PyErr_Format(PyExc_TypeError,
1732                     "encoder did not return an str object (type=%.400s)",
1733                     Py_TYPE(v)->tp_name);
1734        Py_DECREF(v);
1735        goto onError;
1736    }
1737    return v;
1738
1739  onError:
1740    return NULL;
1741}
1742
1743PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1744                                            const char *errors)
1745{
1746    PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1747    if (v)
1748        return v;
1749    if (errors != NULL)
1750        Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
1751    v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1752                             PyUnicode_GET_SIZE(unicode),
1753                             NULL);
1754    if (!v)
1755        return NULL;
1756    ((PyUnicodeObject *)unicode)->defenc = v;
1757    return v;
1758}
1759
1760PyObject*
1761PyUnicode_DecodeFSDefault(const char *s) {
1762    Py_ssize_t size = (Py_ssize_t)strlen(s);
1763    return PyUnicode_DecodeFSDefaultAndSize(s, size);
1764}
1765
1766PyObject*
1767PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1768{
1769#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1770    return PyUnicode_DecodeMBCS(s, size, NULL);
1771#elif defined(__APPLE__)
1772    return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
1773#else
1774    /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1775       can be undefined. If it is case, decode using UTF-8. The following assumes
1776       that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1777       bootstrapping process where the codecs aren't ready yet.
1778    */
1779    if (Py_FileSystemDefaultEncoding) {
1780        return PyUnicode_Decode(s, size,
1781                                Py_FileSystemDefaultEncoding,
1782                                "surrogateescape");
1783    }
1784    else {
1785        /* locale encoding with surrogateescape */
1786        wchar_t *wchar;
1787        PyObject *unicode;
1788        size_t len;
1789
1790        if (s[size] != '\0' || size != strlen(s)) {
1791            PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1792            return NULL;
1793        }
1794
1795        wchar = _Py_char2wchar(s, &len);
1796        if (wchar == NULL)
1797            return NULL;
1798
1799        unicode = PyUnicode_FromWideChar(wchar, len);
1800        PyMem_Free(wchar);
1801        return unicode;
1802    }
1803#endif
1804}
1805
1806
1807int
1808PyUnicode_FSConverter(PyObject* arg, void* addr)
1809{
1810    PyObject *output = NULL;
1811    Py_ssize_t size;
1812    void *data;
1813    if (arg == NULL) {
1814        Py_DECREF(*(PyObject**)addr);
1815        return 1;
1816    }
1817    if (PyBytes_Check(arg)) {
1818        output = arg;
1819        Py_INCREF(output);
1820    }
1821    else {
1822        arg = PyUnicode_FromObject(arg);
1823        if (!arg)
1824            return 0;
1825        output = PyUnicode_EncodeFSDefault(arg);
1826        Py_DECREF(arg);
1827        if (!output)
1828            return 0;
1829        if (!PyBytes_Check(output)) {
1830            Py_DECREF(output);
1831            PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1832            return 0;
1833        }
1834    }
1835    size = PyBytes_GET_SIZE(output);
1836    data = PyBytes_AS_STRING(output);
1837    if (size != strlen(data)) {
1838        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1839        Py_DECREF(output);
1840        return 0;
1841    }
1842    *(PyObject**)addr = output;
1843    return Py_CLEANUP_SUPPORTED;
1844}
1845
1846
1847int
1848PyUnicode_FSDecoder(PyObject* arg, void* addr)
1849{
1850    PyObject *output = NULL;
1851    Py_ssize_t size;
1852    void *data;
1853    if (arg == NULL) {
1854        Py_DECREF(*(PyObject**)addr);
1855        return 1;
1856    }
1857    if (PyUnicode_Check(arg)) {
1858        output = arg;
1859        Py_INCREF(output);
1860    }
1861    else {
1862        arg = PyBytes_FromObject(arg);
1863        if (!arg)
1864            return 0;
1865        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
1866                                                  PyBytes_GET_SIZE(arg));
1867        Py_DECREF(arg);
1868        if (!output)
1869            return 0;
1870        if (!PyUnicode_Check(output)) {
1871            Py_DECREF(output);
1872            PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
1873            return 0;
1874        }
1875    }
1876    size = PyUnicode_GET_SIZE(output);
1877    data = PyUnicode_AS_UNICODE(output);
1878    if (size != Py_UNICODE_strlen(data)) {
1879        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1880        Py_DECREF(output);
1881        return 0;
1882    }
1883    *(PyObject**)addr = output;
1884    return Py_CLEANUP_SUPPORTED;
1885}
1886
1887
1888char*
1889_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
1890{
1891    PyObject *bytes;
1892    if (!PyUnicode_Check(unicode)) {
1893        PyErr_BadArgument();
1894        return NULL;
1895    }
1896    bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1897    if (bytes == NULL)
1898        return NULL;
1899    if (psize != NULL)
1900        *psize = PyBytes_GET_SIZE(bytes);
1901    return PyBytes_AS_STRING(bytes);
1902}
1903
1904char*
1905_PyUnicode_AsString(PyObject *unicode)
1906{
1907    return _PyUnicode_AsStringAndSize(unicode, NULL);
1908}
1909
1910Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1911{
1912    if (!PyUnicode_Check(unicode)) {
1913        PyErr_BadArgument();
1914        goto onError;
1915    }
1916    return PyUnicode_AS_UNICODE(unicode);
1917
1918  onError:
1919    return NULL;
1920}
1921
1922Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1923{
1924    if (!PyUnicode_Check(unicode)) {
1925        PyErr_BadArgument();
1926        goto onError;
1927    }
1928    return PyUnicode_GET_SIZE(unicode);
1929
1930  onError:
1931    return -1;
1932}
1933
/* Return the name of the default encoding; it is fixed to "utf-8". */
const char *PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
1938
1939/* create or adjust a UnicodeDecodeError */
1940static void
1941make_decode_exception(PyObject **exceptionObject,
1942                      const char *encoding,
1943                      const char *input, Py_ssize_t length,
1944                      Py_ssize_t startpos, Py_ssize_t endpos,
1945                      const char *reason)
1946{
1947    if (*exceptionObject == NULL) {
1948        *exceptionObject = PyUnicodeDecodeError_Create(
1949            encoding, input, length, startpos, endpos, reason);
1950    }
1951    else {
1952        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
1953            goto onError;
1954        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
1955            goto onError;
1956        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1957            goto onError;
1958    }
1959    return;
1960
1961onError:
1962    Py_DECREF(*exceptionObject);
1963    *exceptionObject = NULL;
1964}
1965
1966/* error handling callback helper:
1967   build arguments, call the callback and check the arguments,
1968   if no exception occurred, copy the replacement to the output
1969   and adjust various state variables.
1970   return 0 on success, -1 on error
1971*/
1972
1973static
1974int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1975                                     const char *encoding, const char *reason,
1976                                     const char **input, const char **inend, Py_ssize_t *startinpos,
1977                                     Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1978                                     PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
1979{
1980    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
1981
1982    PyObject *restuple = NULL;
1983    PyObject *repunicode = NULL;
1984    Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1985    Py_ssize_t insize;
1986    Py_ssize_t requiredsize;
1987    Py_ssize_t newpos;
1988    Py_UNICODE *repptr;
1989    PyObject *inputobj = NULL;
1990    Py_ssize_t repsize;
1991    int res = -1;
1992
1993    if (*errorHandler == NULL) {
1994        *errorHandler = PyCodec_LookupError(errors);
1995        if (*errorHandler == NULL)
1996            goto onError;
1997    }
1998
1999    make_decode_exception(exceptionObject,
2000        encoding,
2001        *input, *inend - *input,
2002        *startinpos, *endinpos,
2003        reason);
2004    if (*exceptionObject == NULL)
2005        goto onError;
2006
2007    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2008    if (restuple == NULL)
2009        goto onError;
2010    if (!PyTuple_Check(restuple)) {
2011        PyErr_SetString(PyExc_TypeError, &argparse[4]);
2012        goto onError;
2013    }
2014    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
2015        goto onError;
2016
2017    /* Copy back the bytes variables, which might have been modified by the
2018       callback */
2019    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2020    if (!inputobj)
2021        goto onError;
2022    if (!PyBytes_Check(inputobj)) {
2023        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
2024    }
2025    *input = PyBytes_AS_STRING(inputobj);
2026    insize = PyBytes_GET_SIZE(inputobj);
2027    *inend = *input + insize;
2028    /* we can DECREF safely, as the exception has another reference,
2029       so the object won't go away. */
2030    Py_DECREF(inputobj);
2031
2032    if (newpos<0)
2033        newpos = insize+newpos;
2034    if (newpos<0 || newpos>insize) {
2035        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2036        goto onError;
2037    }
2038
2039    /* need more space? (at least enough for what we
2040       have+the replacement+the rest of the string (starting
2041       at the new input position), so we won't have to check space
2042       when there are no errors in the rest of the string) */
2043    repptr = PyUnicode_AS_UNICODE(repunicode);
2044    repsize = PyUnicode_GET_SIZE(repunicode);
2045    requiredsize = *outpos + repsize + insize-newpos;
2046    if (requiredsize > outsize) {
2047        if (requiredsize<2*outsize)
2048            requiredsize = 2*outsize;
2049        if (_PyUnicode_Resize(output, requiredsize) < 0)
2050            goto onError;
2051        *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
2052    }
2053    *endinpos = newpos;
2054    *inptr = *input + newpos;
2055    Py_UNICODE_COPY(*outptr, repptr, repsize);
2056    *outptr += repsize;
2057    *outpos += repsize;
2058
2059    /* we made it! */
2060    res = 0;
2061
2062  onError:
2063    Py_XDECREF(restuple);
2064    return res;
2065}
2066
2067/* --- UTF-7 Codec -------------------------------------------------------- */
2068
2069/* See RFC2152 for details.  We encode conservatively and decode liberally. */
2070
/* Base-64 helper macros for the UTF-7 codec.  Every macro may evaluate
   its argument more than once, so only pass side-effect-free
   expressions. */

/* IS_BASE64(c): true iff c is one of the 64 base-64 characters
   (A-Z, a-z, 0-9, '+' and '/'). */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ('0' <= (c) && (c) <= '9') ||          \
     ('A' <= (c) && (c) <= 'Z') ||          \
     ('a' <= (c) && (c) <= 'z'))

/* FROM_BASE64(c): the 6-bit value of the base-64 character c.  The
   result is only meaningful if IS_BASE64(c) holds; anything that matches
   none of the tests falls through to 63, the value of '/'. */

#define FROM_BASE64(c)                                      \
    ((c) == '+' ? 62 :                                      \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :          \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :          \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :               \
     63)

/* TO_BASE64(n): the base-64 character for the low 6 bits of n. */

#define TO_BASE64(n)                        \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"           \
     "abcdefghijklmnopqrstuvwxyz"           \
     "0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT(c): true iff a byte c found in UTF-7 input outside a
 * base-64 section decodes as itself.  Decoding is permissive: every
 * value up to 127 qualifies, except '+', which opens a base-64
 * section. */

#define DECODE_DIRECT(c)                                \
    ((c) != '+' && (c) <= 127)
2101
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is read-only.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
2135
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        -- the character; only values 1..127 can ever be direct
 * directO  -- non-zero if "Set O" characters are written directly
 * directWS -- non-zero if whitespace is written directly
 *
 * All three arguments are parenthesized in the expansion so that
 * compound expressions (e.g. `a || b`) can be passed safely; c is
 * evaluated several times, so it must be free of side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
2147
2148PyObject *PyUnicode_DecodeUTF7(const char *s,
2149                               Py_ssize_t size,
2150                               const char *errors)
2151{
2152    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
2153}
2154
2155/* The decoder.  The only state we preserve is our read position,
2156 * i.e. how many characters we have consumed.  So if we end in the
2157 * middle of a shift sequence we have to back off the read position
2158 * and the output to the beginning of the sequence, otherwise we lose
2159 * all the shift state (seen bits, number of bits seen, high
2160 * surrogate). */
2161
2162PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
2163                                       Py_ssize_t size,
2164                                       const char *errors,
2165                                       Py_ssize_t *consumed)
2166{
2167    const char *starts = s;
2168    Py_ssize_t startinpos;
2169    Py_ssize_t endinpos;
2170    Py_ssize_t outpos;
2171    const char *e;
2172    PyUnicodeObject *unicode;
2173    Py_UNICODE *p;
2174    const char *errmsg = "";
2175    int inShift = 0;
2176    Py_UNICODE *shiftOutStart;
2177    unsigned int base64bits = 0;
2178    unsigned long base64buffer = 0;
2179    Py_UNICODE surrogate = 0;
2180    PyObject *errorHandler = NULL;
2181    PyObject *exc = NULL;
2182
2183    unicode = _PyUnicode_New(size);
2184    if (!unicode)
2185        return NULL;
2186    if (size == 0) {
2187        if (consumed)
2188            *consumed = 0;
2189        return (PyObject *)unicode;
2190    }
2191
2192    p = unicode->str;
2193    shiftOutStart = p;
2194    e = s + size;
2195
2196    while (s < e) {
2197        Py_UNICODE ch;
2198      restart:
2199        ch = (unsigned char) *s;
2200
2201        if (inShift) { /* in a base-64 section */
2202            if (IS_BASE64(ch)) { /* consume a base-64 character */
2203                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2204                base64bits += 6;
2205                s++;
2206                if (base64bits >= 16) {
2207                    /* we have enough bits for a UTF-16 value */
2208                    Py_UNICODE outCh = (Py_UNICODE)
2209                                       (base64buffer >> (base64bits-16));
2210                    base64bits -= 16;
2211                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2212                    if (surrogate) {
2213                        /* expecting a second surrogate */
2214                        if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2215#ifdef Py_UNICODE_WIDE
2216                            *p++ = (((surrogate & 0x3FF)<<10)
2217                                    | (outCh & 0x3FF)) + 0x10000;
2218#else
2219                            *p++ = surrogate;
2220                            *p++ = outCh;
2221#endif
2222                            surrogate = 0;
2223                        }
2224                        else {
2225                            surrogate = 0;
2226                            errmsg = "second surrogate missing";
2227                            goto utf7Error;
2228                        }
2229                    }
2230                    else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2231                        /* first surrogate */
2232                        surrogate = outCh;
2233                    }
2234                    else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2235                        errmsg = "unexpected second surrogate";
2236                        goto utf7Error;
2237                    }
2238                    else {
2239                        *p++ = outCh;
2240                    }
2241                }
2242            }
2243            else { /* now leaving a base-64 section */
2244                inShift = 0;
2245                s++;
2246                if (surrogate) {
2247                    errmsg = "second surrogate missing at end of shift sequence";
2248                    goto utf7Error;
2249                }
2250                if (base64bits > 0) { /* left-over bits */
2251                    if (base64bits >= 6) {
2252                        /* We've seen at least one base-64 character */
2253                        errmsg = "partial character in shift sequence";
2254                        goto utf7Error;
2255                    }
2256                    else {
2257                        /* Some bits remain; they should be zero */
2258                        if (base64buffer != 0) {
2259                            errmsg = "non-zero padding bits in shift sequence";
2260                            goto utf7Error;
2261                        }
2262                    }
2263                }
2264                if (ch != '-') {
2265                    /* '-' is absorbed; other terminating
2266                       characters are preserved */
2267                    *p++ = ch;
2268                }
2269            }
2270        }
2271        else if ( ch == '+' ) {
2272            startinpos = s-starts;
2273            s++; /* consume '+' */
2274            if (s < e && *s == '-') { /* '+-' encodes '+' */
2275                s++;
2276                *p++ = '+';
2277            }
2278            else { /* begin base64-encoded section */
2279                inShift = 1;
2280                shiftOutStart = p;
2281                base64bits = 0;
2282            }
2283        }
2284        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
2285            *p++ = ch;
2286            s++;
2287        }
2288        else {
2289            startinpos = s-starts;
2290            s++;
2291            errmsg = "unexpected special character";
2292            goto utf7Error;
2293        }
2294        continue;
2295utf7Error:
2296        outpos = p-PyUnicode_AS_UNICODE(unicode);
2297        endinpos = s-starts;
2298        if (unicode_decode_call_errorhandler(
2299                errors, &errorHandler,
2300                "utf7", errmsg,
2301                &starts, &e, &startinpos, &endinpos, &exc, &s,
2302                &unicode, &outpos, &p))
2303            goto onError;
2304    }
2305
2306    /* end of string */
2307
2308    if (inShift && !consumed) { /* in shift sequence, no more to follow */
2309        /* if we're in an inconsistent state, that's an error */
2310        if (surrogate ||
2311                (base64bits >= 6) ||
2312                (base64bits > 0 && base64buffer != 0)) {
2313            outpos = p-PyUnicode_AS_UNICODE(unicode);
2314            endinpos = size;
2315            if (unicode_decode_call_errorhandler(
2316                    errors, &errorHandler,
2317                    "utf7", "unterminated shift sequence",
2318                    &starts, &e, &startinpos, &endinpos, &exc, &s,
2319                    &unicode, &outpos, &p))
2320                goto onError;
2321            if (s < e)
2322                goto restart;
2323        }
2324    }
2325
2326    /* return state */
2327    if (consumed) {
2328        if (inShift) {
2329            p = shiftOutStart; /* back off output */
2330            *consumed = startinpos;
2331        }
2332        else {
2333            *consumed = s-starts;
2334        }
2335    }
2336
2337    if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
2338        goto onError;
2339
2340    Py_XDECREF(errorHandler);
2341    Py_XDECREF(exc);
2342    return (PyObject *)unicode;
2343
2344  onError:
2345    Py_XDECREF(errorHandler);
2346    Py_XDECREF(exc);
2347    Py_DECREF(unicode);
2348    return NULL;
2349}
2350
2351
2352PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
2353                               Py_ssize_t size,
2354                               int base64SetO,
2355                               int base64WhiteSpace,
2356                               const char *errors)
2357{
2358    PyObject *v;
2359    /* It might be possible to tighten this worst case */
2360    Py_ssize_t allocated = 8 * size;
2361    int inShift = 0;
2362    Py_ssize_t i = 0;
2363    unsigned int base64bits = 0;
2364    unsigned long base64buffer = 0;
2365    char * out;
2366    char * start;
2367
2368    if (size == 0)
2369        return PyBytes_FromStringAndSize(NULL, 0);
2370
2371    if (allocated / 8 != size)
2372        return PyErr_NoMemory();
2373
2374    v = PyBytes_FromStringAndSize(NULL, allocated);
2375    if (v == NULL)
2376        return NULL;
2377
2378    start = out = PyBytes_AS_STRING(v);
2379    for (;i < size; ++i) {
2380        Py_UNICODE ch = s[i];
2381
2382        if (inShift) {
2383            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2384                /* shifting out */
2385                if (base64bits) { /* output remaining bits */
2386                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
2387                    base64buffer = 0;
2388                    base64bits = 0;
2389                }
2390                inShift = 0;
2391                /* Characters not in the BASE64 set implicitly unshift the sequence
2392                   so no '-' is required, except if the character is itself a '-' */
2393                if (IS_BASE64(ch) || ch == '-') {
2394                    *out++ = '-';
2395                }
2396                *out++ = (char) ch;
2397            }
2398            else {
2399                goto encode_char;
2400            }
2401        }
2402        else { /* not in a shift sequence */
2403            if (ch == '+') {
2404                *out++ = '+';
2405                        *out++ = '-';
2406            }
2407            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2408                *out++ = (char) ch;
2409            }
2410            else {
2411                *out++ = '+';
2412                inShift = 1;
2413                goto encode_char;
2414            }
2415        }
2416        continue;
2417encode_char:
2418#ifdef Py_UNICODE_WIDE
2419        if (ch >= 0x10000) {
2420            /* code first surrogate */
2421            base64bits += 16;
2422            base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2423            while (base64bits >= 6) {
2424                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2425                base64bits -= 6;
2426            }
2427            /* prepare second surrogate */
2428            ch =  0xDC00 | ((ch-0x10000) & 0x3FF);
2429        }
2430#endif
2431        base64bits += 16;
2432        base64buffer = (base64buffer << 16) | ch;
2433        while (base64bits >= 6) {
2434            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2435            base64bits -= 6;
2436        }
2437    }
2438    if (base64bits)
2439        *out++= TO_BASE64(base64buffer << (6-base64bits) );
2440    if (inShift)
2441        *out++ = '-';
2442    if (_PyBytes_Resize(&v, out - start) < 0)
2443        return NULL;
2444    return v;
2445}
2446
2447#undef IS_BASE64
2448#undef FROM_BASE64
2449#undef TO_BASE64
2450#undef DECODE_DIRECT
2451#undef ENCODE_DIRECT
2452
2453/* --- UTF-8 Codec -------------------------------------------------------- */
2454
/* Read-only lookup table, indexed by the first byte of a UTF-8
   sequence. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
2476
2477PyObject *PyUnicode_DecodeUTF8(const char *s,
2478                               Py_ssize_t size,
2479                               const char *errors)
2480{
2481    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2482}
2483
2484/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2485#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2486
2487/* Mask to quickly check whether a C 'long' contains a
2488   non-ASCII, UTF8-encoded char. */
2489#if (SIZEOF_LONG == 8)
2490# define ASCII_CHAR_MASK 0x8080808080808080L
2491#elif (SIZEOF_LONG == 4)
2492# define ASCII_CHAR_MASK 0x80808080L
2493#else
2494# error C 'long' size should be either 4 or 8!
2495#endif
2496
2497PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
2498                                       Py_ssize_t size,
2499                                       const char *errors,
2500                                       Py_ssize_t *consumed)
2501{
2502    const char *starts = s;
2503    int n;
2504    int k;
2505    Py_ssize_t startinpos;
2506    Py_ssize_t endinpos;
2507    Py_ssize_t outpos;
2508    const char *e, *aligned_end;
2509    PyUnicodeObject *unicode;
2510    Py_UNICODE *p;
2511    const char *errmsg = "";
2512    PyObject *errorHandler = NULL;
2513    PyObject *exc = NULL;
2514
2515    /* Note: size will always be longer than the resulting Unicode
2516       character count */
2517    unicode = _PyUnicode_New(size);
2518    if (!unicode)
2519        return NULL;
2520    if (size == 0) {
2521        if (consumed)
2522            *consumed = 0;
2523        return (PyObject *)unicode;
2524    }
2525
2526    /* Unpack UTF-8 encoded data */
2527    p = unicode->str;
2528    e = s + size;
2529    aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
2530
2531    while (s < e) {
2532        Py_UCS4 ch = (unsigned char)*s;
2533
2534        if (ch < 0x80) {
2535            /* Fast path for runs of ASCII characters. Given that common UTF-8
2536               input will consist of an overwhelming majority of ASCII
2537               characters, we try to optimize for this case by checking
2538               as many characters as a C 'long' can contain.
2539               First, check if we can do an aligned read, as most CPUs have
2540               a penalty for unaligned reads.
2541            */
2542            if (!((size_t) s & LONG_PTR_MASK)) {
2543                /* Help register allocation */
2544                register const char *_s = s;
2545                register Py_UNICODE *_p = p;
2546                while (_s < aligned_end) {
2547                    /* Read a whole long at a time (either 4 or 8 bytes),
2548                       and do a fast unrolled copy if it only contains ASCII
2549                       characters. */
2550                    unsigned long data = *(unsigned long *) _s;
2551                    if (data & ASCII_CHAR_MASK)
2552                        break;
2553                    _p[0] = (unsigned char) _s[0];
2554                    _p[1] = (unsigned char) _s[1];
2555                    _p[2] = (unsigned char) _s[2];
2556                    _p[3] = (unsigned char) _s[3];
2557#if (SIZEOF_LONG == 8)
2558                    _p[4] = (unsigned char) _s[4];
2559                    _p[5] = (unsigned char) _s[5];
2560                    _p[6] = (unsigned char) _s[6];
2561                    _p[7] = (unsigned char) _s[7];
2562#endif
2563                    _s += SIZEOF_LONG;
2564                    _p += SIZEOF_LONG;
2565                }
2566                s = _s;
2567                p = _p;
2568                if (s == e)
2569                    break;
2570                ch = (unsigned char)*s;
2571            }
2572        }
2573
2574        if (ch < 0x80) {
2575            *p++ = (Py_UNICODE)ch;
2576            s++;
2577            continue;
2578        }
2579
2580        n = utf8_code_length[ch];
2581
2582        if (s + n > e) {
2583            if (consumed)
2584                break;
2585            else {
2586                errmsg = "unexpected end of data";
2587                startinpos = s-starts;
2588                endinpos = startinpos+1;
2589                for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2590                    endinpos++;
2591                goto utf8Error;
2592            }
2593        }
2594
2595        switch (n) {
2596
2597        case 0:
2598            errmsg = "invalid start byte";
2599            startinpos = s-starts;
2600            endinpos = startinpos+1;
2601            goto utf8Error;
2602
2603        case 1:
2604            errmsg = "internal error";
2605            startinpos = s-starts;
2606            endinpos = startinpos+1;
2607            goto utf8Error;
2608
2609        case 2:
2610            if ((s[1] & 0xc0) != 0x80) {
2611                errmsg = "invalid continuation byte";
2612                startinpos = s-starts;
2613                endinpos = startinpos + 1;
2614                goto utf8Error;
2615            }
2616            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
2617            assert ((ch > 0x007F) && (ch <= 0x07FF));
2618            *p++ = (Py_UNICODE)ch;
2619            break;
2620
2621        case 3:
2622            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2623               will result in surrogates in range d800-dfff. Surrogates are
2624               not valid UTF-8 so they are rejected.
2625               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2626               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
2627            if ((s[1] & 0xc0) != 0x80 ||
2628                (s[2] & 0xc0) != 0x80 ||
2629                ((unsigned char)s[0] == 0xE0 &&
2630                 (unsigned char)s[1] < 0xA0) ||
2631                ((unsigned char)s[0] == 0xED &&
2632                 (unsigned char)s[1] > 0x9F)) {
2633                errmsg = "invalid continuation byte";
2634                startinpos = s-starts;
2635                endinpos = startinpos + 1;
2636
2637                /* if s[1] first two bits are 1 and 0, then the invalid
2638                   continuation byte is s[2], so increment endinpos by 1,
2639                   if not, s[1] is invalid and endinpos doesn't need to
2640                   be incremented. */
2641                if ((s[1] & 0xC0) == 0x80)
2642                    endinpos++;
2643                goto utf8Error;
2644            }
2645            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
2646            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2647            *p++ = (Py_UNICODE)ch;
2648            break;
2649
2650        case 4:
2651            if ((s[1] & 0xc0) != 0x80 ||
2652                (s[2] & 0xc0) != 0x80 ||
2653                (s[3] & 0xc0) != 0x80 ||
2654                ((unsigned char)s[0] == 0xF0 &&
2655                 (unsigned char)s[1] < 0x90) ||
2656                ((unsigned char)s[0] == 0xF4 &&
2657                 (unsigned char)s[1] > 0x8F)) {
2658                errmsg = "invalid continuation byte";
2659                startinpos = s-starts;
2660                endinpos = startinpos + 1;
2661                if ((s[1] & 0xC0) == 0x80) {
2662                    endinpos++;
2663                    if ((s[2] & 0xC0) == 0x80)
2664                        endinpos++;
2665                }
2666                goto utf8Error;
2667            }
2668            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2669                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2670            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2671
2672#ifdef Py_UNICODE_WIDE
2673            *p++ = (Py_UNICODE)ch;
2674#else
2675            /*  compute and append the two surrogates: */
2676
2677            /*  translate from 10000..10FFFF to 0..FFFF */
2678            ch -= 0x10000;
2679
2680            /*  high surrogate = top 10 bits added to D800 */
2681            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
2682
2683            /*  low surrogate = bottom 10 bits added to DC00 */
2684            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
2685#endif
2686            break;
2687        }
2688        s += n;
2689        continue;
2690
2691      utf8Error:
2692        outpos = p-PyUnicode_AS_UNICODE(unicode);
2693        if (unicode_decode_call_errorhandler(
2694                errors, &errorHandler,
2695                "utf8", errmsg,
2696                &starts, &e, &startinpos, &endinpos, &exc, &s,
2697                &unicode, &outpos, &p))
2698            goto onError;
2699        aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
2700    }
2701    if (consumed)
2702        *consumed = s-starts;
2703
2704    /* Adjust length */
2705    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2706        goto onError;
2707
2708    Py_XDECREF(errorHandler);
2709    Py_XDECREF(exc);
2710    return (PyObject *)unicode;
2711
2712  onError:
2713    Py_XDECREF(errorHandler);
2714    Py_XDECREF(exc);
2715    Py_DECREF(unicode);
2716    return NULL;
2717}
2718
2719#undef ASCII_CHAR_MASK
2720
2721#ifdef __APPLE__
2722
2723/* Simplified UTF-8 decoder using surrogateescape error handler,
2724   used to decode the command line arguments on Mac OS X. */
2725
2726wchar_t*
2727_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
2728{
2729    int n;
2730    const char *e;
2731    wchar_t *unicode, *p;
2732
2733    /* Note: size will always be longer than the resulting Unicode
2734       character count */
2735    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
2736        PyErr_NoMemory();
2737        return NULL;
2738    }
2739    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
2740    if (!unicode)
2741        return NULL;
2742
2743    /* Unpack UTF-8 encoded data */
2744    p = unicode;
2745    e = s + size;
2746    while (s < e) {
2747        Py_UCS4 ch = (unsigned char)*s;
2748
2749        if (ch < 0x80) {
2750            *p++ = (wchar_t)ch;
2751            s++;
2752            continue;
2753        }
2754
2755        n = utf8_code_length[ch];
2756        if (s + n > e) {
2757            goto surrogateescape;
2758        }
2759
2760        switch (n) {
2761        case 0:
2762        case 1:
2763            goto surrogateescape;
2764
2765        case 2:
2766            if ((s[1] & 0xc0) != 0x80)
2767                goto surrogateescape;
2768            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
2769            assert ((ch > 0x007F) && (ch <= 0x07FF));
2770            *p++ = (wchar_t)ch;
2771            break;
2772
2773        case 3:
2774            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2775               will result in surrogates in range d800-dfff. Surrogates are
2776               not valid UTF-8 so they are rejected.
2777               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2778               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
2779            if ((s[1] & 0xc0) != 0x80 ||
2780                (s[2] & 0xc0) != 0x80 ||
2781                ((unsigned char)s[0] == 0xE0 &&
2782                 (unsigned char)s[1] < 0xA0) ||
2783                ((unsigned char)s[0] == 0xED &&
2784                 (unsigned char)s[1] > 0x9F)) {
2785
2786                goto surrogateescape;
2787            }
2788            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
2789            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2790            *p++ = (Py_UNICODE)ch;
2791            break;
2792
2793        case 4:
2794            if ((s[1] & 0xc0) != 0x80 ||
2795                (s[2] & 0xc0) != 0x80 ||
2796                (s[3] & 0xc0) != 0x80 ||
2797                ((unsigned char)s[0] == 0xF0 &&
2798                 (unsigned char)s[1] < 0x90) ||
2799                ((unsigned char)s[0] == 0xF4 &&
2800                 (unsigned char)s[1] > 0x8F)) {
2801                goto surrogateescape;
2802            }
2803            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2804                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2805            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2806
2807#if SIZEOF_WCHAR_T == 4
2808            *p++ = (wchar_t)ch;
2809#else
2810            /*  compute and append the two surrogates: */
2811
2812            /*  translate from 10000..10FFFF to 0..FFFF */
2813            ch -= 0x10000;
2814
2815            /*  high surrogate = top 10 bits added to D800 */
2816            *p++ = (wchar_t)(0xD800 + (ch >> 10));
2817
2818            /*  low surrogate = bottom 10 bits added to DC00 */
2819            *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
2820#endif
2821            break;
2822        }
2823        s += n;
2824        continue;
2825
2826      surrogateescape:
2827        *p++ = 0xDC00 + ch;
2828        s++;
2829    }
2830    *p = L'\0';
2831    return unicode;
2832}
2833
2834#endif /* __APPLE__ */
2835
2836/* Allocation strategy:  if the string is short, convert into a stack buffer
2837   and allocate exactly as much space needed at the end.  Else allocate the
2838   maximum possible needed (4 result bytes per Unicode character), and return
2839   the excess memory at the end.
2840*/
2841PyObject *
2842PyUnicode_EncodeUTF8(const Py_UNICODE *s,
2843                     Py_ssize_t size,
2844                     const char *errors)
2845{
2846#define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
2847
2848    Py_ssize_t i;                /* index into s of next input byte */
2849    PyObject *result;            /* result string object */
2850    char *p;                     /* next free byte in output buffer */
2851    Py_ssize_t nallocated;      /* number of result bytes allocated */
2852    Py_ssize_t nneeded;            /* number of result bytes needed */
2853    char stackbuf[MAX_SHORT_UNICHARS * 4];
2854    PyObject *errorHandler = NULL;
2855    PyObject *exc = NULL;
2856
2857    assert(s != NULL);
2858    assert(size >= 0);
2859
2860    if (size <= MAX_SHORT_UNICHARS) {
2861        /* Write into the stack buffer; nallocated can't overflow.
2862         * At the end, we'll allocate exactly as much heap space as it
2863         * turns out we need.
2864         */
2865        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2866        result = NULL;   /* will allocate after we're done */
2867        p = stackbuf;
2868    }
2869    else {
2870        /* Overallocate on the heap, and give the excess back at the end. */
2871        nallocated = size * 4;
2872        if (nallocated / 4 != size)  /* overflow! */
2873            return PyErr_NoMemory();
2874        result = PyBytes_FromStringAndSize(NULL, nallocated);
2875        if (result == NULL)
2876            return NULL;
2877        p = PyBytes_AS_STRING(result);
2878    }
2879
2880    for (i = 0; i < size;) {
2881        Py_UCS4 ch = s[i++];
2882
2883        if (ch < 0x80)
2884            /* Encode ASCII */
2885            *p++ = (char) ch;
2886
2887        else if (ch < 0x0800) {
2888            /* Encode Latin-1 */
2889            *p++ = (char)(0xc0 | (ch >> 6));
2890            *p++ = (char)(0x80 | (ch & 0x3f));
2891        } else if (0xD800 <= ch && ch <= 0xDFFF) {
2892#ifndef Py_UNICODE_WIDE
2893            /* Special case: check for high and low surrogate */
2894            if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2895                Py_UCS4 ch2 = s[i];
2896                /* Combine the two surrogates to form a UCS4 value */
2897                ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2898                i++;
2899
2900                /* Encode UCS4 Unicode ordinals */
2901                *p++ = (char)(0xf0 | (ch >> 18));
2902                *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2903                *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2904                *p++ = (char)(0x80 | (ch & 0x3f));
2905            } else {
2906#endif
2907                Py_ssize_t newpos;
2908                PyObject *rep;
2909                Py_ssize_t repsize, k;
2910                rep = unicode_encode_call_errorhandler
2911                    (errors, &errorHandler, "utf-8", "surrogates not allowed",
2912                     s, size, &exc, i-1, i, &newpos);
2913                if (!rep)
2914                    goto error;
2915
2916                if (PyBytes_Check(rep))
2917                    repsize = PyBytes_GET_SIZE(rep);
2918                else
2919                    repsize = PyUnicode_GET_SIZE(rep);
2920
2921                if (repsize > 4) {
2922                    Py_ssize_t offset;
2923
2924                    if (result == NULL)
2925                        offset = p - stackbuf;
2926                    else
2927                        offset = p - PyBytes_AS_STRING(result);
2928
2929                    if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2930                        /* integer overflow */
2931                        PyErr_NoMemory();
2932                        goto error;
2933                    }
2934                    nallocated += repsize - 4;
2935                    if (result != NULL) {
2936                        if (_PyBytes_Resize(&result, nallocated) < 0)
2937                            goto error;
2938                    } else {
2939                        result = PyBytes_FromStringAndSize(NULL, nallocated);
2940                        if (result == NULL)
2941                            goto error;
2942                        Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
2943                    }
2944                    p = PyBytes_AS_STRING(result) + offset;
2945                }
2946
2947                if (PyBytes_Check(rep)) {
2948                    char *prep = PyBytes_AS_STRING(rep);
2949                    for(k = repsize; k > 0; k--)
2950                        *p++ = *prep++;
2951                } else /* rep is unicode */ {
2952                    Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
2953                    Py_UNICODE c;
2954
2955                    for(k=0; k<repsize; k++) {
2956                        c = prep[k];
2957                        if (0x80 <= c) {
2958                            raise_encode_exception(&exc, "utf-8", s, size,
2959                                                   i-1, i, "surrogates not allowed");
2960                            goto error;
2961                        }
2962                        *p++ = (char)prep[k];
2963                    }
2964                }
2965                Py_DECREF(rep);
2966#ifndef Py_UNICODE_WIDE
2967            }
2968#endif
2969        } else if (ch < 0x10000) {
2970            *p++ = (char)(0xe0 | (ch >> 12));
2971            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2972            *p++ = (char)(0x80 | (ch & 0x3f));
2973        } else /* ch >= 0x10000 */ {
2974            /* Encode UCS4 Unicode ordinals */
2975            *p++ = (char)(0xf0 | (ch >> 18));
2976            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2977            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2978            *p++ = (char)(0x80 | (ch & 0x3f));
2979        }
2980    }
2981
2982    if (result == NULL) {
2983        /* This was stack allocated. */
2984        nneeded = p - stackbuf;
2985        assert(nneeded <= nallocated);
2986        result = PyBytes_FromStringAndSize(stackbuf, nneeded);
2987    }
2988    else {
2989        /* Cut back to size actually needed. */
2990        nneeded = p - PyBytes_AS_STRING(result);
2991        assert(nneeded <= nallocated);
2992        _PyBytes_Resize(&result, nneeded);
2993    }
2994    Py_XDECREF(errorHandler);
2995    Py_XDECREF(exc);
2996    return result;
2997 error:
2998    Py_XDECREF(errorHandler);
2999    Py_XDECREF(exc);
3000    Py_XDECREF(result);
3001    return NULL;
3002
3003#undef MAX_SHORT_UNICHARS
3004}
3005
3006PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
3007{
3008    if (!PyUnicode_Check(unicode)) {
3009        PyErr_BadArgument();
3010        return NULL;
3011    }
3012    return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
3013                                PyUnicode_GET_SIZE(unicode),
3014                                NULL);
3015}
3016
3017/* --- UTF-32 Codec ------------------------------------------------------- */
3018
3019PyObject *
3020PyUnicode_DecodeUTF32(const char *s,
3021                      Py_ssize_t size,
3022                      const char *errors,
3023                      int *byteorder)
3024{
3025    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
3026}
3027
3028PyObject *
3029PyUnicode_DecodeUTF32Stateful(const char *s,
3030                              Py_ssize_t size,
3031                              const char *errors,
3032                              int *byteorder,
3033                              Py_ssize_t *consumed)
3034{
3035    const char *starts = s;
3036    Py_ssize_t startinpos;
3037    Py_ssize_t endinpos;
3038    Py_ssize_t outpos;
3039    PyUnicodeObject *unicode;
3040    Py_UNICODE *p;
3041#ifndef Py_UNICODE_WIDE
3042    int pairs = 0;
3043    const unsigned char *qq;
3044#else
3045    const int pairs = 0;
3046#endif
3047    const unsigned char *q, *e;
3048    int bo = 0;       /* assume native ordering by default */
3049    const char *errmsg = "";
3050    /* Offsets from q for retrieving bytes in the right order. */
3051#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3052    int iorder[] = {0, 1, 2, 3};
3053#else
3054    int iorder[] = {3, 2, 1, 0};
3055#endif
3056    PyObject *errorHandler = NULL;
3057    PyObject *exc = NULL;
3058
3059    q = (unsigned char *)s;
3060    e = q + size;
3061
3062    if (byteorder)
3063        bo = *byteorder;
3064
3065    /* Check for BOM marks (U+FEFF) in the input and adjust current
3066       byte order setting accordingly. In native mode, the leading BOM
3067       mark is skipped, in all other modes, it is copied to the output
3068       stream as-is (giving a ZWNBSP character). */
3069    if (bo == 0) {
3070        if (size >= 4) {
3071            const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
3072                (q[iorder[1]] << 8) | q[iorder[0]];
3073#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3074            if (bom == 0x0000FEFF) {
3075                q += 4;
3076                bo = -1;
3077            }
3078            else if (bom == 0xFFFE0000) {
3079                q += 4;
3080                bo = 1;
3081            }
3082#else
3083            if (bom == 0x0000FEFF) {
3084                q += 4;
3085                bo = 1;
3086            }
3087            else if (bom == 0xFFFE0000) {
3088                q += 4;
3089                bo = -1;
3090            }
3091#endif
3092        }
3093    }
3094
3095    if (bo == -1) {
3096        /* force LE */
3097        iorder[0] = 0;
3098        iorder[1] = 1;
3099        iorder[2] = 2;
3100        iorder[3] = 3;
3101    }
3102    else if (bo == 1) {
3103        /* force BE */
3104        iorder[0] = 3;
3105        iorder[1] = 2;
3106        iorder[2] = 1;
3107        iorder[3] = 0;
3108    }
3109
3110    /* On narrow builds we split characters outside the BMP into two
3111       codepoints => count how much extra space we need. */
3112#ifndef Py_UNICODE_WIDE
3113    for (qq = q; qq < e; qq += 4)
3114        if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
3115            pairs++;
3116#endif
3117
3118    /* This might be one to much, because of a BOM */
3119    unicode = _PyUnicode_New((size+3)/4+pairs);
3120    if (!unicode)
3121        return NULL;
3122    if (size == 0)
3123        return (PyObject *)unicode;
3124
3125    /* Unpack UTF-32 encoded data */
3126    p = unicode->str;
3127
3128    while (q < e) {
3129        Py_UCS4 ch;
3130        /* remaining bytes at the end? (size should be divisible by 4) */
3131        if (e-q<4) {
3132            if (consumed)
3133                break;
3134            errmsg = "truncated data";
3135            startinpos = ((const char *)q)-starts;
3136            endinpos = ((const char *)e)-starts;
3137            goto utf32Error;
3138            /* The remaining input chars are ignored if the callback
3139               chooses to skip the input */
3140        }
3141        ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
3142            (q[iorder[1]] << 8) | q[iorder[0]];
3143
3144        if (ch >= 0x110000)
3145        {
3146            errmsg = "codepoint not in range(0x110000)";
3147            startinpos = ((const char *)q)-starts;
3148            endinpos = startinpos+4;
3149            goto utf32Error;
3150        }
3151#ifndef Py_UNICODE_WIDE
3152        if (ch >= 0x10000)
3153        {
3154            *p++ = 0xD800 | ((ch-0x10000) >> 10);
3155            *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
3156        }
3157        else
3158#endif
3159            *p++ = ch;
3160        q += 4;
3161        continue;
3162      utf32Error:
3163        outpos = p-PyUnicode_AS_UNICODE(unicode);
3164        if (unicode_decode_call_errorhandler(
3165                errors, &errorHandler,
3166                "utf32", errmsg,
3167                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
3168                &unicode, &outpos, &p))
3169            goto onError;
3170    }
3171
3172    if (byteorder)
3173        *byteorder = bo;
3174
3175    if (consumed)
3176        *consumed = (const char *)q-starts;
3177
3178    /* Adjust length */
3179    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
3180        goto onError;
3181
3182    Py_XDECREF(errorHandler);
3183    Py_XDECREF(exc);
3184    return (PyObject *)unicode;
3185
3186  onError:
3187    Py_DECREF(unicode);
3188    Py_XDECREF(errorHandler);
3189    Py_XDECREF(exc);
3190    return NULL;
3191}
3192
3193PyObject *
3194PyUnicode_EncodeUTF32(const Py_UNICODE *s,
3195                      Py_ssize_t size,
3196                      const char *errors,
3197                      int byteorder)
3198{
3199    PyObject *v;
3200    unsigned char *p;
3201    Py_ssize_t nsize, bytesize;
3202#ifndef Py_UNICODE_WIDE
3203    Py_ssize_t i, pairs;
3204#else
3205    const int pairs = 0;
3206#endif
3207    /* Offsets from p for storing byte pairs in the right order. */
3208#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3209    int iorder[] = {0, 1, 2, 3};
3210#else
3211    int iorder[] = {3, 2, 1, 0};
3212#endif
3213
3214#define STORECHAR(CH)                           \
3215    do {                                        \
3216        p[iorder[3]] = ((CH) >> 24) & 0xff;     \
3217        p[iorder[2]] = ((CH) >> 16) & 0xff;     \
3218        p[iorder[1]] = ((CH) >> 8) & 0xff;      \
3219        p[iorder[0]] = (CH) & 0xff;             \
3220        p += 4;                                 \
3221    } while(0)
3222
3223    /* In narrow builds we can output surrogate pairs as one codepoint,
3224       so we need less space. */
3225#ifndef Py_UNICODE_WIDE
3226    for (i = pairs = 0; i < size-1; i++)
3227        if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
3228            0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
3229            pairs++;
3230#endif
3231    nsize = (size - pairs + (byteorder == 0));
3232    bytesize = nsize * 4;
3233    if (bytesize / 4 != nsize)
3234        return PyErr_NoMemory();
3235    v = PyBytes_FromStringAndSize(NULL, bytesize);
3236    if (v == NULL)
3237        return NULL;
3238
3239    p = (unsigned char *)PyBytes_AS_STRING(v);
3240    if (byteorder == 0)
3241        STORECHAR(0xFEFF);
3242    if (size == 0)
3243        goto done;
3244
3245    if (byteorder == -1) {
3246        /* force LE */
3247        iorder[0] = 0;
3248        iorder[1] = 1;
3249        iorder[2] = 2;
3250        iorder[3] = 3;
3251    }
3252    else if (byteorder == 1) {
3253        /* force BE */
3254        iorder[0] = 3;
3255        iorder[1] = 2;
3256        iorder[2] = 1;
3257        iorder[3] = 0;
3258    }
3259
3260    while (size-- > 0) {
3261        Py_UCS4 ch = *s++;
3262#ifndef Py_UNICODE_WIDE
3263        if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
3264            Py_UCS4 ch2 = *s;
3265            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3266                ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3267                s++;
3268                size--;
3269            }
3270        }
3271#endif
3272        STORECHAR(ch);
3273    }
3274
3275  done:
3276    return v;
3277#undef STORECHAR
3278}
3279
3280PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
3281{
3282    if (!PyUnicode_Check(unicode)) {
3283        PyErr_BadArgument();
3284        return NULL;
3285    }
3286    return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
3287                                 PyUnicode_GET_SIZE(unicode),
3288                                 NULL,
3289                                 0);
3290}
3291
3292/* --- UTF-16 Codec ------------------------------------------------------- */
3293
3294PyObject *
3295PyUnicode_DecodeUTF16(const char *s,
3296                      Py_ssize_t size,
3297                      const char *errors,
3298                      int *byteorder)
3299{
3300    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3301}
3302
/* Two masks for fast checking of whether a C 'long' may contain
   UTF16-encoded surrogate characters. This is an efficient heuristic,
   assuming that non-surrogate characters with a code point >= 0x8000 are
   rare in most input.
   FAST_CHAR_MASK is used when the input is in native byte ordering,
   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.

   FAST_CHAR_MASK has bit 15 of every 16-bit unit set;
   SWAPPED_FAST_CHAR_MASK has bit 7 of every 16-bit unit set, which is
   where the top bit of each code unit lands when its two bytes are
   swapped.  A word that ANDs to zero with the mask therefore holds only
   code units below 0x8000, none of which can be a surrogate
   (0xD800-0xDFFF).
*/
#if (SIZEOF_LONG == 8)
# define FAST_CHAR_MASK         0x8000800080008000L
# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
#elif (SIZEOF_LONG == 4)
# define FAST_CHAR_MASK         0x80008000L
# define SWAPPED_FAST_CHAR_MASK 0x00800080L
#else
# error C 'long' size should be either 4 or 8!
#endif
3319
3320PyObject *
3321PyUnicode_DecodeUTF16Stateful(const char *s,
3322                              Py_ssize_t size,
3323                              const char *errors,
3324                              int *byteorder,
3325                              Py_ssize_t *consumed)
3326{
3327    const char *starts = s;
3328    Py_ssize_t startinpos;
3329    Py_ssize_t endinpos;
3330    Py_ssize_t outpos;
3331    PyUnicodeObject *unicode;
3332    Py_UNICODE *p;
3333    const unsigned char *q, *e, *aligned_end;
3334    int bo = 0;       /* assume native ordering by default */
3335    int native_ordering = 0;
3336    const char *errmsg = "";
3337    /* Offsets from q for retrieving byte pairs in the right order. */
3338#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3339    int ihi = 1, ilo = 0;
3340#else
3341    int ihi = 0, ilo = 1;
3342#endif
3343    PyObject *errorHandler = NULL;
3344    PyObject *exc = NULL;
3345
3346    /* Note: size will always be longer than the resulting Unicode
3347       character count */
3348    unicode = _PyUnicode_New(size);
3349    if (!unicode)
3350        return NULL;
3351    if (size == 0)
3352        return (PyObject *)unicode;
3353
3354    /* Unpack UTF-16 encoded data */
3355    p = unicode->str;
3356    q = (unsigned char *)s;
3357    e = q + size - 1;
3358
3359    if (byteorder)
3360        bo = *byteorder;
3361
3362    /* Check for BOM marks (U+FEFF) in the input and adjust current
3363       byte order setting accordingly. In native mode, the leading BOM
3364       mark is skipped, in all other modes, it is copied to the output
3365       stream as-is (giving a ZWNBSP character). */
3366    if (bo == 0) {
3367        if (size >= 2) {
3368            const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
3369#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3370            if (bom == 0xFEFF) {
3371                q += 2;
3372                bo = -1;
3373            }
3374            else if (bom == 0xFFFE) {
3375                q += 2;
3376                bo = 1;
3377            }
3378#else
3379            if (bom == 0xFEFF) {
3380                q += 2;
3381                bo = 1;
3382            }
3383            else if (bom == 0xFFFE) {
3384                q += 2;
3385                bo = -1;
3386            }
3387#endif
3388        }
3389    }
3390
3391    if (bo == -1) {
3392        /* force LE */
3393        ihi = 1;
3394        ilo = 0;
3395    }
3396    else if (bo == 1) {
3397        /* force BE */
3398        ihi = 0;
3399        ilo = 1;
3400    }
3401#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3402    native_ordering = ilo < ihi;
3403#else
3404    native_ordering = ilo > ihi;
3405#endif
3406
3407    aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
3408    while (q < e) {
3409        Py_UNICODE ch;
3410        /* First check for possible aligned read of a C 'long'. Unaligned
3411           reads are more expensive, better to defer to another iteration. */
3412        if (!((size_t) q & LONG_PTR_MASK)) {
3413            /* Fast path for runs of non-surrogate chars. */
3414            register const unsigned char *_q = q;
3415            Py_UNICODE *_p = p;
3416            if (native_ordering) {
3417                /* Native ordering is simple: as long as the input cannot
3418                   possibly contain a surrogate char, do an unrolled copy
3419                   of several 16-bit code points to the target object.
3420                   The non-surrogate check is done on several input bytes
3421                   at a time (as many as a C 'long' can contain). */
3422                while (_q < aligned_end) {
3423                    unsigned long data = * (unsigned long *) _q;
3424                    if (data & FAST_CHAR_MASK)
3425                        break;
3426                    _p[0] = ((unsigned short *) _q)[0];
3427                    _p[1] = ((unsigned short *) _q)[1];
3428#if (SIZEOF_LONG == 8)
3429                    _p[2] = ((unsigned short *) _q)[2];
3430                    _p[3] = ((unsigned short *) _q)[3];
3431#endif
3432                    _q += SIZEOF_LONG;
3433                    _p += SIZEOF_LONG / 2;
3434                }
3435            }
3436            else {
3437                /* Byteswapped ordering is similar, but we must decompose
3438                   the copy bytewise, and take care of zero'ing out the
3439                   upper bytes if the target object is in 32-bit units
3440                   (that is, in UCS-4 builds). */
3441                while (_q < aligned_end) {
3442                    unsigned long data = * (unsigned long *) _q;
3443                    if (data & SWAPPED_FAST_CHAR_MASK)
3444                        break;
3445                    /* Zero upper bytes in UCS-4 builds */
3446#if (Py_UNICODE_SIZE > 2)
3447                    _p[0] = 0;
3448                    _p[1] = 0;
3449#if (SIZEOF_LONG == 8)
3450                    _p[2] = 0;
3451                    _p[3] = 0;
3452#endif
3453#endif
3454                    /* Issue #4916; UCS-4 builds on big endian machines must
3455                       fill the two last bytes of each 4-byte unit. */
3456#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3457# define OFF 2
3458#else
3459# define OFF 0
3460#endif
3461                    ((unsigned char *) _p)[OFF + 1] = _q[0];
3462                    ((unsigned char *) _p)[OFF + 0] = _q[1];
3463                    ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3464                    ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3465#if (SIZEOF_LONG == 8)
3466                    ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3467                    ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3468                    ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3469                    ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3470#endif
3471#undef OFF
3472                    _q += SIZEOF_LONG;
3473                    _p += SIZEOF_LONG / 2;
3474                }
3475            }
3476            p = _p;
3477            q = _q;
3478            if (q >= e)
3479                break;
3480        }
3481        ch = (q[ihi] << 8) | q[ilo];
3482
3483        q += 2;
3484
3485        if (ch < 0xD800 || ch > 0xDFFF) {
3486            *p++ = ch;
3487            continue;
3488        }
3489
3490        /* UTF-16 code pair: */
3491        if (q > e) {
3492            errmsg = "unexpected end of data";
3493            startinpos = (((const char *)q) - 2) - starts;
3494            endinpos = ((const char *)e) + 1 - starts;
3495            goto utf16Error;
3496        }
3497        if (0xD800 <= ch && ch <= 0xDBFF) {
3498            Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3499            q += 2;
3500            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3501#ifndef Py_UNICODE_WIDE
3502                *p++ = ch;
3503                *p++ = ch2;
3504#else
3505                *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3506#endif
3507                continue;
3508            }
3509            else {
3510                errmsg = "illegal UTF-16 surrogate";
3511                startinpos = (((const char *)q)-4)-starts;
3512                endinpos = startinpos+2;
3513                goto utf16Error;
3514            }
3515
3516        }
3517        errmsg = "illegal encoding";
3518        startinpos = (((const char *)q)-2)-starts;
3519        endinpos = startinpos+2;
3520        /* Fall through to report the error */
3521
3522      utf16Error:
3523        outpos = p - PyUnicode_AS_UNICODE(unicode);
3524        if (unicode_decode_call_errorhandler(
3525                errors,
3526                &errorHandler,
3527                "utf16", errmsg,
3528                &starts,
3529                (const char **)&e,
3530                &startinpos,
3531                &endinpos,
3532                &exc,
3533                (const char **)&q,
3534                &unicode,
3535                &outpos,
3536                &p))
3537            goto onError;
3538    }
3539    /* remaining byte at the end? (size should be even) */
3540    if (e == q) {
3541        if (!consumed) {
3542            errmsg = "truncated data";
3543            startinpos = ((const char *)q) - starts;
3544            endinpos = ((const char *)e) + 1 - starts;
3545            outpos = p - PyUnicode_AS_UNICODE(unicode);
3546            if (unicode_decode_call_errorhandler(
3547                    errors,
3548                    &errorHandler,
3549                    "utf16", errmsg,
3550                    &starts,
3551                    (const char **)&e,
3552                    &startinpos,
3553                    &endinpos,
3554                    &exc,
3555                    (const char **)&q,
3556                    &unicode,
3557                    &outpos,
3558                    &p))
3559                goto onError;
3560            /* The remaining input chars are ignored if the callback
3561               chooses to skip the input */
3562        }
3563    }
3564
3565    if (byteorder)
3566        *byteorder = bo;
3567
3568    if (consumed)
3569        *consumed = (const char *)q-starts;
3570
3571    /* Adjust length */
3572    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
3573        goto onError;
3574
3575    Py_XDECREF(errorHandler);
3576    Py_XDECREF(exc);
3577    return (PyObject *)unicode;
3578
3579  onError:
3580    Py_DECREF(unicode);
3581    Py_XDECREF(errorHandler);
3582    Py_XDECREF(exc);
3583    return NULL;
3584}
3585
3586#undef FAST_CHAR_MASK
3587#undef SWAPPED_FAST_CHAR_MASK
3588
3589PyObject *
3590PyUnicode_EncodeUTF16(const Py_UNICODE *s,
3591                      Py_ssize_t size,
3592                      const char *errors,
3593                      int byteorder)
3594{
3595    PyObject *v;
3596    unsigned char *p;
3597    Py_ssize_t nsize, bytesize;
3598#ifdef Py_UNICODE_WIDE
3599    Py_ssize_t i, pairs;
3600#else
3601    const int pairs = 0;
3602#endif
3603    /* Offsets from p for storing byte pairs in the right order. */
3604#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3605    int ihi = 1, ilo = 0;
3606#else
3607    int ihi = 0, ilo = 1;
3608#endif
3609
3610#define STORECHAR(CH)                           \
3611    do {                                        \
3612        p[ihi] = ((CH) >> 8) & 0xff;            \
3613        p[ilo] = (CH) & 0xff;                   \
3614        p += 2;                                 \
3615    } while(0)
3616
3617#ifdef Py_UNICODE_WIDE
3618    for (i = pairs = 0; i < size; i++)
3619        if (s[i] >= 0x10000)
3620            pairs++;
3621#endif
3622    /* 2 * (size + pairs + (byteorder == 0)) */
3623    if (size > PY_SSIZE_T_MAX ||
3624        size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
3625        return PyErr_NoMemory();
3626    nsize = size + pairs + (byteorder == 0);
3627    bytesize = nsize * 2;
3628    if (bytesize / 2 != nsize)
3629        return PyErr_NoMemory();
3630    v = PyBytes_FromStringAndSize(NULL, bytesize);
3631    if (v == NULL)
3632        return NULL;
3633
3634    p = (unsigned char *)PyBytes_AS_STRING(v);
3635    if (byteorder == 0)
3636        STORECHAR(0xFEFF);
3637    if (size == 0)
3638        goto done;
3639
3640    if (byteorder == -1) {
3641        /* force LE */
3642        ihi = 1;
3643        ilo = 0;
3644    }
3645    else if (byteorder == 1) {
3646        /* force BE */
3647        ihi = 0;
3648        ilo = 1;
3649    }
3650
3651    while (size-- > 0) {
3652        Py_UNICODE ch = *s++;
3653        Py_UNICODE ch2 = 0;
3654#ifdef Py_UNICODE_WIDE
3655        if (ch >= 0x10000) {
3656            ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3657            ch  = 0xD800 | ((ch-0x10000) >> 10);
3658        }
3659#endif
3660        STORECHAR(ch);
3661        if (ch2)
3662            STORECHAR(ch2);
3663    }
3664
3665  done:
3666    return v;
3667#undef STORECHAR
3668}
3669
3670PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3671{
3672    if (!PyUnicode_Check(unicode)) {
3673        PyErr_BadArgument();
3674        return NULL;
3675    }
3676    return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
3677                                 PyUnicode_GET_SIZE(unicode),
3678                                 NULL,
3679                                 0);
3680}
3681
3682/* --- Unicode Escape Codec ----------------------------------------------- */
3683
/* File-local pointer to a _PyUnicode_Name_CAPI table; starts out NULL.
   NOTE(review): presumably filled in on first use by the escape decoder
   below for character-name lookups -- confirm against its users. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
3685
3686PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
3687                                        Py_ssize_t size,
3688                                        const char *errors)
3689{
3690    const char *starts = s;
3691    Py_ssize_t startinpos;
3692    Py_ssize_t endinpos;
3693    Py_ssize_t outpos;
3694    int i;
3695    PyUnicodeObject *v;
3696    Py_UNICODE *p;
3697    const char *end;
3698    char* message;
3699    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
3700    PyObject *errorHandler = NULL;
3701    PyObject *exc = NULL;
3702
3703    /* Escaped strings will always be longer than the resulting
3704       Unicode string, so we start with size here and then reduce the
3705       length after conversion to the true value.
3706       (but if the error callback returns a long replacement string
3707       we'll have to allocate more space) */
3708    v = _PyUnicode_New(size);
3709    if (v == NULL)
3710        goto onError;
3711    if (size == 0)
3712        return (PyObject *)v;
3713
3714    p = PyUnicode_AS_UNICODE(v);
3715    end = s + size;
3716
3717    while (s < end) {
3718        unsigned char c;
3719        Py_UNICODE x;
3720        int digits;
3721
3722        /* Non-escape characters are interpreted as Unicode ordinals */
3723        if (*s != '\\') {
3724            *p++ = (unsigned char) *s++;
3725            continue;
3726        }
3727
3728        startinpos = s-starts;
3729        /* \ - Escapes */
3730        s++;
3731        c = *s++;
3732        if (s > end)
3733            c = '\0'; /* Invalid after \ */
3734        switch (c) {
3735
3736            /* \x escapes */
3737        case '\n': break;
3738        case '\\': *p++ = '\\'; break;
3739        case '\'': *p++ = '\''; break;
3740        case '\"': *p++ = '\"'; break;
3741        case 'b': *p++ = '\b'; break;
3742        case 'f': *p++ = '\014'; break; /* FF */
3743        case 't': *p++ = '\t'; break;
3744        case 'n': *p++ = '\n'; break;
3745        case 'r': *p++ = '\r'; break;
3746        case 'v': *p++ = '\013'; break; /* VT */
3747        case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3748
3749            /* \OOO (octal) escapes */
3750        case '0': case '1': case '2': case '3':
3751        case '4': case '5': case '6': case '7':
3752            x = s[-1] - '0';
3753            if (s < end && '0' <= *s && *s <= '7') {
3754                x = (x<<3) + *s++ - '0';
3755                if (s < end && '0' <= *s && *s <= '7')
3756                    x = (x<<3) + *s++ - '0';
3757            }
3758            *p++ = x;
3759            break;
3760
3761            /* hex escapes */
3762            /* \xXX */
3763        case 'x':
3764            digits = 2;
3765            message = "truncated \\xXX escape";
3766            goto hexescape;
3767
3768            /* \uXXXX */
3769        case 'u':
3770            digits = 4;
3771            message = "truncated \\uXXXX escape";
3772            goto hexescape;
3773
3774            /* \UXXXXXXXX */
3775        case 'U':
3776            digits = 8;
3777            message = "truncated \\UXXXXXXXX escape";
3778        hexescape:
3779            chr = 0;
3780            outpos = p-PyUnicode_AS_UNICODE(v);
3781            if (s+digits>end) {
3782                endinpos = size;
3783                if (unicode_decode_call_errorhandler(
3784                        errors, &errorHandler,
3785                        "unicodeescape", "end of string in escape sequence",
3786                        &starts, &end, &startinpos, &endinpos, &exc, &s,
3787                        &v, &outpos, &p))
3788                    goto onError;
3789                goto nextByte;
3790            }
3791            for (i = 0; i < digits; ++i) {
3792                c = (unsigned char) s[i];
3793                if (!Py_ISXDIGIT(c)) {
3794                    endinpos = (s+i+1)-starts;
3795                    if (unicode_decode_call_errorhandler(
3796                            errors, &errorHandler,
3797                            "unicodeescape", message,
3798                            &starts, &end, &startinpos, &endinpos, &exc, &s,
3799                            &v, &outpos, &p))
3800                        goto onError;
3801                    goto nextByte;
3802                }
3803                chr = (chr<<4) & ~0xF;
3804                if (c >= '0' && c <= '9')
3805                    chr += c - '0';
3806                else if (c >= 'a' && c <= 'f')
3807                    chr += 10 + c - 'a';
3808                else
3809                    chr += 10 + c - 'A';
3810            }
3811            s += i;
3812            if (chr == 0xffffffff && PyErr_Occurred())
3813                /* _decoding_error will have already written into the
3814                   target buffer. */
3815                break;
3816        store:
3817            /* when we get here, chr is a 32-bit unicode character */
3818            if (chr <= 0xffff)
3819                /* UCS-2 character */
3820                *p++ = (Py_UNICODE) chr;
3821            else if (chr <= 0x10ffff) {
3822                /* UCS-4 character. Either store directly, or as
3823                   surrogate pair. */
3824#ifdef Py_UNICODE_WIDE
3825                *p++ = chr;
3826#else
3827                chr -= 0x10000L;
3828                *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
3829                *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
3830#endif
3831            } else {
3832                endinpos = s-starts;
3833                outpos = p-PyUnicode_AS_UNICODE(v);
3834                if (unicode_decode_call_errorhandler(
3835                        errors, &errorHandler,
3836                        "unicodeescape", "illegal Unicode character",
3837                        &starts, &end, &startinpos, &endinpos, &exc, &s,
3838                        &v, &outpos, &p))
3839                    goto onError;
3840            }
3841            break;
3842
3843            /* \N{name} */
3844        case 'N':
3845            message = "malformed \\N character escape";
3846            if (ucnhash_CAPI == NULL) {
3847                /* load the unicode data module */
3848                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
3849                if (ucnhash_CAPI == NULL)
3850                    goto ucnhashError;
3851            }
3852            if (*s == '{') {
3853                const char *start = s+1;
3854                /* look for the closing brace */
3855                while (*s != '}' && s < end)
3856                    s++;
3857                if (s > start && s < end && *s == '}') {
3858                    /* found a name.  look it up in the unicode database */
3859                    message = "unknown Unicode character name";
3860                    s++;
3861                    if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
3862                        goto store;
3863                }
3864            }
3865            endinpos = s-starts;
3866            outpos = p-PyUnicode_AS_UNICODE(v);
3867            if (unicode_decode_call_errorhandler(
3868                    errors, &errorHandler,
3869                    "unicodeescape", message,
3870                    &starts, &end, &startinpos, &endinpos, &exc, &s,
3871                    &v, &outpos, &p))
3872                goto onError;
3873            break;
3874
3875        default:
3876            if (s > end) {
3877                message = "\\ at end of string";
3878                s--;
3879                endinpos = s-starts;
3880                outpos = p-PyUnicode_AS_UNICODE(v);
3881                if (unicode_decode_call_errorhandler(
3882                        errors, &errorHandler,
3883                        "unicodeescape", message,
3884                        &starts, &end, &startinpos, &endinpos, &exc, &s,
3885                        &v, &outpos, &p))
3886                    goto onError;
3887            }
3888            else {
3889                *p++ = '\\';
3890                *p++ = (unsigned char)s[-1];
3891            }
3892            break;
3893        }
3894      nextByte:
3895        ;
3896    }
3897    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3898        goto onError;
3899    Py_XDECREF(errorHandler);
3900    Py_XDECREF(exc);
3901    return (PyObject *)v;
3902
3903  ucnhashError:
3904    PyErr_SetString(
3905        PyExc_UnicodeError,
3906        "\\N escapes not supported (can't load unicodedata module)"
3907        );
3908    Py_XDECREF(v);
3909    Py_XDECREF(errorHandler);
3910    Py_XDECREF(exc);
3911    return NULL;
3912
3913  onError:
3914    Py_XDECREF(v);
3915    Py_XDECREF(errorHandler);
3916    Py_XDECREF(exc);
3917    return NULL;
3918}
3919
3920/* Return a Unicode-Escape string version of the Unicode object.
3921
3922   If quotes is true, the string is enclosed in u"" or u'' quotes as
3923   appropriate.
3924
3925*/
3926
3927Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
3928                                             Py_ssize_t size,
3929                                             Py_UNICODE ch)
3930{
3931    /* like wcschr, but doesn't stop at NULL characters */
3932
3933    while (size-- > 0) {
3934        if (*s == ch)
3935            return s;
3936        s++;
3937    }
3938
3939    return NULL;
3940}
3941
/* Lower-case hexadecimal digits indexed by nibble value (0..15); the
   escape encoders in this file use it to emit \x, \u and \U escapes. */
static const char *hexdigits = "0123456789abcdef";
3943
3944PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3945                                        Py_ssize_t size)
3946{
3947    PyObject *repr;
3948    char *p;
3949
3950#ifdef Py_UNICODE_WIDE
3951    const Py_ssize_t expandsize = 10;
3952#else
3953    const Py_ssize_t expandsize = 6;
3954#endif
3955
3956    /* XXX(nnorwitz): rather than over-allocating, it would be
3957       better to choose a different scheme.  Perhaps scan the
3958       first N-chars of the string and allocate based on that size.
3959    */
3960    /* Initial allocation is based on the longest-possible unichr
3961       escape.
3962
3963       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3964       unichr, so in this case it's the longest unichr escape. In
3965       narrow (UTF-16) builds this is five chars per source unichr
3966       since there are two unichrs in the surrogate pair, so in narrow
3967       (UTF-16) builds it's not the longest unichr escape.
3968
3969       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3970       so in the narrow (UTF-16) build case it's the longest unichr
3971       escape.
3972    */
3973
3974    if (size == 0)
3975        return PyBytes_FromStringAndSize(NULL, 0);
3976
3977    if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
3978        return PyErr_NoMemory();
3979
3980    repr = PyBytes_FromStringAndSize(NULL,
3981                                     2
3982                                     + expandsize*size
3983                                     + 1);
3984    if (repr == NULL)
3985        return NULL;
3986
3987    p = PyBytes_AS_STRING(repr);
3988
3989    while (size-- > 0) {
3990        Py_UNICODE ch = *s++;
3991
3992        /* Escape backslashes */
3993        if (ch == '\\') {
3994            *p++ = '\\';
3995            *p++ = (char) ch;
3996            continue;
3997        }
3998
3999#ifdef Py_UNICODE_WIDE
4000        /* Map 21-bit characters to '\U00xxxxxx' */
4001        else if (ch >= 0x10000) {
4002            *p++ = '\\';
4003            *p++ = 'U';
4004            *p++ = hexdigits[(ch >> 28) & 0x0000000F];
4005            *p++ = hexdigits[(ch >> 24) & 0x0000000F];
4006            *p++ = hexdigits[(ch >> 20) & 0x0000000F];
4007            *p++ = hexdigits[(ch >> 16) & 0x0000000F];
4008            *p++ = hexdigits[(ch >> 12) & 0x0000000F];
4009            *p++ = hexdigits[(ch >> 8) & 0x0000000F];
4010            *p++ = hexdigits[(ch >> 4) & 0x0000000F];
4011            *p++ = hexdigits[ch & 0x0000000F];
4012            continue;
4013        }
4014#else
4015        /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4016        else if (ch >= 0xD800 && ch < 0xDC00) {
4017            Py_UNICODE ch2;
4018            Py_UCS4 ucs;
4019
4020            ch2 = *s++;
4021            size--;
4022            if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
4023                ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4024                *p++ = '\\';
4025                *p++ = 'U';
4026                *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
4027                *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
4028                *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
4029                *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
4030                *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
4031                *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
4032                *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
4033                *p++ = hexdigits[ucs & 0x0000000F];
4034                continue;
4035            }
4036            /* Fall through: isolated surrogates are copied as-is */
4037            s--;
4038            size++;
4039        }
4040#endif
4041
4042        /* Map 16-bit characters to '\uxxxx' */
4043        if (ch >= 256) {
4044            *p++ = '\\';
4045            *p++ = 'u';
4046            *p++ = hexdigits[(ch >> 12) & 0x000F];
4047            *p++ = hexdigits[(ch >> 8) & 0x000F];
4048            *p++ = hexdigits[(ch >> 4) & 0x000F];
4049            *p++ = hexdigits[ch & 0x000F];
4050        }
4051
4052        /* Map special whitespace to '\t', \n', '\r' */
4053        else if (ch == '\t') {
4054            *p++ = '\\';
4055            *p++ = 't';
4056        }
4057        else if (ch == '\n') {
4058            *p++ = '\\';
4059            *p++ = 'n';
4060        }
4061        else if (ch == '\r') {
4062            *p++ = '\\';
4063            *p++ = 'r';
4064        }
4065
4066        /* Map non-printable US ASCII to '\xhh' */
4067        else if (ch < ' ' || ch >= 0x7F) {
4068            *p++ = '\\';
4069            *p++ = 'x';
4070            *p++ = hexdigits[(ch >> 4) & 0x000F];
4071            *p++ = hexdigits[ch & 0x000F];
4072        }
4073
4074        /* Copy everything else as-is */
4075        else
4076            *p++ = (char) ch;
4077    }
4078
4079    assert(p - PyBytes_AS_STRING(repr) > 0);
4080    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
4081        return NULL;
4082    return repr;
4083}
4084
4085PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
4086{
4087    PyObject *s;
4088    if (!PyUnicode_Check(unicode)) {
4089        PyErr_BadArgument();
4090        return NULL;
4091    }
4092    s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4093                                      PyUnicode_GET_SIZE(unicode));
4094    return s;
4095}
4096
4097/* --- Raw Unicode Escape Codec ------------------------------------------- */
4098
4099PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
4100                                           Py_ssize_t size,
4101                                           const char *errors)
4102{
4103    const char *starts = s;
4104    Py_ssize_t startinpos;
4105    Py_ssize_t endinpos;
4106    Py_ssize_t outpos;
4107    PyUnicodeObject *v;
4108    Py_UNICODE *p;
4109    const char *end;
4110    const char *bs;
4111    PyObject *errorHandler = NULL;
4112    PyObject *exc = NULL;
4113
4114    /* Escaped strings will always be longer than the resulting
4115       Unicode string, so we start with size here and then reduce the
4116       length after conversion to the true value. (But decoding error
4117       handler might have to resize the string) */
4118    v = _PyUnicode_New(size);
4119    if (v == NULL)
4120        goto onError;
4121    if (size == 0)
4122        return (PyObject *)v;
4123    p = PyUnicode_AS_UNICODE(v);
4124    end = s + size;
4125    while (s < end) {
4126        unsigned char c;
4127        Py_UCS4 x;
4128        int i;
4129        int count;
4130
4131        /* Non-escape characters are interpreted as Unicode ordinals */
4132        if (*s != '\\') {
4133            *p++ = (unsigned char)*s++;
4134            continue;
4135        }
4136        startinpos = s-starts;
4137
4138        /* \u-escapes are only interpreted iff the number of leading
4139           backslashes if odd */
4140        bs = s;
4141        for (;s < end;) {
4142            if (*s != '\\')
4143                break;
4144            *p++ = (unsigned char)*s++;
4145        }
4146        if (((s - bs) & 1) == 0 ||
4147            s >= end ||
4148            (*s != 'u' && *s != 'U')) {
4149            continue;
4150        }
4151        p--;
4152        count = *s=='u' ? 4 : 8;
4153        s++;
4154
4155        /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
4156        outpos = p-PyUnicode_AS_UNICODE(v);
4157        for (x = 0, i = 0; i < count; ++i, ++s) {
4158            c = (unsigned char)*s;
4159            if (!Py_ISXDIGIT(c)) {
4160                endinpos = s-starts;
4161                if (unicode_decode_call_errorhandler(
4162                        errors, &errorHandler,
4163                        "rawunicodeescape", "truncated \\uXXXX",
4164                        &starts, &end, &startinpos, &endinpos, &exc, &s,
4165                        &v, &outpos, &p))
4166                    goto onError;
4167                goto nextByte;
4168            }
4169            x = (x<<4) & ~0xF;
4170            if (c >= '0' && c <= '9')
4171                x += c - '0';
4172            else if (c >= 'a' && c <= 'f')
4173                x += 10 + c - 'a';
4174            else
4175                x += 10 + c - 'A';
4176        }
4177        if (x <= 0xffff)
4178            /* UCS-2 character */
4179            *p++ = (Py_UNICODE) x;
4180        else if (x <= 0x10ffff) {
4181            /* UCS-4 character. Either store directly, or as
4182               surrogate pair. */
4183#ifdef Py_UNICODE_WIDE
4184            *p++ = (Py_UNICODE) x;
4185#else
4186            x -= 0x10000L;
4187            *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
4188            *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
4189#endif
4190        } else {
4191            endinpos = s-starts;
4192            outpos = p-PyUnicode_AS_UNICODE(v);
4193            if (unicode_decode_call_errorhandler(
4194                    errors, &errorHandler,
4195                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
4196                    &starts, &end, &startinpos, &endinpos, &exc, &s,
4197                    &v, &outpos, &p))
4198                goto onError;
4199        }
4200      nextByte:
4201        ;
4202    }
4203    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4204        goto onError;
4205    Py_XDECREF(errorHandler);
4206    Py_XDECREF(exc);
4207    return (PyObject *)v;
4208
4209  onError:
4210    Py_XDECREF(v);
4211    Py_XDECREF(errorHandler);
4212    Py_XDECREF(exc);
4213    return NULL;
4214}
4215
4216PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
4217                                           Py_ssize_t size)
4218{
4219    PyObject *repr;
4220    char *p;
4221    char *q;
4222
4223#ifdef Py_UNICODE_WIDE
4224    const Py_ssize_t expandsize = 10;
4225#else
4226    const Py_ssize_t expandsize = 6;
4227#endif
4228
4229    if (size > PY_SSIZE_T_MAX / expandsize)
4230        return PyErr_NoMemory();
4231
4232    repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
4233    if (repr == NULL)
4234        return NULL;
4235    if (size == 0)
4236        return repr;
4237
4238    p = q = PyBytes_AS_STRING(repr);
4239    while (size-- > 0) {
4240        Py_UNICODE ch = *s++;
4241#ifdef Py_UNICODE_WIDE
4242        /* Map 32-bit characters to '\Uxxxxxxxx' */
4243        if (ch >= 0x10000) {
4244            *p++ = '\\';
4245            *p++ = 'U';
4246            *p++ = hexdigits[(ch >> 28) & 0xf];
4247            *p++ = hexdigits[(ch >> 24) & 0xf];
4248            *p++ = hexdigits[(ch >> 20) & 0xf];
4249            *p++ = hexdigits[(ch >> 16) & 0xf];
4250            *p++ = hexdigits[(ch >> 12) & 0xf];
4251            *p++ = hexdigits[(ch >> 8) & 0xf];
4252            *p++ = hexdigits[(ch >> 4) & 0xf];
4253            *p++ = hexdigits[ch & 15];
4254        }
4255        else
4256#else
4257            /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4258            if (ch >= 0xD800 && ch < 0xDC00) {
4259                Py_UNICODE ch2;
4260                Py_UCS4 ucs;
4261
4262                ch2 = *s++;
4263                size--;
4264                if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
4265                    ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4266                    *p++ = '\\';
4267                    *p++ = 'U';
4268                    *p++ = hexdigits[(ucs >> 28) & 0xf];
4269                    *p++ = hexdigits[(ucs >> 24) & 0xf];
4270                    *p++ = hexdigits[(ucs >> 20) & 0xf];
4271                    *p++ = hexdigits[(ucs >> 16) & 0xf];
4272                    *p++ = hexdigits[(ucs >> 12) & 0xf];
4273                    *p++ = hexdigits[(ucs >> 8) & 0xf];
4274                    *p++ = hexdigits[(ucs >> 4) & 0xf];
4275                    *p++ = hexdigits[ucs & 0xf];
4276                    continue;
4277                }
4278                /* Fall through: isolated surrogates are copied as-is */
4279                s--;
4280                size++;
4281            }
4282#endif
4283        /* Map 16-bit characters to '\uxxxx' */
4284        if (ch >= 256) {
4285            *p++ = '\\';
4286            *p++ = 'u';
4287            *p++ = hexdigits[(ch >> 12) & 0xf];
4288            *p++ = hexdigits[(ch >> 8) & 0xf];
4289            *p++ = hexdigits[(ch >> 4) & 0xf];
4290            *p++ = hexdigits[ch & 15];
4291        }
4292        /* Copy everything else as-is */
4293        else
4294            *p++ = (char) ch;
4295    }
4296    size = p - q;
4297
4298    assert(size > 0);
4299    if (_PyBytes_Resize(&repr, size) < 0)
4300        return NULL;
4301    return repr;
4302}
4303
4304PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
4305{
4306    PyObject *s;
4307    if (!PyUnicode_Check(unicode)) {
4308        PyErr_BadArgument();
4309        return NULL;
4310    }
4311    s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4312                                         PyUnicode_GET_SIZE(unicode));
4313
4314    return s;
4315}
4316
4317/* --- Unicode Internal Codec ------------------------------------------- */
4318
4319PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
4320                                           Py_ssize_t size,
4321                                           const char *errors)
4322{
4323    const char *starts = s;
4324    Py_ssize_t startinpos;
4325    Py_ssize_t endinpos;
4326    Py_ssize_t outpos;
4327    PyUnicodeObject *v;
4328    Py_UNICODE *p;
4329    const char *end;
4330    const char *reason;
4331    PyObject *errorHandler = NULL;
4332    PyObject *exc = NULL;
4333
4334#ifdef Py_UNICODE_WIDE
4335    Py_UNICODE unimax = PyUnicode_GetMax();
4336#endif
4337
4338    /* XXX overflow detection missing */
4339    v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4340    if (v == NULL)
4341        goto onError;
4342    if (PyUnicode_GetSize((PyObject *)v) == 0)
4343        return (PyObject *)v;
4344    p = PyUnicode_AS_UNICODE(v);
4345    end = s + size;
4346
4347    while (s < end) {
4348        memcpy(p, s, sizeof(Py_UNICODE));
4349        /* We have to sanity check the raw data, otherwise doom looms for
4350           some malformed UCS-4 data. */
4351        if (
4352#ifdef Py_UNICODE_WIDE
4353            *p > unimax || *p < 0 ||
4354#endif
4355            end-s < Py_UNICODE_SIZE
4356            )
4357        {
4358            startinpos = s - starts;
4359            if (end-s < Py_UNICODE_SIZE) {
4360                endinpos = end-starts;
4361                reason = "truncated input";
4362            }
4363            else {
4364                endinpos = s - starts + Py_UNICODE_SIZE;
4365                reason = "illegal code point (> 0x10FFFF)";
4366            }
4367            outpos = p - PyUnicode_AS_UNICODE(v);
4368            if (unicode_decode_call_errorhandler(
4369                    errors, &errorHandler,
4370                    "unicode_internal", reason,
4371                    &starts, &end, &startinpos, &endinpos, &exc, &s,
4372                    &v, &outpos, &p)) {
4373                goto onError;
4374            }
4375        }
4376        else {
4377            p++;
4378            s += Py_UNICODE_SIZE;
4379        }
4380    }
4381
4382    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4383        goto onError;
4384    Py_XDECREF(errorHandler);
4385    Py_XDECREF(exc);
4386    return (PyObject *)v;
4387
4388  onError:
4389    Py_XDECREF(v);
4390    Py_XDECREF(errorHandler);
4391    Py_XDECREF(exc);
4392    return NULL;
4393}
4394
4395/* --- Latin-1 Codec ------------------------------------------------------ */
4396
4397PyObject *PyUnicode_DecodeLatin1(const char *s,
4398                                 Py_ssize_t size,
4399                                 const char *errors)
4400{
4401    PyUnicodeObject *v;
4402    Py_UNICODE *p;
4403    const char *e, *unrolled_end;
4404
4405    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
4406    if (size == 1) {
4407        Py_UNICODE r = *(unsigned char*)s;
4408        return PyUnicode_FromUnicode(&r, 1);
4409    }
4410
4411    v = _PyUnicode_New(size);
4412    if (v == NULL)
4413        goto onError;
4414    if (size == 0)
4415        return (PyObject *)v;
4416    p = PyUnicode_AS_UNICODE(v);
4417    e = s + size;
4418    /* Unrolling the copy makes it much faster by reducing the looping
4419       overhead. This is similar to what many memcpy() implementations do. */
4420    unrolled_end = e - 4;
4421    while (s < unrolled_end) {
4422        p[0] = (unsigned char) s[0];
4423        p[1] = (unsigned char) s[1];
4424        p[2] = (unsigned char) s[2];
4425        p[3] = (unsigned char) s[3];
4426        s += 4;
4427        p += 4;
4428    }
4429    while (s < e)
4430        *p++ = (unsigned char) *s++;
4431    return (PyObject *)v;
4432
4433  onError:
4434    Py_XDECREF(v);
4435    return NULL;
4436}
4437
4438/* create or adjust a UnicodeEncodeError */
4439static void make_encode_exception(PyObject **exceptionObject,
4440                                  const char *encoding,
4441                                  const Py_UNICODE *unicode, Py_ssize_t size,
4442                                  Py_ssize_t startpos, Py_ssize_t endpos,
4443                                  const char *reason)
4444{
4445    if (*exceptionObject == NULL) {
4446        *exceptionObject = PyUnicodeEncodeError_Create(
4447            encoding, unicode, size, startpos, endpos, reason);
4448    }
4449    else {
4450        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4451            goto onError;
4452        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4453            goto onError;
4454        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4455            goto onError;
4456        return;
4457      onError:
4458        Py_DECREF(*exceptionObject);
4459        *exceptionObject = NULL;
4460    }
4461}
4462
4463/* raises a UnicodeEncodeError */
4464static void raise_encode_exception(PyObject **exceptionObject,
4465                                   const char *encoding,
4466                                   const Py_UNICODE *unicode, Py_ssize_t size,
4467                                   Py_ssize_t startpos, Py_ssize_t endpos,
4468                                   const char *reason)
4469{
4470    make_encode_exception(exceptionObject,
4471                          encoding, unicode, size, startpos, endpos, reason);
4472    if (*exceptionObject != NULL)
4473        PyCodec_StrictErrors(*exceptionObject);
4474}
4475
4476/* error handling callback helper:
4477   build arguments, call the callback and check the arguments,
4478   put the result into newpos and return the replacement string, which
4479   has to be freed by the caller */
4480static PyObject *unicode_encode_call_errorhandler(const char *errors,
4481                                                  PyObject **errorHandler,
4482                                                  const char *encoding, const char *reason,
4483                                                  const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4484                                                  Py_ssize_t startpos, Py_ssize_t endpos,
4485                                                  Py_ssize_t *newpos)
4486{
4487    static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
4488
4489    PyObject *restuple;
4490    PyObject *resunicode;
4491
4492    if (*errorHandler == NULL) {
4493        *errorHandler = PyCodec_LookupError(errors);
4494        if (*errorHandler == NULL)
4495            return NULL;
4496    }
4497
4498    make_encode_exception(exceptionObject,
4499                          encoding, unicode, size, startpos, endpos, reason);
4500    if (*exceptionObject == NULL)
4501        return NULL;
4502
4503    restuple = PyObject_CallFunctionObjArgs(
4504        *errorHandler, *exceptionObject, NULL);
4505    if (restuple == NULL)
4506        return NULL;
4507    if (!PyTuple_Check(restuple)) {
4508        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4509        Py_DECREF(restuple);
4510        return NULL;
4511    }
4512    if (!PyArg_ParseTuple(restuple, argparse,
4513                          &resunicode, newpos)) {
4514        Py_DECREF(restuple);
4515        return NULL;
4516    }
4517    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4518        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4519        Py_DECREF(restuple);
4520        return NULL;
4521    }
4522    if (*newpos<0)
4523        *newpos = size+*newpos;
4524    if (*newpos<0 || *newpos>size) {
4525        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4526        Py_DECREF(restuple);
4527        return NULL;
4528    }
4529    Py_INCREF(resunicode);
4530    Py_DECREF(restuple);
4531    return resunicode;
4532}
4533
/* Shared encoder for the single-byte codecs in this file.

   Encodes the `size` Py_UNICODE code units starting at `p` into a new bytes
   object, writing every code unit below `limit` as one byte.

   `limit` is the exclusive upper bound of the encodable ordinals: 256
   selects the codec name "latin-1" and the matching error text, any other
   value selects "ascii" (the callers in this file pass 256 or 128).

   `errors` names the error handler; NULL is treated like "strict".  The
   handlers "strict", "replace", "ignore" and "xmlcharrefreplace" are
   implemented inline; every other name is dispatched through
   unicode_encode_call_errorhandler().

   Returns a new reference to the resulting bytes object, or NULL on
   failure. */
static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
                                     Py_ssize_t size,
                                     const char *errors,
                                     int limit)
{
    /* output object */
    PyObject *res;
    /* pointers to the beginning and end+1 of input */
    const Py_UNICODE *startp = p;
    const Py_UNICODE *endp = p + size;
    /* pointer to the beginning of the unencodable characters */
    /* const Py_UNICODE *badp = NULL; */
    /* pointer into the output */
    char *str;
    /* currently allocated size of the output object */
    Py_ssize_t ressize;
    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    /* the following variable is used for caching string comparisons
     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
    int known_errorHandler = -1;

    /* allocate enough for a simple encoding without
       replacements, if we need more, we'll resize */
    if (size == 0)
        return PyBytes_FromStringAndSize(NULL, 0);
    res = PyBytes_FromStringAndSize(NULL, size);
    if (res == NULL)
        return NULL;
    str = PyBytes_AS_STRING(res);
    ressize = size;

    while (p<endp) {
        Py_UNICODE c = *p;

        /* can we encode this? */
        if (c<limit) {
            /* no overflow check, because we know that the space is enough */
            *str++ = (char)c;
            ++p;
        }
        else {
            /* input index of the first unencodable character of this run */
            Py_ssize_t unicodepos = p-startp;
            Py_ssize_t requiredsize;
            PyObject *repunicode;
            Py_ssize_t repsize;
            Py_ssize_t newpos;
            Py_ssize_t respos;
            Py_UNICODE *uni2;
            /* startpos for collecting unencodable chars */
            const Py_UNICODE *collstart = p;
            const Py_UNICODE *collend = p;
            /* find all unencodable characters */
            while ((collend < endp) && ((*collend)>=limit))
                ++collend;
            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
            if (known_errorHandler==-1) {
                if ((errors==NULL) || (!strcmp(errors, "strict")))
                    known_errorHandler = 1;
                else if (!strcmp(errors, "replace"))
                    known_errorHandler = 2;
                else if (!strcmp(errors, "ignore"))
                    known_errorHandler = 3;
                else if (!strcmp(errors, "xmlcharrefreplace"))
                    known_errorHandler = 4;
                else
                    known_errorHandler = 0;
            }
            switch (known_errorHandler) {
            case 1: /* strict */
                raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
                goto onError;
            case 2: /* replace */
                /* one '?' per unencodable character, so the initial
                   one-byte-per-character allocation still suffices */
                while (collstart++<collend)
                    *str++ = '?'; /* fall through */
            case 3: /* ignore */
                /* skip the whole run of unencodable characters */
                p = collend;
                break;
            case 4: /* xmlcharrefreplace */
                respos = str - PyBytes_AS_STRING(res);
                /* determine replacement size (temporarily (mis)uses p);
                   each term is 2 ("&#") + number of decimal digits + 1 (";") */
                for (p = collstart, repsize = 0; p < collend; ++p) {
                    if (*p<10)
                        repsize += 2+1+1;
                    else if (*p<100)
                        repsize += 2+2+1;
                    else if (*p<1000)
                        repsize += 2+3+1;
                    else if (*p<10000)
                        repsize += 2+4+1;
#ifndef Py_UNICODE_WIDE
                    else
                        repsize += 2+5+1;
#else
                    else if (*p<100000)
                        repsize += 2+5+1;
                    else if (*p<1000000)
                        repsize += 2+6+1;
                    else
                        repsize += 2+7+1;
#endif
                }
                /* what has been written + the replacement + one byte for
                   each input character after the run */
                requiredsize = respos+repsize+(endp-collend);
                if (requiredsize > ressize) {
                    /* grow at least geometrically */
                    if (requiredsize<2*ressize)
                        requiredsize = 2*ressize;
                    if (_PyBytes_Resize(&res, requiredsize))
                        goto onError;
                    /* the resize may have moved the buffer */
                    str = PyBytes_AS_STRING(res) + respos;
                    ressize = requiredsize;
                }
                /* generate replacement (temporarily (mis)uses p) */
                for (p = collstart; p < collend; ++p) {
                    str += sprintf(str, "&#%d;", (int)*p);
                }
                p = collend;
                break;
            default:
                /* any other handler name: call the registered error handler */
                repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
                                                              encoding, reason, startp, size, &exc,
                                                              collstart-startp, collend-startp, &newpos);
                if (repunicode == NULL)
                    goto onError;
                if (PyBytes_Check(repunicode)) {
                    /* Directly copy bytes result to output. */
                    repsize = PyBytes_Size(repunicode);
                    /* NOTE(review): the buffer is grown only when repsize > 1,
                       and then by repsize-1 bytes, i.e. one already allocated
                       byte is budgeted for this replacement.  Whether that is
                       always enough when the handler returns a newpos before
                       collend is not checked here -- confirm. */
                    if (repsize > 1) {
                        /* Make room for all additional bytes. */
                        respos = str - PyBytes_AS_STRING(res);
                        if (_PyBytes_Resize(&res, ressize+repsize-1)) {
                            Py_DECREF(repunicode);
                            goto onError;
                        }
                        str = PyBytes_AS_STRING(res) + respos;
                        ressize += repsize-1;
                    }
                    memcpy(str, PyBytes_AsString(repunicode), repsize);
                    str += repsize;
                    /* continue at the position chosen by the handler */
                    p = startp + newpos;
                    Py_DECREF(repunicode);
                    break;
                }
                /* need more space? (at least enough for what we
                   have+the replacement+the rest of the string, so
                   we won't have to check space for encodable characters) */
                respos = str - PyBytes_AS_STRING(res);
                repsize = PyUnicode_GET_SIZE(repunicode);
                requiredsize = respos+repsize+(endp-collend);
                if (requiredsize > ressize) {
                    if (requiredsize<2*ressize)
                        requiredsize = 2*ressize;
                    if (_PyBytes_Resize(&res, requiredsize)) {
                        Py_DECREF(repunicode);
                        goto onError;
                    }
                    str = PyBytes_AS_STRING(res) + respos;
                    ressize = requiredsize;
                }
                /* check if there is anything unencodable in the replacement
                   and copy it to the output */
                for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
                    c = *uni2;
                    if (c >= limit) {
                        /* the replacement itself cannot be encoded: report
                           the first character of the original run */
                        raise_encode_exception(&exc, encoding, startp, size,
                                               unicodepos, unicodepos+1, reason);
                        Py_DECREF(repunicode);
                        goto onError;
                    }
                    *str = (char)c;
                }
                /* continue at the position chosen by the handler */
                p = startp + newpos;
                Py_DECREF(repunicode);
            }
        }
    }
    /* Resize if we allocated too much */
    size = str - PyBytes_AS_STRING(res);
    if (size < ressize) { /* If this fails res will be NULL */
        assert(size >= 0);
        if (_PyBytes_Resize(&res, size) < 0)
            goto onError;
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return res;

  onError:
    Py_XDECREF(res);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
4729
4730PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
4731                                 Py_ssize_t size,
4732                                 const char *errors)
4733{
4734    return unicode_encode_ucs1(p, size, errors, 256);
4735}
4736
4737PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4738{
4739    if (!PyUnicode_Check(unicode)) {
4740        PyErr_BadArgument();
4741        return NULL;
4742    }
4743    return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
4744                                  PyUnicode_GET_SIZE(unicode),
4745                                  NULL);
4746}
4747
4748/* --- 7-bit ASCII Codec -------------------------------------------------- */
4749
4750PyObject *PyUnicode_DecodeASCII(const char *s,
4751                                Py_ssize_t size,
4752                                const char *errors)
4753{
4754    const char *starts = s;
4755    PyUnicodeObject *v;
4756    Py_UNICODE *p;
4757    Py_ssize_t startinpos;
4758    Py_ssize_t endinpos;
4759    Py_ssize_t outpos;
4760    const char *e;
4761    PyObject *errorHandler = NULL;
4762    PyObject *exc = NULL;
4763
4764    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4765    if (size == 1 && *(unsigned char*)s < 128) {
4766        Py_UNICODE r = *(unsigned char*)s;
4767        return PyUnicode_FromUnicode(&r, 1);
4768    }
4769
4770    v = _PyUnicode_New(size);
4771    if (v == NULL)
4772        goto onError;
4773    if (size == 0)
4774        return (PyObject *)v;
4775    p = PyUnicode_AS_UNICODE(v);
4776    e = s + size;
4777    while (s < e) {
4778        register unsigned char c = (unsigned char)*s;
4779        if (c < 128) {
4780            *p++ = c;
4781            ++s;
4782        }
4783        else {
4784            startinpos = s-starts;
4785            endinpos = startinpos + 1;
4786            outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4787            if (unicode_decode_call_errorhandler(
4788                    errors, &errorHandler,
4789                    "ascii", "ordinal not in range(128)",
4790                    &starts, &e, &startinpos, &endinpos, &exc, &s,
4791                    &v, &outpos, &p))
4792                goto onError;
4793        }
4794    }
4795    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
4796        if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4797            goto onError;
4798    Py_XDECREF(errorHandler);
4799    Py_XDECREF(exc);
4800    return (PyObject *)v;
4801
4802  onError:
4803    Py_XDECREF(v);
4804    Py_XDECREF(errorHandler);
4805    Py_XDECREF(exc);
4806    return NULL;
4807}
4808
4809PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
4810                                Py_ssize_t size,
4811                                const char *errors)
4812{
4813    return unicode_encode_ucs1(p, size, errors, 128);
4814}
4815
4816PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4817{
4818    if (!PyUnicode_Check(unicode)) {
4819        PyErr_BadArgument();
4820        return NULL;
4821    }
4822    return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
4823                                 PyUnicode_GET_SIZE(unicode),
4824                                 NULL);
4825}
4826
4827#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
4828
4829/* --- MBCS codecs for Windows -------------------------------------------- */
4830
4831#if SIZEOF_INT < SIZEOF_SIZE_T
4832#define NEED_RETRY
4833#endif
4834
4835/* XXX This code is limited to "true" double-byte encodings, as
4836   a) it assumes an incomplete character consists of a single byte, and
4837   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
4838   encodings, see IsDBCSLeadByteEx documentation. */
4839
/* Return 1 if the byte at s[offset] is treated as a DBCS lead byte, 0
   otherwise.

   A byte that IsDBCSLeadByte() accepts only counts when the position
   returned by CharPrev() is the byte itself, is not a lead byte, or lies
   exactly two bytes earlier. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *here = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*here))
        return 0;

    before = CharPrev(s, here);
    if (before == here)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return (here - before == 2);
}
4850
4851/*
4852 * Decode MBCS string into unicode object. If 'final' is set, converts
4853 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4854 */
4855static int decode_mbcs(PyUnicodeObject **v,
4856                       const char *s, /* MBCS string */
4857                       int size, /* sizeof MBCS string */
4858                       int final,
4859                       const char *errors)
4860{
4861    Py_UNICODE *p;
4862    Py_ssize_t n;
4863    DWORD usize;
4864    DWORD flags;
4865
4866    assert(size >= 0);
4867
4868    /* check and handle 'errors' arg */
4869    if (errors==NULL || strcmp(errors, "strict")==0)
4870        flags = MB_ERR_INVALID_CHARS;
4871    else if (strcmp(errors, "ignore")==0)
4872        flags = 0;
4873    else {
4874        PyErr_Format(PyExc_ValueError,
4875                     "mbcs encoding does not support errors='%s'",
4876                     errors);
4877        return -1;
4878    }
4879
4880    /* Skip trailing lead-byte unless 'final' is set */
4881    if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
4882        --size;
4883
4884    /* First get the size of the result */
4885    if (size > 0) {
4886        usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
4887        if (usize==0)
4888            goto mbcs_decode_error;
4889    } else
4890        usize = 0;
4891
4892    if (*v == NULL) {
4893        /* Create unicode object */
4894        *v = _PyUnicode_New(usize);
4895        if (*v == NULL)
4896            return -1;
4897        n = 0;
4898    }
4899    else {
4900        /* Extend unicode object */
4901        n = PyUnicode_GET_SIZE(*v);
4902        if (_PyUnicode_Resize(v, n + usize) < 0)
4903            return -1;
4904    }
4905
4906    /* Do the conversion */
4907    if (usize > 0) {
4908        p = PyUnicode_AS_UNICODE(*v) + n;
4909        if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
4910            goto mbcs_decode_error;
4911        }
4912    }
4913    return size;
4914
4915mbcs_decode_error:
4916    /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
4917       we raise a UnicodeDecodeError - else it is a 'generic'
4918       windows error
4919     */
4920    if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
4921        /* Ideally, we should get reason from FormatMessage - this
4922           is the Windows 2000 English version of the message
4923        */
4924        PyObject *exc = NULL;
4925        const char *reason = "No mapping for the Unicode character exists "
4926                             "in the target multi-byte code page.";
4927        make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
4928        if (exc != NULL) {
4929            PyCodec_StrictErrors(exc);
4930            Py_DECREF(exc);
4931        }
4932    } else {
4933        PyErr_SetFromWindowsErrWithFilename(0, NULL);
4934    }
4935    return -1;
4936}
4937
4938PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
4939                                       Py_ssize_t size,
4940                                       const char *errors,
4941                                       Py_ssize_t *consumed)
4942{
4943    PyUnicodeObject *v = NULL;
4944    int done;
4945
4946    if (consumed)
4947        *consumed = 0;
4948
4949#ifdef NEED_RETRY
4950  retry:
4951    if (size > INT_MAX)
4952        done = decode_mbcs(&v, s, INT_MAX, 0, errors);
4953    else
4954#endif
4955        done = decode_mbcs(&v, s, (int)size, !consumed, errors);
4956
4957    if (done < 0) {
4958        Py_XDECREF(v);
4959        return NULL;
4960    }
4961
4962    if (consumed)
4963        *consumed += done;
4964
4965#ifdef NEED_RETRY
4966    if (size > INT_MAX) {
4967        s += done;
4968        size -= done;
4969        goto retry;
4970    }
4971#endif
4972
4973    return (PyObject *)v;
4974}
4975
4976PyObject *PyUnicode_DecodeMBCS(const char *s,
4977                               Py_ssize_t size,
4978                               const char *errors)
4979{
4980    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4981}
4982
4983/*
4984 * Convert unicode into string object (MBCS).
4985 * Returns 0 if succeed, -1 otherwise.
4986 */
4987static int encode_mbcs(PyObject **repr,
4988                       const Py_UNICODE *p, /* unicode */
4989                       int size, /* size of unicode */
4990                       const char* errors)
4991{
4992    BOOL usedDefaultChar = FALSE;
4993    BOOL *pusedDefaultChar;
4994    int mbcssize;
4995    Py_ssize_t n;
4996    PyObject *exc = NULL;
4997    DWORD flags;
4998
4999    assert(size >= 0);
5000
5001    /* check and handle 'errors' arg */
5002    if (errors==NULL || strcmp(errors, "strict")==0) {
5003        flags = WC_NO_BEST_FIT_CHARS;
5004        pusedDefaultChar = &usedDefaultChar;
5005    } else if (strcmp(errors, "replace")==0) {
5006        flags = 0;
5007        pusedDefaultChar = NULL;
5008    } else {
5009         PyErr_Format(PyExc_ValueError,
5010                      "mbcs encoding does not support errors='%s'",
5011                      errors);
5012         return -1;
5013    }
5014
5015    /* First get the size of the result */
5016    if (size > 0) {
5017        mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
5018                                       NULL, pusedDefaultChar);
5019        if (mbcssize == 0) {
5020            PyErr_SetFromWindowsErrWithFilename(0, NULL);
5021            return -1;
5022        }
5023        /* If we used a default char, then we failed! */
5024        if (pusedDefaultChar && *pusedDefaultChar)
5025            goto mbcs_encode_error;
5026    } else {
5027        mbcssize = 0;
5028    }
5029
5030    if (*repr == NULL) {
5031        /* Create string object */
5032        *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
5033        if (*repr == NULL)
5034            return -1;
5035        n = 0;
5036    }
5037    else {
5038        /* Extend string object */
5039        n = PyBytes_Size(*repr);
5040        if (_PyBytes_Resize(repr, n + mbcssize) < 0)
5041            return -1;
5042    }
5043
5044    /* Do the conversion */
5045    if (size > 0) {
5046        char *s = PyBytes_AS_STRING(*repr) + n;
5047        if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
5048                                     NULL, pusedDefaultChar)) {
5049            PyErr_SetFromWindowsErrWithFilename(0, NULL);
5050            return -1;
5051        }
5052        if (pusedDefaultChar && *pusedDefaultChar)
5053            goto mbcs_encode_error;
5054    }
5055    return 0;
5056
5057mbcs_encode_error:
5058    raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
5059    Py_XDECREF(exc);
5060    return -1;
5061}
5062
5063PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
5064                               Py_ssize_t size,
5065                               const char *errors)
5066{
5067    PyObject *repr = NULL;
5068    int ret;
5069
5070#ifdef NEED_RETRY
5071  retry:
5072    if (size > INT_MAX)
5073        ret = encode_mbcs(&repr, p, INT_MAX, errors);
5074    else
5075#endif
5076        ret = encode_mbcs(&repr, p, (int)size, errors);
5077
5078    if (ret < 0) {
5079        Py_XDECREF(repr);
5080        return NULL;
5081    }
5082
5083#ifdef NEED_RETRY
5084    if (size > INT_MAX) {
5085        p += INT_MAX;
5086        size -= INT_MAX;
5087        goto retry;
5088    }
5089#endif
5090
5091    return repr;
5092}
5093
5094PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
5095{
5096    if (!PyUnicode_Check(unicode)) {
5097        PyErr_BadArgument();
5098        return NULL;
5099    }
5100    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
5101                                PyUnicode_GET_SIZE(unicode),
5102                                NULL);
5103}
5104
5105#undef NEED_RETRY
5106
5107#endif /* MS_WINDOWS */
5108
5109/* --- Character Mapping Codec -------------------------------------------- */
5110
/* Decode `size` bytes starting at `s` through the character mapping
   `mapping`.

   - mapping == NULL: the input is decoded with PyUnicode_DecodeLatin1().
   - mapping is exactly a str: each byte value indexes into the string; an
     index past its end or an entry equal to 0xfffe is an undefined
     mapping.
   - otherwise: mapping[byte] is looked up with PyObject_GetItem().  An int
     in range(65536) yields that code unit, a str yields zero, one or
     several characters, and None (or a LookupError) is an undefined
     mapping.  Any other result type raises TypeError.

   Undefined mappings are passed to unicode_decode_call_errorhandler()
   together with `errors`.

   Returns a new unicode object, or NULL on failure. */
PyObject *PyUnicode_DecodeCharmap(const char *s,
                                  Py_ssize_t size,
                                  PyObject *mapping,
                                  const char *errors)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *v;
    Py_UNICODE *p;
    /* spare output slots obtained from earlier 1-n resizes and not yet
       used up by later 1-n mappings */
    Py_ssize_t extrachars = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    Py_UNICODE *mapstring = NULL;
    Py_ssize_t maplen = 0;

    /* Default to Latin-1 */
    if (mapping == NULL)
        return PyUnicode_DecodeLatin1(s, size, errors);

    /* start with one output slot per input byte */
    v = _PyUnicode_New(size);
    if (v == NULL)
        goto onError;
    if (size == 0)
        return (PyObject *)v;
    p = PyUnicode_AS_UNICODE(v);
    e = s + size;
    if (PyUnicode_CheckExact(mapping)) {
        /* fast path: the mapping is a plain str used as a lookup table */
        mapstring = PyUnicode_AS_UNICODE(mapping);
        maplen = PyUnicode_GET_SIZE(mapping);
        while (s < e) {
            unsigned char ch = *s;
            Py_UNICODE x = 0xfffe; /* illegal value */

            /* bytes beyond the end of the table keep the illegal value */
            if (ch < maplen)
                x = mapstring[ch];

            if (x == 0xfffe) {
                /* undefined mapping */
                outpos = p-PyUnicode_AS_UNICODE(v);
                startinpos = s-starts;
                endinpos = startinpos+1;
                if (unicode_decode_call_errorhandler(
                        errors, &errorHandler,
                        "charmap", "character maps to <undefined>",
                        &starts, &e, &startinpos, &endinpos, &exc, &s,
                        &v, &outpos, &p)) {
                    goto onError;
                }
                /* s is not advanced here; the handler call received &s */
                continue;
            }
            *p++ = x;
            ++s;
        }
    }
    else {
        /* general path: look every byte up in the mapping object */
        while (s < e) {
            unsigned char ch = *s;
            PyObject *w, *x;

            /* Get mapping (char ordinal -> integer, Unicode char or None) */
            w = PyLong_FromLong((long)ch);
            if (w == NULL)
                goto onError;
            x = PyObject_GetItem(mapping, w);
            Py_DECREF(w);
            if (x == NULL) {
                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
                    /* No mapping found means: mapping is undefined. */
                    PyErr_Clear();
                    x = Py_None;
                    Py_INCREF(x);
                } else
                    goto onError;
            }

            /* Apply mapping */
            if (PyLong_Check(x)) {
                long value = PyLong_AS_LONG(x);
                if (value < 0 || value > 65535) {
                    PyErr_SetString(PyExc_TypeError,
                                    "character mapping must be in range(65536)");
                    Py_DECREF(x);
                    goto onError;
                }
                *p++ = (Py_UNICODE)value;
            }
            else if (x == Py_None) {
                /* undefined mapping */
                outpos = p-PyUnicode_AS_UNICODE(v);
                startinpos = s-starts;
                endinpos = startinpos+1;
                if (unicode_decode_call_errorhandler(
                        errors, &errorHandler,
                        "charmap", "character maps to <undefined>",
                        &starts, &e, &startinpos, &endinpos, &exc, &s,
                        &v, &outpos, &p)) {
                    Py_DECREF(x);
                    goto onError;
                }
                Py_DECREF(x);
                /* s is not advanced here; the handler call received &s */
                continue;
            }
            else if (PyUnicode_Check(x)) {
                Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);

                if (targetsize == 1)
                    /* 1-1 mapping */
                    *p++ = *PyUnicode_AS_UNICODE(x);

                else if (targetsize > 1) {
                    /* 1-n mapping */
                    if (targetsize > extrachars) {
                        /* resize first: grow by the shortfall plus four
                           times the replacement length as headroom */
                        Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
                        Py_ssize_t needed = (targetsize - extrachars) + \
                            (targetsize << 2);
                        extrachars += needed;
                        /* XXX overflow detection missing */
                        if (_PyUnicode_Resize(&v,
                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
                            Py_DECREF(x);
                            goto onError;
                        }
                        /* the resize may have moved the buffer */
                        p = PyUnicode_AS_UNICODE(v) + oldpos;
                    }
                    Py_UNICODE_COPY(p,
                                    PyUnicode_AS_UNICODE(x),
                                    targetsize);
                    p += targetsize;
                    extrachars -= targetsize;
                }
                /* 1-0 mapping: skip the character */
            }
            else {
                /* wrong return value */
                PyErr_SetString(PyExc_TypeError,
                                "character mapping must return integer, None or str");
                Py_DECREF(x);
                goto onError;
            }
            Py_DECREF(x);
            ++s;
        }
    }
    /* trim the result when fewer code units were written than allocated */
    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
        if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
            goto onError;
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)v;

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
}
5271
5272/* Charmap encoding: the lookup table */
5273
/* Three-level lookup table built by PyUnicode_BuildEncodingMap() and read
   by encoding_map_lookup(); it maps a character to a byte value. */
struct encoding_map{
    PyObject_HEAD
    /* level 1: indexed by (c >> 11); holds the number of the level-2
       block for that range, or 0xFF when no block was assigned */
    unsigned char level1[32];
    /* number of 16-entry level-2 blocks and of 128-entry level-3 blocks
       stored in level23 */
    int count2, count3;
    /* variable-length tail: 16*count2 bytes of level-2 entries followed by
       128*count3 bytes of level-3 entries.  Declared with one element;
       the real size is chosen when the object is allocated. */
    unsigned char level23[1];
};
5280
5281static PyObject*
5282encoding_map_size(PyObject *obj, PyObject* args)
5283{
5284    struct encoding_map *map = (struct encoding_map*)obj;
5285    return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
5286                           128*map->count3);
5287}
5288
/* Method table of EncodingMapType: the only method is size(), which takes
   no arguments.  The zeroed entry terminates the table. */
static PyMethodDef encoding_map_methods[] = {
    {"size", encoding_map_size, METH_NOARGS,
     PyDoc_STR("Return the size (in bytes) of this object") },
    { 0 }
};
5294
/* tp_dealloc of EncodingMapType: release the object's memory with
   PyObject_FREE().  The object holds no references to other objects (see
   struct encoding_map), so nothing else is released here. */
static void
encoding_map_dealloc(PyObject* o)
{
    PyObject_FREE(o);
}
5300
/* Type object for the trie objects created by PyUnicode_BuildEncodingMap().
   Only tp_name, tp_basicsize, tp_dealloc, tp_flags and tp_methods are
   filled in; every other slot is left at 0. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_reserved*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
5344
5345PyObject*
5346PyUnicode_BuildEncodingMap(PyObject* string)
5347{
5348    Py_UNICODE *decode;
5349    PyObject *result;
5350    struct encoding_map *mresult;
5351    int i;
5352    int need_dict = 0;
5353    unsigned char level1[32];
5354    unsigned char level2[512];
5355    unsigned char *mlevel1, *mlevel2, *mlevel3;
5356    int count2 = 0, count3 = 0;
5357
5358    if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5359        PyErr_BadArgument();
5360        return NULL;
5361    }
5362    decode = PyUnicode_AS_UNICODE(string);
5363    memset(level1, 0xFF, sizeof level1);
5364    memset(level2, 0xFF, sizeof level2);
5365
5366    /* If there isn't a one-to-one mapping of NULL to \0,
5367       or if there are non-BMP characters, we need to use
5368       a mapping dictionary. */
5369    if (decode[0] != 0)
5370        need_dict = 1;
5371    for (i = 1; i < 256; i++) {
5372        int l1, l2;
5373        if (decode[i] == 0
5374#ifdef Py_UNICODE_WIDE
5375            || decode[i] > 0xFFFF
5376#endif
5377            ) {
5378            need_dict = 1;
5379            break;
5380        }
5381        if (decode[i] == 0xFFFE)
5382            /* unmapped character */
5383            continue;
5384        l1 = decode[i] >> 11;
5385        l2 = decode[i] >> 7;
5386        if (level1[l1] == 0xFF)
5387            level1[l1] = count2++;
5388        if (level2[l2] == 0xFF)
5389            level2[l2] = count3++;
5390    }
5391
5392    if (count2 >= 0xFF || count3 >= 0xFF)
5393        need_dict = 1;
5394
5395    if (need_dict) {
5396        PyObject *result = PyDict_New();
5397        PyObject *key, *value;
5398        if (!result)
5399            return NULL;
5400        for (i = 0; i < 256; i++) {
5401            key = value = NULL;
5402            key = PyLong_FromLong(decode[i]);
5403            value = PyLong_FromLong(i);
5404            if (!key || !value)
5405                goto failed1;
5406            if (PyDict_SetItem(result, key, value) == -1)
5407                goto failed1;
5408            Py_DECREF(key);
5409            Py_DECREF(value);
5410        }
5411        return result;
5412      failed1:
5413        Py_XDECREF(key);
5414        Py_XDECREF(value);
5415        Py_DECREF(result);
5416        return NULL;
5417    }
5418
5419    /* Create a three-level trie */
5420    result = PyObject_MALLOC(sizeof(struct encoding_map) +
5421                             16*count2 + 128*count3 - 1);
5422    if (!result)
5423        return PyErr_NoMemory();
5424    PyObject_Init(result, &EncodingMapType);
5425    mresult = (struct encoding_map*)result;
5426    mresult->count2 = count2;
5427    mresult->count3 = count3;
5428    mlevel1 = mresult->level1;
5429    mlevel2 = mresult->level23;
5430    mlevel3 = mresult->level23 + 16*count2;
5431    memcpy(mlevel1, level1, 32);
5432    memset(mlevel2, 0xFF, 16*count2);
5433    memset(mlevel3, 0, 128*count3);
5434    count3 = 0;
5435    for (i = 1; i < 256; i++) {
5436        int o1, o2, o3, i2, i3;
5437        if (decode[i] == 0xFFFE)
5438            /* unmapped character */
5439            continue;
5440        o1 = decode[i]>>11;
5441        o2 = (decode[i]>>7) & 0xF;
5442        i2 = 16*mlevel1[o1] + o2;
5443        if (mlevel2[i2] == 0xFF)
5444            mlevel2[i2] = count3++;
5445        o3 = decode[i] & 0x7F;
5446        i3 = 128*mlevel2[i2] + o3;
5447        mlevel3[i3] = i;
5448    }
5449    return result;
5450}
5451
5452static int
5453encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5454{
5455    struct encoding_map *map = (struct encoding_map*)mapping;
5456    int l1 = c>>11;
5457    int l2 = (c>>7) & 0xF;
5458    int l3 = c & 0x7F;
5459    int i;
5460
5461#ifdef Py_UNICODE_WIDE
5462    if (c > 0xFFFF) {
5463        return -1;
5464    }
5465#endif
5466    if (c == 0)
5467        return 0;
5468    /* level 1*/
5469    i = map->level1[l1];
5470    if (i == 0xFF) {
5471        return -1;
5472    }
5473    /* level 2*/
5474    i = map->level23[16*i+l2];
5475    if (i == 0xFF) {
5476        return -1;
5477    }
5478    /* level 3 */
5479    i = map->level23[16*map->count2 + 128*i + l3];
5480    if (i == 0) {
5481        return -1;
5482    }
5483    return i;
5484}
5485
5486/* Lookup the character ch in the mapping. If the character
5487   can't be found, Py_None is returned (or NULL, if another
5488   error occurred). */
5489static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
5490{
5491    PyObject *w = PyLong_FromLong((long)c);
5492    PyObject *x;
5493
5494    if (w == NULL)
5495        return NULL;
5496    x = PyObject_GetItem(mapping, w);
5497    Py_DECREF(w);
5498    if (x == NULL) {
5499        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5500            /* No mapping found means: mapping is undefined. */
5501            PyErr_Clear();
5502            x = Py_None;
5503            Py_INCREF(x);
5504            return x;
5505        } else
5506            return NULL;
5507    }
5508    else if (x == Py_None)
5509        return x;
5510    else if (PyLong_Check(x)) {
5511        long value = PyLong_AS_LONG(x);
5512        if (value < 0 || value > 255) {
5513            PyErr_SetString(PyExc_TypeError,
5514                            "character mapping must be in range(256)");
5515            Py_DECREF(x);
5516            return NULL;
5517        }
5518        return x;
5519    }
5520    else if (PyBytes_Check(x))
5521        return x;
5522    else {
5523        /* wrong return value */
5524        PyErr_Format(PyExc_TypeError,
5525                     "character mapping must return integer, bytes or None, not %.400s",
5526                     x->ob_type->tp_name);
5527        Py_DECREF(x);
5528        return NULL;
5529    }
5530}
5531
5532static int
5533charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
5534{
5535    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5536    /* exponentially overallocate to minimize reallocations */
5537    if (requiredsize < 2*outsize)
5538        requiredsize = 2*outsize;
5539    if (_PyBytes_Resize(outobj, requiredsize))
5540        return -1;
5541    return 0;
5542}
5543
/* Outcome of charmapencode_output() for a single character. */
typedef enum charmapencode_result {
    enc_SUCCESS = 0,    /* the character was written to the output */
    enc_FAILED = 1,     /* the mapping has no entry; nothing was written */
    enc_EXCEPTION = 2   /* lookup or buffer resize failed */
} charmapencode_result;
5547/* lookup the character, put the result in the output string and adjust
5548   various state variables. Resize the output bytes object if not enough
5549   space is available. Return a new reference to the object that
5550   was put in the output buffer, or Py_None, if the mapping was undefined
5551   (in which case no character was written) or NULL, if a
5552   reallocation error occurred. The caller must decref the result */
5553static
5554charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
5555                                          PyObject **outobj, Py_ssize_t *outpos)
5556{
5557    PyObject *rep;
5558    char *outstart;
5559    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5560
5561    if (Py_TYPE(mapping) == &EncodingMapType) {
5562        int res = encoding_map_lookup(c, mapping);
5563        Py_ssize_t requiredsize = *outpos+1;
5564        if (res == -1)
5565            return enc_FAILED;
5566        if (outsize<requiredsize)
5567            if (charmapencode_resize(outobj, outpos, requiredsize))
5568                return enc_EXCEPTION;
5569        outstart = PyBytes_AS_STRING(*outobj);
5570        outstart[(*outpos)++] = (char)res;
5571        return enc_SUCCESS;
5572    }
5573
5574    rep = charmapencode_lookup(c, mapping);
5575    if (rep==NULL)
5576        return enc_EXCEPTION;
5577    else if (rep==Py_None) {
5578        Py_DECREF(rep);
5579        return enc_FAILED;
5580    } else {
5581        if (PyLong_Check(rep)) {
5582            Py_ssize_t requiredsize = *outpos+1;
5583            if (outsize<requiredsize)
5584                if (charmapencode_resize(outobj, outpos, requiredsize)) {
5585                    Py_DECREF(rep);
5586                    return enc_EXCEPTION;
5587                }
5588            outstart = PyBytes_AS_STRING(*outobj);
5589            outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
5590        }
5591        else {
5592            const char *repchars = PyBytes_AS_STRING(rep);
5593            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5594            Py_ssize_t requiredsize = *outpos+repsize;
5595            if (outsize<requiredsize)
5596                if (charmapencode_resize(outobj, outpos, requiredsize)) {
5597                    Py_DECREF(rep);
5598                    return enc_EXCEPTION;
5599                }
5600            outstart = PyBytes_AS_STRING(*outobj);
5601            memcpy(outstart + *outpos, repchars, repsize);
5602            *outpos += repsize;
5603        }
5604    }
5605    Py_DECREF(rep);
5606    return enc_SUCCESS;
5607}
5608
5609/* handle an error in PyUnicode_EncodeCharmap
5610   Return 0 on success, -1 on error */
5611static
5612int charmap_encoding_error(
5613    const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
5614    PyObject **exceptionObject,
5615    int *known_errorHandler, PyObject **errorHandler, const char *errors,
5616    PyObject **res, Py_ssize_t *respos)
5617{
5618    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5619    Py_ssize_t repsize;
5620    Py_ssize_t newpos;
5621    Py_UNICODE *uni2;
5622    /* startpos for collecting unencodable chars */
5623    Py_ssize_t collstartpos = *inpos;
5624    Py_ssize_t collendpos = *inpos+1;
5625    Py_ssize_t collpos;
5626    char *encoding = "charmap";
5627    char *reason = "character maps to <undefined>";
5628    charmapencode_result x;
5629
5630    /* find all unencodable characters */
5631    while (collendpos < size) {
5632        PyObject *rep;
5633        if (Py_TYPE(mapping) == &EncodingMapType) {
5634            int res = encoding_map_lookup(p[collendpos], mapping);
5635            if (res != -1)
5636                break;
5637            ++collendpos;
5638            continue;
5639        }
5640
5641        rep = charmapencode_lookup(p[collendpos], mapping);
5642        if (rep==NULL)
5643            return -1;
5644        else if (rep!=Py_None) {
5645            Py_DECREF(rep);
5646            break;
5647        }
5648        Py_DECREF(rep);
5649        ++collendpos;
5650    }
5651    /* cache callback name lookup
5652     * (if not done yet, i.e. it's the first error) */
5653    if (*known_errorHandler==-1) {
5654        if ((errors==NULL) || (!strcmp(errors, "strict")))
5655            *known_errorHandler = 1;
5656        else if (!strcmp(errors, "replace"))
5657            *known_errorHandler = 2;
5658        else if (!strcmp(errors, "ignore"))
5659            *known_errorHandler = 3;
5660        else if (!strcmp(errors, "xmlcharrefreplace"))
5661            *known_errorHandler = 4;
5662        else
5663            *known_errorHandler = 0;
5664    }
5665    switch (*known_errorHandler) {
5666    case 1: /* strict */
5667        raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5668        return -1;
5669    case 2: /* replace */
5670        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
5671            x = charmapencode_output('?', mapping, res, respos);
5672            if (x==enc_EXCEPTION) {
5673                return -1;
5674            }
5675            else if (x==enc_FAILED) {
5676                raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5677                return -1;
5678            }
5679        }
5680        /* fall through */
5681    case 3: /* ignore */
5682        *inpos = collendpos;
5683        break;
5684    case 4: /* xmlcharrefreplace */
5685        /* generate replacement (temporarily (mis)uses p) */
5686        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
5687            char buffer[2+29+1+1];
5688            char *cp;
5689            sprintf(buffer, "&#%d;", (int)p[collpos]);
5690            for (cp = buffer; *cp; ++cp) {
5691                x = charmapencode_output(*cp, mapping, res, respos);
5692                if (x==enc_EXCEPTION)
5693                    return -1;
5694                else if (x==enc_FAILED) {
5695                    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5696                    return -1;
5697                }
5698            }
5699        }
5700        *inpos = collendpos;
5701        break;
5702    default:
5703        repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
5704                                                      encoding, reason, p, size, exceptionObject,
5705                                                      collstartpos, collendpos, &newpos);
5706        if (repunicode == NULL)
5707            return -1;
5708        if (PyBytes_Check(repunicode)) {
5709            /* Directly copy bytes result to output. */
5710            Py_ssize_t outsize = PyBytes_Size(*res);
5711            Py_ssize_t requiredsize;
5712            repsize = PyBytes_Size(repunicode);
5713            requiredsize = *respos + repsize;
5714            if (requiredsize > outsize)
5715                /* Make room for all additional bytes. */
5716                if (charmapencode_resize(res, respos, requiredsize)) {
5717                    Py_DECREF(repunicode);
5718                    return -1;
5719                }
5720            memcpy(PyBytes_AsString(*res) + *respos,
5721                   PyBytes_AsString(repunicode),  repsize);
5722            *respos += repsize;
5723            *inpos = newpos;
5724            Py_DECREF(repunicode);
5725            break;
5726        }
5727        /* generate replacement  */
5728        repsize = PyUnicode_GET_SIZE(repunicode);
5729        for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5730            x = charmapencode_output(*uni2, mapping, res, respos);
5731            if (x==enc_EXCEPTION) {
5732                return -1;
5733            }
5734            else if (x==enc_FAILED) {
5735                Py_DECREF(repunicode);
5736                raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5737                return -1;
5738            }
5739        }
5740        *inpos = newpos;
5741        Py_DECREF(repunicode);
5742    }
5743    return 0;
5744}
5745
5746PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
5747                                  Py_ssize_t size,
5748                                  PyObject *mapping,
5749                                  const char *errors)
5750{
5751    /* output object */
5752    PyObject *res = NULL;
5753    /* current input position */
5754    Py_ssize_t inpos = 0;
5755    /* current output position */
5756    Py_ssize_t respos = 0;
5757    PyObject *errorHandler = NULL;
5758    PyObject *exc = NULL;
5759    /* the following variable is used for caching string comparisons
5760     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5761     * 3=ignore, 4=xmlcharrefreplace */
5762    int known_errorHandler = -1;
5763
5764    /* Default to Latin-1 */
5765    if (mapping == NULL)
5766        return PyUnicode_EncodeLatin1(p, size, errors);
5767
5768    /* allocate enough for a simple encoding without
5769       replacements, if we need more, we'll resize */
5770    res = PyBytes_FromStringAndSize(NULL, size);
5771    if (res == NULL)
5772        goto onError;
5773    if (size == 0)
5774        return res;
5775
5776    while (inpos<size) {
5777        /* try to encode it */
5778        charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5779        if (x==enc_EXCEPTION) /* error */
5780            goto onError;
5781        if (x==enc_FAILED) { /* unencodable character */
5782            if (charmap_encoding_error(p, size, &inpos, mapping,
5783                                       &exc,
5784                                       &known_errorHandler, &errorHandler, errors,
5785                                       &res, &respos)) {
5786                goto onError;
5787            }
5788        }
5789        else
5790            /* done with this character => adjust input position */
5791            ++inpos;
5792    }
5793
5794    /* Resize if we allocated to much */
5795    if (respos<PyBytes_GET_SIZE(res))
5796        if (_PyBytes_Resize(&res, respos) < 0)
5797            goto onError;
5798
5799    Py_XDECREF(exc);
5800    Py_XDECREF(errorHandler);
5801    return res;
5802
5803  onError:
5804    Py_XDECREF(res);
5805    Py_XDECREF(exc);
5806    Py_XDECREF(errorHandler);
5807    return NULL;
5808}
5809
5810PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
5811                                    PyObject *mapping)
5812{
5813    if (!PyUnicode_Check(unicode) || mapping == NULL) {
5814        PyErr_BadArgument();
5815        return NULL;
5816    }
5817    return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
5818                                   PyUnicode_GET_SIZE(unicode),
5819                                   mapping,
5820                                   NULL);
5821}
5822
5823/* create or adjust a UnicodeTranslateError */
5824static void make_translate_exception(PyObject **exceptionObject,
5825                                     const Py_UNICODE *unicode, Py_ssize_t size,
5826                                     Py_ssize_t startpos, Py_ssize_t endpos,
5827                                     const char *reason)
5828{
5829    if (*exceptionObject == NULL) {
5830        *exceptionObject = PyUnicodeTranslateError_Create(
5831            unicode, size, startpos, endpos, reason);
5832    }
5833    else {
5834        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5835            goto onError;
5836        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5837            goto onError;
5838        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5839            goto onError;
5840        return;
5841      onError:
5842        Py_DECREF(*exceptionObject);
5843        *exceptionObject = NULL;
5844    }
5845}
5846
5847/* raises a UnicodeTranslateError */
5848static void raise_translate_exception(PyObject **exceptionObject,
5849                                      const Py_UNICODE *unicode, Py_ssize_t size,
5850                                      Py_ssize_t startpos, Py_ssize_t endpos,
5851                                      const char *reason)
5852{
5853    make_translate_exception(exceptionObject,
5854                             unicode, size, startpos, endpos, reason);
5855    if (*exceptionObject != NULL)
5856        PyCodec_StrictErrors(*exceptionObject);
5857}
5858
5859/* error handling callback helper:
5860   build arguments, call the callback and check the arguments,
5861   put the result into newpos and return the replacement string, which
5862   has to be freed by the caller */
5863static PyObject *unicode_translate_call_errorhandler(const char *errors,
5864                                                     PyObject **errorHandler,
5865                                                     const char *reason,
5866                                                     const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5867                                                     Py_ssize_t startpos, Py_ssize_t endpos,
5868                                                     Py_ssize_t *newpos)
5869{
5870    static char *argparse = "O!n;translating error handler must return (str, int) tuple";
5871
5872    Py_ssize_t i_newpos;
5873    PyObject *restuple;
5874    PyObject *resunicode;
5875
5876    if (*errorHandler == NULL) {
5877        *errorHandler = PyCodec_LookupError(errors);
5878        if (*errorHandler == NULL)
5879            return NULL;
5880    }
5881
5882    make_translate_exception(exceptionObject,
5883                             unicode, size, startpos, endpos, reason);
5884    if (*exceptionObject == NULL)
5885        return NULL;
5886
5887    restuple = PyObject_CallFunctionObjArgs(
5888        *errorHandler, *exceptionObject, NULL);
5889    if (restuple == NULL)
5890        return NULL;
5891    if (!PyTuple_Check(restuple)) {
5892        PyErr_SetString(PyExc_TypeError, &argparse[4]);
5893        Py_DECREF(restuple);
5894        return NULL;
5895    }
5896    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
5897                          &resunicode, &i_newpos)) {
5898        Py_DECREF(restuple);
5899        return NULL;
5900    }
5901    if (i_newpos<0)
5902        *newpos = size+i_newpos;
5903    else
5904        *newpos = i_newpos;
5905    if (*newpos<0 || *newpos>size) {
5906        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5907        Py_DECREF(restuple);
5908        return NULL;
5909    }
5910    Py_INCREF(resunicode);
5911    Py_DECREF(restuple);
5912    return resunicode;
5913}
5914
5915/* Lookup the character ch in the mapping and put the result in result,
5916   which must be decrefed by the caller.
5917   Return 0 on success, -1 on error */
5918static
5919int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5920{
5921    PyObject *w = PyLong_FromLong((long)c);
5922    PyObject *x;
5923
5924    if (w == NULL)
5925        return -1;
5926    x = PyObject_GetItem(mapping, w);
5927    Py_DECREF(w);
5928    if (x == NULL) {
5929        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5930            /* No mapping found means: use 1:1 mapping. */
5931            PyErr_Clear();
5932            *result = NULL;
5933            return 0;
5934        } else
5935            return -1;
5936    }
5937    else if (x == Py_None) {
5938        *result = x;
5939        return 0;
5940    }
5941    else if (PyLong_Check(x)) {
5942        long value = PyLong_AS_LONG(x);
5943        long max = PyUnicode_GetMax();
5944        if (value < 0 || value > max) {
5945            PyErr_Format(PyExc_TypeError,
5946                         "character mapping must be in range(0x%x)", max+1);
5947            Py_DECREF(x);
5948            return -1;
5949        }
5950        *result = x;
5951        return 0;
5952    }
5953    else if (PyUnicode_Check(x)) {
5954        *result = x;
5955        return 0;
5956    }
5957    else {
5958        /* wrong return value */
5959        PyErr_SetString(PyExc_TypeError,
5960                        "character mapping must return integer, None or str");
5961        Py_DECREF(x);
5962        return -1;
5963    }
5964}
5965/* ensure that *outobj is at least requiredsize characters long,
5966   if not reallocate and adjust various state variables.
5967   Return 0 on success, -1 on error */
5968static
5969int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
5970                               Py_ssize_t requiredsize)
5971{
5972    Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
5973    if (requiredsize > oldsize) {
5974        /* remember old output position */
5975        Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5976        /* exponentially overallocate to minimize reallocations */
5977        if (requiredsize < 2 * oldsize)
5978            requiredsize = 2 * oldsize;
5979        if (PyUnicode_Resize(outobj, requiredsize) < 0)
5980            return -1;
5981        *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
5982    }
5983    return 0;
5984}
5985/* lookup the character, put the result in the output string and adjust
5986   various state variables. Return a new reference to the object that
5987   was put in the output buffer in *result, or Py_None, if the mapping was
5988   undefined (in which case no character was written).
5989   The called must decref result.
5990   Return 0 on success, -1 on error. */
5991static
5992int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
5993                            Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5994                            PyObject **res)
5995{
5996    if (charmaptranslate_lookup(*curinp, mapping, res))
5997        return -1;
5998    if (*res==NULL) {
5999        /* not found => default to 1:1 mapping */
6000        *(*outp)++ = *curinp;
6001    }
6002    else if (*res==Py_None)
6003        ;
6004    else if (PyLong_Check(*res)) {
6005        /* no overflow check, because we know that the space is enough */
6006        *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
6007    }
6008    else if (PyUnicode_Check(*res)) {
6009        Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
6010        if (repsize==1) {
6011            /* no overflow check, because we know that the space is enough */
6012            *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
6013        }
6014        else if (repsize!=0) {
6015            /* more than one character */
6016            Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
6017                (insize - (curinp-startinp)) +
6018                repsize - 1;
6019            if (charmaptranslate_makespace(outobj, outp, requiredsize))
6020                return -1;
6021            memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
6022            *outp += repsize;
6023        }
6024    }
6025    else
6026        return -1;
6027    return 0;
6028}
6029
6030PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
6031                                     Py_ssize_t size,
6032                                     PyObject *mapping,
6033                                     const char *errors)
6034{
6035    /* output object */
6036    PyObject *res = NULL;
6037    /* pointers to the beginning and end+1 of input */
6038    const Py_UNICODE *startp = p;
6039    const Py_UNICODE *endp = p + size;
6040    /* pointer into the output */
6041    Py_UNICODE *str;
6042    /* current output position */
6043    Py_ssize_t respos = 0;
6044    char *reason = "character maps to <undefined>";
6045    PyObject *errorHandler = NULL;
6046    PyObject *exc = NULL;
6047    /* the following variable is used for caching string comparisons
6048     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
6049     * 3=ignore, 4=xmlcharrefreplace */
6050    int known_errorHandler = -1;
6051
6052    if (mapping == NULL) {
6053        PyErr_BadArgument();
6054        return NULL;
6055    }
6056
6057    /* allocate enough for a simple 1:1 translation without
6058       replacements, if we need more, we'll resize */
6059    res = PyUnicode_FromUnicode(NULL, size);
6060    if (res == NULL)
6061        goto onError;
6062    if (size == 0)
6063        return res;
6064    str = PyUnicode_AS_UNICODE(res);
6065
6066    while (p<endp) {
6067        /* try to encode it */
6068        PyObject *x = NULL;
6069        if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
6070            Py_XDECREF(x);
6071            goto onError;
6072        }
6073        Py_XDECREF(x);
6074        if (x!=Py_None) /* it worked => adjust input pointer */
6075            ++p;
6076        else { /* untranslatable character */
6077            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
6078            Py_ssize_t repsize;
6079            Py_ssize_t newpos;
6080            Py_UNICODE *uni2;
6081            /* startpos for collecting untranslatable chars */
6082            const Py_UNICODE *collstart = p;
6083            const Py_UNICODE *collend = p+1;
6084            const Py_UNICODE *coll;
6085
6086            /* find all untranslatable characters */
6087            while (collend < endp) {
6088                if (charmaptranslate_lookup(*collend, mapping, &x))
6089                    goto onError;
6090                Py_XDECREF(x);
6091                if (x!=Py_None)
6092                    break;
6093                ++collend;
6094            }
6095            /* cache callback name lookup
6096             * (if not done yet, i.e. it's the first error) */
6097            if (known_errorHandler==-1) {
6098                if ((errors==NULL) || (!strcmp(errors, "strict")))
6099                    known_errorHandler = 1;
6100                else if (!strcmp(errors, "replace"))
6101                    known_errorHandler = 2;
6102                else if (!strcmp(errors, "ignore"))
6103                    known_errorHandler = 3;
6104                else if (!strcmp(errors, "xmlcharrefreplace"))
6105                    known_errorHandler = 4;
6106                else
6107                    known_errorHandler = 0;
6108            }
6109            switch (known_errorHandler) {
6110            case 1: /* strict */
6111                raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
6112                goto onError;
6113            case 2: /* replace */
6114                /* No need to check for space, this is a 1:1 replacement */
6115                for (coll = collstart; coll<collend; ++coll)
6116                    *str++ = '?';
6117                /* fall through */
6118            case 3: /* ignore */
6119                p = collend;
6120                break;
6121            case 4: /* xmlcharrefreplace */
6122                /* generate replacement (temporarily (mis)uses p) */
6123                for (p = collstart; p < collend; ++p) {
6124                    char buffer[2+29+1+1];
6125                    char *cp;
6126                    sprintf(buffer, "&#%d;", (int)*p);
6127                    if (charmaptranslate_makespace(&res, &str,
6128                                                   (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
6129                        goto onError;
6130                    for (cp = buffer; *cp; ++cp)
6131                        *str++ = *cp;
6132                }
6133                p = collend;
6134                break;
6135            default:
6136                repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
6137                                                                 reason, startp, size, &exc,
6138                                                                 collstart-startp, collend-startp, &newpos);
6139                if (repunicode == NULL)
6140                    goto onError;
6141                /* generate replacement  */
6142                repsize = PyUnicode_GET_SIZE(repunicode);
6143                if (charmaptranslate_makespace(&res, &str,
6144                                               (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
6145                    Py_DECREF(repunicode);
6146                    goto onError;
6147                }
6148                for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
6149                    *str++ = *uni2;
6150                p = startp + newpos;
6151                Py_DECREF(repunicode);
6152            }
6153        }
6154    }
6155    /* Resize if we allocated to much */
6156    respos = str-PyUnicode_AS_UNICODE(res);
6157    if (respos<PyUnicode_GET_SIZE(res)) {
6158        if (PyUnicode_Resize(&res, respos) < 0)
6159            goto onError;
6160    }
6161    Py_XDECREF(exc);
6162    Py_XDECREF(errorHandler);
6163    return res;
6164
6165  onError:
6166    Py_XDECREF(res);
6167    Py_XDECREF(exc);
6168    Py_XDECREF(errorHandler);
6169    return NULL;
6170}
6171
6172PyObject *PyUnicode_Translate(PyObject *str,
6173                              PyObject *mapping,
6174                              const char *errors)
6175{
6176    PyObject *result;
6177
6178    str = PyUnicode_FromObject(str);
6179    if (str == NULL)
6180        goto onError;
6181    result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
6182                                        PyUnicode_GET_SIZE(str),
6183                                        mapping,
6184                                        errors);
6185    Py_DECREF(str);
6186    return result;
6187
6188  onError:
6189    Py_XDECREF(str);
6190    return NULL;
6191}
6192
6193/* --- Decimal Encoder ---------------------------------------------------- */
6194
6195int PyUnicode_EncodeDecimal(Py_UNICODE *s,
6196                            Py_ssize_t length,
6197                            char *output,
6198                            const char *errors)
6199{
6200    Py_UNICODE *p, *end;
6201    PyObject *errorHandler = NULL;
6202    PyObject *exc = NULL;
6203    const char *encoding = "decimal";
6204    const char *reason = "invalid decimal Unicode string";
6205    /* the following variable is used for caching string comparisons
6206     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6207    int known_errorHandler = -1;
6208
6209    if (output == NULL) {
6210        PyErr_BadArgument();
6211        return -1;
6212    }
6213
6214    p = s;
6215    end = s + length;
6216    while (p < end) {
6217        register Py_UNICODE ch = *p;
6218        int decimal;
6219        PyObject *repunicode;
6220        Py_ssize_t repsize;
6221        Py_ssize_t newpos;
6222        Py_UNICODE *uni2;
6223        Py_UNICODE *collstart;
6224        Py_UNICODE *collend;
6225
6226        if (Py_UNICODE_ISSPACE(ch)) {
6227            *output++ = ' ';
6228            ++p;
6229            continue;
6230        }
6231        decimal = Py_UNICODE_TODECIMAL(ch);
6232        if (decimal >= 0) {
6233            *output++ = '0' + decimal;
6234            ++p;
6235            continue;
6236        }
6237        if (0 < ch && ch < 256) {
6238            *output++ = (char)ch;
6239            ++p;
6240            continue;
6241        }
6242        /* All other characters are considered unencodable */
6243        collstart = p;
6244        collend = p+1;
6245        while (collend < end) {
6246            if ((0 < *collend && *collend < 256) ||
6247                !Py_UNICODE_ISSPACE(*collend) ||
6248                Py_UNICODE_TODECIMAL(*collend))
6249                break;
6250        }
6251        /* cache callback name lookup
6252         * (if not done yet, i.e. it's the first error) */
6253        if (known_errorHandler==-1) {
6254            if ((errors==NULL) || (!strcmp(errors, "strict")))
6255                known_errorHandler = 1;
6256            else if (!strcmp(errors, "replace"))
6257                known_errorHandler = 2;
6258            else if (!strcmp(errors, "ignore"))
6259                known_errorHandler = 3;
6260            else if (!strcmp(errors, "xmlcharrefreplace"))
6261                known_errorHandler = 4;
6262            else
6263                known_errorHandler = 0;
6264        }
6265        switch (known_errorHandler) {
6266        case 1: /* strict */
6267            raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
6268            goto onError;
6269        case 2: /* replace */
6270            for (p = collstart; p < collend; ++p)
6271                *output++ = '?';
6272            /* fall through */
6273        case 3: /* ignore */
6274            p = collend;
6275            break;
6276        case 4: /* xmlcharrefreplace */
6277            /* generate replacement (temporarily (mis)uses p) */
6278            for (p = collstart; p < collend; ++p)
6279                output += sprintf(output, "&#%d;", (int)*p);
6280            p = collend;
6281            break;
6282        default:
6283            repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6284                                                          encoding, reason, s, length, &exc,
6285                                                          collstart-s, collend-s, &newpos);
6286            if (repunicode == NULL)
6287                goto onError;
6288            if (!PyUnicode_Check(repunicode)) {
6289                /* Byte results not supported, since they have no decimal property. */
6290                PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
6291                Py_DECREF(repunicode);
6292                goto onError;
6293            }
6294            /* generate replacement  */
6295            repsize = PyUnicode_GET_SIZE(repunicode);
6296            for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6297                Py_UNICODE ch = *uni2;
6298                if (Py_UNICODE_ISSPACE(ch))
6299                    *output++ = ' ';
6300                else {
6301                    decimal = Py_UNICODE_TODECIMAL(ch);
6302                    if (decimal >= 0)
6303                        *output++ = '0' + decimal;
6304                    else if (0 < ch && ch < 256)
6305                        *output++ = (char)ch;
6306                    else {
6307                        Py_DECREF(repunicode);
6308                        raise_encode_exception(&exc, encoding,
6309                                               s, length, collstart-s, collend-s, reason);
6310                        goto onError;
6311                    }
6312                }
6313            }
6314            p = s + newpos;
6315            Py_DECREF(repunicode);
6316        }
6317    }
6318    /* 0-terminate the output string */
6319    *output++ = '\0';
6320    Py_XDECREF(exc);
6321    Py_XDECREF(errorHandler);
6322    return 0;
6323
6324  onError:
6325    Py_XDECREF(exc);
6326    Py_XDECREF(errorHandler);
6327    return -1;
6328}
6329
6330/* --- Helpers ------------------------------------------------------------ */
6331
6332#include "stringlib/unicodedefs.h"
6333#include "stringlib/fastsearch.h"
6334
6335#include "stringlib/count.h"
6336#include "stringlib/find.h"
6337#include "stringlib/partition.h"
6338#include "stringlib/split.h"
6339
6340#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
6341#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
6342#include "stringlib/localeutil.h"
6343
/* Helper macro: fix up the slice bounds `start` and `end` for a string
   of length `len`.  An `end` larger than len becomes len; a negative
   `start` or `end` has len added to it and is then raised to 0 if it is
   still negative.  `start` is not capped at len, so start > end is
   possible afterwards and callers must cope with a negative-length range.
   `start` and `end` must be modifiable lvalues.  Every argument is
   evaluated more than once, so none may have side effects.
   The body is wrapped in do { } while (0) so the expansion is a single
   statement that is safe inside unbraced if/else bodies. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
6358
6359Py_ssize_t PyUnicode_Count(PyObject *str,
6360                           PyObject *substr,
6361                           Py_ssize_t start,
6362                           Py_ssize_t end)
6363{
6364    Py_ssize_t result;
6365    PyUnicodeObject* str_obj;
6366    PyUnicodeObject* sub_obj;
6367
6368    str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6369    if (!str_obj)
6370        return -1;
6371    sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6372    if (!sub_obj) {
6373        Py_DECREF(str_obj);
6374        return -1;
6375    }
6376
6377    ADJUST_INDICES(start, end, str_obj->length);
6378    result = stringlib_count(
6379        str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6380        PY_SSIZE_T_MAX
6381        );
6382
6383    Py_DECREF(sub_obj);
6384    Py_DECREF(str_obj);
6385
6386    return result;
6387}
6388
6389Py_ssize_t PyUnicode_Find(PyObject *str,
6390                          PyObject *sub,
6391                          Py_ssize_t start,
6392                          Py_ssize_t end,
6393                          int direction)
6394{
6395    Py_ssize_t result;
6396
6397    str = PyUnicode_FromObject(str);
6398    if (!str)
6399        return -2;
6400    sub = PyUnicode_FromObject(sub);
6401    if (!sub) {
6402        Py_DECREF(str);
6403        return -2;
6404    }
6405
6406    if (direction > 0)
6407        result = stringlib_find_slice(
6408            PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6409            PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6410            start, end
6411            );
6412    else
6413        result = stringlib_rfind_slice(
6414            PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6415            PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6416            start, end
6417            );
6418
6419    Py_DECREF(str);
6420    Py_DECREF(sub);
6421
6422    return result;
6423}
6424
6425static
6426int tailmatch(PyUnicodeObject *self,
6427              PyUnicodeObject *substring,
6428              Py_ssize_t start,
6429              Py_ssize_t end,
6430              int direction)
6431{
6432    if (substring->length == 0)
6433        return 1;
6434
6435    ADJUST_INDICES(start, end, self->length);
6436    end -= substring->length;
6437    if (end < start)
6438        return 0;
6439
6440    if (direction > 0) {
6441        if (Py_UNICODE_MATCH(self, end, substring))
6442            return 1;
6443    } else {
6444        if (Py_UNICODE_MATCH(self, start, substring))
6445            return 1;
6446    }
6447
6448    return 0;
6449}
6450
6451Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
6452                               PyObject *substr,
6453                               Py_ssize_t start,
6454                               Py_ssize_t end,
6455                               int direction)
6456{
6457    Py_ssize_t result;
6458
6459    str = PyUnicode_FromObject(str);
6460    if (str == NULL)
6461        return -1;
6462    substr = PyUnicode_FromObject(substr);
6463    if (substr == NULL) {
6464        Py_DECREF(str);
6465        return -1;
6466    }
6467
6468    result = tailmatch((PyUnicodeObject *)str,
6469                       (PyUnicodeObject *)substr,
6470                       start, end, direction);
6471    Py_DECREF(str);
6472    Py_DECREF(substr);
6473    return result;
6474}
6475
6476/* Apply fixfct filter to the Unicode object self and return a
6477   reference to the modified object */
6478
6479static
6480PyObject *fixup(PyUnicodeObject *self,
6481                int (*fixfct)(PyUnicodeObject *s))
6482{
6483
6484    PyUnicodeObject *u;
6485
6486    u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6487    if (u == NULL)
6488        return NULL;
6489
6490    Py_UNICODE_COPY(u->str, self->str, self->length);
6491
6492    if (!fixfct(u) && PyUnicode_CheckExact(self)) {
6493        /* fixfct should return TRUE if it modified the buffer. If
6494           FALSE, return a reference to the original buffer instead
6495           (to save space, not time) */
6496        Py_INCREF(self);
6497        Py_DECREF(u);
6498        return (PyObject*) self;
6499    }
6500    return (PyObject*) u;
6501}
6502
6503static
6504int fixupper(PyUnicodeObject *self)
6505{
6506    Py_ssize_t len = self->length;
6507    Py_UNICODE *s = self->str;
6508    int status = 0;
6509
6510    while (len-- > 0) {
6511        register Py_UNICODE ch;
6512
6513        ch = Py_UNICODE_TOUPPER(*s);
6514        if (ch != *s) {
6515            status = 1;
6516            *s = ch;
6517        }
6518        s++;
6519    }
6520
6521    return status;
6522}
6523
6524static
6525int fixlower(PyUnicodeObject *self)
6526{
6527    Py_ssize_t len = self->length;
6528    Py_UNICODE *s = self->str;
6529    int status = 0;
6530
6531    while (len-- > 0) {
6532        register Py_UNICODE ch;
6533
6534        ch = Py_UNICODE_TOLOWER(*s);
6535        if (ch != *s) {
6536            status = 1;
6537            *s = ch;
6538        }
6539        s++;
6540    }
6541
6542    return status;
6543}
6544
6545static
6546int fixswapcase(PyUnicodeObject *self)
6547{
6548    Py_ssize_t len = self->length;
6549    Py_UNICODE *s = self->str;
6550    int status = 0;
6551
6552    while (len-- > 0) {
6553        if (Py_UNICODE_ISUPPER(*s)) {
6554            *s = Py_UNICODE_TOLOWER(*s);
6555            status = 1;
6556        } else if (Py_UNICODE_ISLOWER(*s)) {
6557            *s = Py_UNICODE_TOUPPER(*s);
6558            status = 1;
6559        }
6560        s++;
6561    }
6562
6563    return status;
6564}
6565
6566static
6567int fixcapitalize(PyUnicodeObject *self)
6568{
6569    Py_ssize_t len = self->length;
6570    Py_UNICODE *s = self->str;
6571    int status = 0;
6572
6573    if (len == 0)
6574        return 0;
6575    if (Py_UNICODE_ISLOWER(*s)) {
6576        *s = Py_UNICODE_TOUPPER(*s);
6577        status = 1;
6578    }
6579    s++;
6580    while (--len > 0) {
6581        if (Py_UNICODE_ISUPPER(*s)) {
6582            *s = Py_UNICODE_TOLOWER(*s);
6583            status = 1;
6584        }
6585        s++;
6586    }
6587    return status;
6588}
6589
6590static
6591int fixtitle(PyUnicodeObject *self)
6592{
6593    register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6594    register Py_UNICODE *e;
6595    int previous_is_cased;
6596
6597    /* Shortcut for single character strings */
6598    if (PyUnicode_GET_SIZE(self) == 1) {
6599        Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6600        if (*p != ch) {
6601            *p = ch;
6602            return 1;
6603        }
6604        else
6605            return 0;
6606    }
6607
6608    e = p + PyUnicode_GET_SIZE(self);
6609    previous_is_cased = 0;
6610    for (; p < e; p++) {
6611        register const Py_UNICODE ch = *p;
6612
6613        if (previous_is_cased)
6614            *p = Py_UNICODE_TOLOWER(ch);
6615        else
6616            *p = Py_UNICODE_TOTITLE(ch);
6617
6618        if (Py_UNICODE_ISLOWER(ch) ||
6619            Py_UNICODE_ISUPPER(ch) ||
6620            Py_UNICODE_ISTITLE(ch))
6621            previous_is_cased = 1;
6622        else
6623            previous_is_cased = 0;
6624    }
6625    return 1;
6626}
6627
6628PyObject *
6629PyUnicode_Join(PyObject *separator, PyObject *seq)
6630{
6631    const Py_UNICODE blank = ' ';
6632    const Py_UNICODE *sep = &blank;
6633    Py_ssize_t seplen = 1;
6634    PyUnicodeObject *res = NULL; /* the result */
6635    Py_UNICODE *res_p;       /* pointer to free byte in res's string area */
6636    PyObject *fseq;          /* PySequence_Fast(seq) */
6637    Py_ssize_t seqlen;       /* len(fseq) -- number of items in sequence */
6638    PyObject **items;
6639    PyObject *item;
6640    Py_ssize_t sz, i;
6641
6642    fseq = PySequence_Fast(seq, "");
6643    if (fseq == NULL) {
6644        return NULL;
6645    }
6646
6647    /* NOTE: the following code can't call back into Python code,
6648     * so we are sure that fseq won't be mutated.
6649     */
6650
6651    seqlen = PySequence_Fast_GET_SIZE(fseq);
6652    /* If empty sequence, return u"". */
6653    if (seqlen == 0) {
6654        res = _PyUnicode_New(0);  /* empty sequence; return u"" */
6655        goto Done;
6656    }
6657    items = PySequence_Fast_ITEMS(fseq);
6658    /* If singleton sequence with an exact Unicode, return that. */
6659    if (seqlen == 1) {
6660        item = items[0];
6661        if (PyUnicode_CheckExact(item)) {
6662            Py_INCREF(item);
6663            res = (PyUnicodeObject *)item;
6664            goto Done;
6665        }
6666    }
6667    else {
6668        /* Set up sep and seplen */
6669        if (separator == NULL) {
6670            sep = &blank;
6671            seplen = 1;
6672        }
6673        else {
6674            if (!PyUnicode_Check(separator)) {
6675                PyErr_Format(PyExc_TypeError,
6676                             "separator: expected str instance,"
6677                             " %.80s found",
6678                             Py_TYPE(separator)->tp_name);
6679                goto onError;
6680            }
6681            sep = PyUnicode_AS_UNICODE(separator);
6682            seplen = PyUnicode_GET_SIZE(separator);
6683        }
6684    }
6685
6686    /* There are at least two things to join, or else we have a subclass
6687     * of str in the sequence.
6688     * Do a pre-pass to figure out the total amount of space we'll
6689     * need (sz), and see whether all argument are strings.
6690     */
6691    sz = 0;
6692    for (i = 0; i < seqlen; i++) {
6693        const Py_ssize_t old_sz = sz;
6694        item = items[i];
6695        if (!PyUnicode_Check(item)) {
6696            PyErr_Format(PyExc_TypeError,
6697                         "sequence item %zd: expected str instance,"
6698                         " %.80s found",
6699                         i, Py_TYPE(item)->tp_name);
6700            goto onError;
6701        }
6702        sz += PyUnicode_GET_SIZE(item);
6703        if (i != 0)
6704            sz += seplen;
6705        if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6706            PyErr_SetString(PyExc_OverflowError,
6707                            "join() result is too long for a Python string");
6708            goto onError;
6709        }
6710    }
6711
6712    res = _PyUnicode_New(sz);
6713    if (res == NULL)
6714        goto onError;
6715
6716    /* Catenate everything. */
6717    res_p = PyUnicode_AS_UNICODE(res);
6718    for (i = 0; i < seqlen; ++i) {
6719        Py_ssize_t itemlen;
6720        item = items[i];
6721        itemlen = PyUnicode_GET_SIZE(item);
6722        /* Copy item, and maybe the separator. */
6723        if (i) {
6724            Py_UNICODE_COPY(res_p, sep, seplen);
6725            res_p += seplen;
6726        }
6727        Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6728        res_p += itemlen;
6729    }
6730
6731  Done:
6732    Py_DECREF(fseq);
6733    return (PyObject *)res;
6734
6735  onError:
6736    Py_DECREF(fseq);
6737    Py_XDECREF(res);
6738    return NULL;
6739}
6740
6741static
6742PyUnicodeObject *pad(PyUnicodeObject *self,
6743                     Py_ssize_t left,
6744                     Py_ssize_t right,
6745                     Py_UNICODE fill)
6746{
6747    PyUnicodeObject *u;
6748
6749    if (left < 0)
6750        left = 0;
6751    if (right < 0)
6752        right = 0;
6753
6754    if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
6755        Py_INCREF(self);
6756        return self;
6757    }
6758
6759    if (left > PY_SSIZE_T_MAX - self->length ||
6760        right > PY_SSIZE_T_MAX - (left + self->length)) {
6761        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6762        return NULL;
6763    }
6764    u = _PyUnicode_New(left + self->length + right);
6765    if (u) {
6766        if (left)
6767            Py_UNICODE_FILL(u->str, fill, left);
6768        Py_UNICODE_COPY(u->str + left, self->str, self->length);
6769        if (right)
6770            Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6771    }
6772
6773    return u;
6774}
6775
6776PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
6777{
6778    PyObject *list;
6779
6780    string = PyUnicode_FromObject(string);
6781    if (string == NULL)
6782        return NULL;
6783
6784    list = stringlib_splitlines(
6785        (PyObject*) string, PyUnicode_AS_UNICODE(string),
6786        PyUnicode_GET_SIZE(string), keepends);
6787
6788    Py_DECREF(string);
6789    return list;
6790}
6791
6792static
6793PyObject *split(PyUnicodeObject *self,
6794                PyUnicodeObject *substring,
6795                Py_ssize_t maxcount)
6796{
6797    if (maxcount < 0)
6798        maxcount = PY_SSIZE_T_MAX;
6799
6800    if (substring == NULL)
6801        return stringlib_split_whitespace(
6802            (PyObject*) self,  self->str, self->length, maxcount
6803            );
6804
6805    return stringlib_split(
6806        (PyObject*) self,  self->str, self->length,
6807        substring->str, substring->length,
6808        maxcount
6809        );
6810}
6811
6812static
6813PyObject *rsplit(PyUnicodeObject *self,
6814                 PyUnicodeObject *substring,
6815                 Py_ssize_t maxcount)
6816{
6817    if (maxcount < 0)
6818        maxcount = PY_SSIZE_T_MAX;
6819
6820    if (substring == NULL)
6821        return stringlib_rsplit_whitespace(
6822            (PyObject*) self,  self->str, self->length, maxcount
6823            );
6824
6825    return stringlib_rsplit(
6826        (PyObject*) self,  self->str, self->length,
6827        substring->str, substring->length,
6828        maxcount
6829        );
6830}
6831
6832static
6833PyObject *replace(PyUnicodeObject *self,
6834                  PyUnicodeObject *str1,
6835                  PyUnicodeObject *str2,
6836                  Py_ssize_t maxcount)
6837{
6838    PyUnicodeObject *u;
6839
6840    if (maxcount < 0)
6841        maxcount = PY_SSIZE_T_MAX;
6842    else if (maxcount == 0 || self->length == 0)
6843        goto nothing;
6844
6845    if (str1->length == str2->length) {
6846        Py_ssize_t i;
6847        /* same length */
6848        if (str1->length == 0)
6849            goto nothing;
6850        if (str1->length == 1) {
6851            /* replace characters */
6852            Py_UNICODE u1, u2;
6853            if (!findchar(self->str, self->length, str1->str[0]))
6854                goto nothing;
6855            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6856            if (!u)
6857                return NULL;
6858            Py_UNICODE_COPY(u->str, self->str, self->length);
6859            u1 = str1->str[0];
6860            u2 = str2->str[0];
6861            for (i = 0; i < u->length; i++)
6862                if (u->str[i] == u1) {
6863                    if (--maxcount < 0)
6864                        break;
6865                    u->str[i] = u2;
6866                }
6867        } else {
6868            i = stringlib_find(
6869                self->str, self->length, str1->str, str1->length, 0
6870                );
6871            if (i < 0)
6872                goto nothing;
6873            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6874            if (!u)
6875                return NULL;
6876            Py_UNICODE_COPY(u->str, self->str, self->length);
6877
6878            /* change everything in-place, starting with this one */
6879            Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6880            i += str1->length;
6881
6882            while ( --maxcount > 0) {
6883                i = stringlib_find(self->str+i, self->length-i,
6884                                   str1->str, str1->length,
6885                                   i);
6886                if (i == -1)
6887                    break;
6888                Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6889                i += str1->length;
6890            }
6891        }
6892    } else {
6893
6894        Py_ssize_t n, i, j, e;
6895        Py_ssize_t product, new_size, delta;
6896        Py_UNICODE *p;
6897
6898        /* replace strings */
6899        n = stringlib_count(self->str, self->length, str1->str, str1->length,
6900                            maxcount);
6901        if (n == 0)
6902            goto nothing;
6903        /* new_size = self->length + n * (str2->length - str1->length)); */
6904        delta = (str2->length - str1->length);
6905        if (delta == 0) {
6906            new_size = self->length;
6907        } else {
6908            product = n * (str2->length - str1->length);
6909            if ((product / (str2->length - str1->length)) != n) {
6910                PyErr_SetString(PyExc_OverflowError,
6911                                "replace string is too long");
6912                return NULL;
6913            }
6914            new_size = self->length + product;
6915            if (new_size < 0) {
6916                PyErr_SetString(PyExc_OverflowError,
6917                                "replace string is too long");
6918                return NULL;
6919            }
6920        }
6921        u = _PyUnicode_New(new_size);
6922        if (!u)
6923            return NULL;
6924        i = 0;
6925        p = u->str;
6926        e = self->length - str1->length;
6927        if (str1->length > 0) {
6928            while (n-- > 0) {
6929                /* look for next match */
6930                j = stringlib_find(self->str+i, self->length-i,
6931                                   str1->str, str1->length,
6932                                   i);
6933                if (j == -1)
6934                    break;
6935                else if (j > i) {
6936                    /* copy unchanged part [i:j] */
6937                    Py_UNICODE_COPY(p, self->str+i, j-i);
6938                    p += j - i;
6939                }
6940                /* copy substitution string */
6941                if (str2->length > 0) {
6942                    Py_UNICODE_COPY(p, str2->str, str2->length);
6943                    p += str2->length;
6944                }
6945                i = j + str1->length;
6946            }
6947            if (i < self->length)
6948                /* copy tail [i:] */
6949                Py_UNICODE_COPY(p, self->str+i, self->length-i);
6950        } else {
6951            /* interleave */
6952            while (n > 0) {
6953                Py_UNICODE_COPY(p, str2->str, str2->length);
6954                p += str2->length;
6955                if (--n <= 0)
6956                    break;
6957                *p++ = self->str[i++];
6958            }
6959            Py_UNICODE_COPY(p, self->str+i, self->length-i);
6960        }
6961    }
6962    return (PyObject *) u;
6963
6964  nothing:
6965    /* nothing to replace; return original string (when possible) */
6966    if (PyUnicode_CheckExact(self)) {
6967        Py_INCREF(self);
6968        return (PyObject *) self;
6969    }
6970    return PyUnicode_FromUnicode(self->str, self->length);
6971}
6972
6973/* --- Unicode Object Methods --------------------------------------------- */
6974
6975PyDoc_STRVAR(title__doc__,
6976             "S.title() -> str\n\
6977\n\
6978Return a titlecased version of S, i.e. words start with title case\n\
6979characters, all remaining cased characters have lower case.");
6980
6981static PyObject*
6982unicode_title(PyUnicodeObject *self)
6983{
6984    return fixup(self, fixtitle);
6985}
6986
6987PyDoc_STRVAR(capitalize__doc__,
6988             "S.capitalize() -> str\n\
6989\n\
6990Return a capitalized version of S, i.e. make the first character\n\
6991have upper case and the rest lower case.");
6992
6993static PyObject*
6994unicode_capitalize(PyUnicodeObject *self)
6995{
6996    return fixup(self, fixcapitalize);
6997}
6998
#if 0
/* Disabled code: everything up to the matching #endif is compiled out. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self on whitespace, capitalize each word through fixup() and
   join the words again with the default separator of PyUnicode_Join().
   Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entries one by one */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *word = PyList_GET_ITEM(words, idx);
        PyObject *capitalized = fixup((PyUnicodeObject *)word,
                                      fixcapitalize);
        if (capitalized == NULL)
            goto done;
        Py_DECREF(word);
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return result;
}
#endif
7036
7037/* Argument converter.  Coerces to a single unicode character */
7038
7039static int
7040convert_uc(PyObject *obj, void *addr)
7041{
7042    Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
7043    PyObject *uniobj;
7044    Py_UNICODE *unistr;
7045
7046    uniobj = PyUnicode_FromObject(obj);
7047    if (uniobj == NULL) {
7048        PyErr_SetString(PyExc_TypeError,
7049                        "The fill character cannot be converted to Unicode");
7050        return 0;
7051    }
7052    if (PyUnicode_GET_SIZE(uniobj) != 1) {
7053        PyErr_SetString(PyExc_TypeError,
7054                        "The fill character must be exactly one character long");
7055        Py_DECREF(uniobj);
7056        return 0;
7057    }
7058    unistr = PyUnicode_AS_UNICODE(uniobj);
7059    *fillcharloc = unistr[0];
7060    Py_DECREF(uniobj);
7061    return 1;
7062}
7063
7064PyDoc_STRVAR(center__doc__,
7065             "S.center(width[, fillchar]) -> str\n\
7066\n\
7067Return S centered in a string of length width. Padding is\n\
7068done using the specified fill character (default is a space)");
7069
7070static PyObject *
7071unicode_center(PyUnicodeObject *self, PyObject *args)
7072{
7073    Py_ssize_t marg, left;
7074    Py_ssize_t width;
7075    Py_UNICODE fillchar = ' ';
7076
7077    if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
7078        return NULL;
7079
7080    if (self->length >= width && PyUnicode_CheckExact(self)) {
7081        Py_INCREF(self);
7082        return (PyObject*) self;
7083    }
7084
7085    marg = width - self->length;
7086    left = marg / 2 + (marg & width & 1);
7087
7088    return (PyObject*) pad(self, left, marg - left, fillchar);
7089}
7090
7091#if 0
7092
7093/* This code should go into some future Unicode collation support
7094   module. The basic comparison should compare ordinals on a naive
7095   basis (this is what Java does and thus Jython too). */
7096
7097/* speedy UTF-16 code point order comparison */
7098/* gleaned from: */
7099/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
7100
7101static short utf16Fixup[32] =
7102{
7103    0, 0, 0, 0, 0, 0, 0, 0,
7104    0, 0, 0, 0, 0, 0, 0, 0,
7105    0, 0, 0, 0, 0, 0, 0, 0,
7106    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
7107};
7108
7109static int
7110unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7111{
7112    Py_ssize_t len1, len2;
7113
7114    Py_UNICODE *s1 = str1->str;
7115    Py_UNICODE *s2 = str2->str;
7116
7117    len1 = str1->length;
7118    len2 = str2->length;
7119
7120    while (len1 > 0 && len2 > 0) {
7121        Py_UNICODE c1, c2;
7122
7123        c1 = *s1++;
7124        c2 = *s2++;
7125
7126        if (c1 > (1<<11) * 26)
7127            c1 += utf16Fixup[c1>>11];
7128        if (c2 > (1<<11) * 26)
7129            c2 += utf16Fixup[c2>>11];
7130        /* now c1 and c2 are in UTF-32-compatible order */
7131
7132        if (c1 != c2)
7133            return (c1 < c2) ? -1 : 1;
7134
7135        len1--; len2--;
7136    }
7137
7138    return (len1 < len2) ? -1 : (len1 != len2);
7139}
7140
7141#else
7142
7143static int
7144unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7145{
7146    register Py_ssize_t len1, len2;
7147
7148    Py_UNICODE *s1 = str1->str;
7149    Py_UNICODE *s2 = str2->str;
7150
7151    len1 = str1->length;
7152    len2 = str2->length;
7153
7154    while (len1 > 0 && len2 > 0) {
7155        Py_UNICODE c1, c2;
7156
7157        c1 = *s1++;
7158        c2 = *s2++;
7159
7160        if (c1 != c2)
7161            return (c1 < c2) ? -1 : 1;
7162
7163        len1--; len2--;
7164    }
7165
7166    return (len1 < len2) ? -1 : (len1 != len2);
7167}
7168
7169#endif
7170
7171int PyUnicode_Compare(PyObject *left,
7172                      PyObject *right)
7173{
7174    if (PyUnicode_Check(left) && PyUnicode_Check(right))
7175        return unicode_compare((PyUnicodeObject *)left,
7176                               (PyUnicodeObject *)right);
7177    PyErr_Format(PyExc_TypeError,
7178                 "Can't compare %.100s and %.100s",
7179                 left->ob_type->tp_name,
7180                 right->ob_type->tp_name);
7181    return -1;
7182}
7183
7184int
7185PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
7186{
7187    int i;
7188    Py_UNICODE *id;
7189    assert(PyUnicode_Check(uni));
7190    id = PyUnicode_AS_UNICODE(uni);
7191    /* Compare Unicode string and source character set string */
7192    for (i = 0; id[i] && str[i]; i++)
7193        if (id[i] != str[i])
7194            return ((int)id[i] < (int)str[i]) ? -1 : 1;
7195    /* This check keeps Python strings that end in '\0' from comparing equal
7196     to C strings identical up to that point. */
7197    if (PyUnicode_GET_SIZE(uni) != i || id[i])
7198        return 1; /* uni is longer */
7199    if (str[i])
7200        return -1; /* str is longer */
7201    return 0;
7202}
7203
7204
7205#define TEST_COND(cond)                         \
7206    ((cond) ? Py_True : Py_False)
7207
7208PyObject *PyUnicode_RichCompare(PyObject *left,
7209                                PyObject *right,
7210                                int op)
7211{
7212    int result;
7213
7214    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
7215        PyObject *v;
7216        if (((PyUnicodeObject *) left)->length !=
7217            ((PyUnicodeObject *) right)->length) {
7218            if (op == Py_EQ) {
7219                Py_INCREF(Py_False);
7220                return Py_False;
7221            }
7222            if (op == Py_NE) {
7223                Py_INCREF(Py_True);
7224                return Py_True;
7225            }
7226        }
7227        if (left == right)
7228            result = 0;
7229        else
7230            result = unicode_compare((PyUnicodeObject *)left,
7231                                     (PyUnicodeObject *)right);
7232
7233        /* Convert the return value to a Boolean */
7234        switch (op) {
7235        case Py_EQ:
7236            v = TEST_COND(result == 0);
7237            break;
7238        case Py_NE:
7239            v = TEST_COND(result != 0);
7240            break;
7241        case Py_LE:
7242            v = TEST_COND(result <= 0);
7243            break;
7244        case Py_GE:
7245            v = TEST_COND(result >= 0);
7246            break;
7247        case Py_LT:
7248            v = TEST_COND(result == -1);
7249            break;
7250        case Py_GT:
7251            v = TEST_COND(result == 1);
7252            break;
7253        default:
7254            PyErr_BadArgument();
7255            return NULL;
7256        }
7257        Py_INCREF(v);
7258        return v;
7259    }
7260
7261    Py_INCREF(Py_NotImplemented);
7262    return Py_NotImplemented;
7263}
7264
7265int PyUnicode_Contains(PyObject *container,
7266                       PyObject *element)
7267{
7268    PyObject *str, *sub;
7269    int result;
7270
7271    /* Coerce the two arguments */
7272    sub = PyUnicode_FromObject(element);
7273    if (!sub) {
7274        PyErr_Format(PyExc_TypeError,
7275                     "'in <string>' requires string as left operand, not %s",
7276                     element->ob_type->tp_name);
7277        return -1;
7278    }
7279
7280    str = PyUnicode_FromObject(container);
7281    if (!str) {
7282        Py_DECREF(sub);
7283        return -1;
7284    }
7285
7286    result = stringlib_contains_obj(str, sub);
7287
7288    Py_DECREF(str);
7289    Py_DECREF(sub);
7290
7291    return result;
7292}
7293
7294/* Concat to string or Unicode object giving a new Unicode object. */
7295
7296PyObject *PyUnicode_Concat(PyObject *left,
7297                           PyObject *right)
7298{
7299    PyUnicodeObject *u = NULL, *v = NULL, *w;
7300
7301    /* Coerce the two arguments */
7302    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7303    if (u == NULL)
7304        goto onError;
7305    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7306    if (v == NULL)
7307        goto onError;
7308
7309    /* Shortcuts */
7310    if (v == unicode_empty) {
7311        Py_DECREF(v);
7312        return (PyObject *)u;
7313    }
7314    if (u == unicode_empty) {
7315        Py_DECREF(u);
7316        return (PyObject *)v;
7317    }
7318
7319    /* Concat the two Unicode strings */
7320    w = _PyUnicode_New(u->length + v->length);
7321    if (w == NULL)
7322        goto onError;
7323    Py_UNICODE_COPY(w->str, u->str, u->length);
7324    Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7325
7326    Py_DECREF(u);
7327    Py_DECREF(v);
7328    return (PyObject *)w;
7329
7330  onError:
7331    Py_XDECREF(u);
7332    Py_XDECREF(v);
7333    return NULL;
7334}
7335
7336void
7337PyUnicode_Append(PyObject **pleft, PyObject *right)
7338{
7339    PyObject *new;
7340    if (*pleft == NULL)
7341        return;
7342    if (right == NULL || !PyUnicode_Check(*pleft)) {
7343        Py_DECREF(*pleft);
7344        *pleft = NULL;
7345        return;
7346    }
7347    new = PyUnicode_Concat(*pleft, right);
7348    Py_DECREF(*pleft);
7349    *pleft = new;
7350}
7351
7352void
7353PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7354{
7355    PyUnicode_Append(pleft, right);
7356    Py_XDECREF(right);
7357}
7358
7359PyDoc_STRVAR(count__doc__,
7360             "S.count(sub[, start[, end]]) -> int\n\
7361\n\
7362Return the number of non-overlapping occurrences of substring sub in\n\
7363string S[start:end].  Optional arguments start and end are\n\
7364interpreted as in slice notation.");
7365
7366static PyObject *
7367unicode_count(PyUnicodeObject *self, PyObject *args)
7368{
7369    PyUnicodeObject *substring;
7370    Py_ssize_t start = 0;
7371    Py_ssize_t end = PY_SSIZE_T_MAX;
7372    PyObject *result;
7373
7374    if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
7375                          _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7376        return NULL;
7377
7378    substring = (PyUnicodeObject *)PyUnicode_FromObject(
7379        (PyObject *)substring);
7380    if (substring == NULL)
7381        return NULL;
7382
7383    ADJUST_INDICES(start, end, self->length);
7384    result = PyLong_FromSsize_t(
7385        stringlib_count(self->str + start, end - start,
7386                        substring->str, substring->length,
7387                        PY_SSIZE_T_MAX)
7388        );
7389
7390    Py_DECREF(substring);
7391
7392    return result;
7393}
7394
7395PyDoc_STRVAR(encode__doc__,
7396             "S.encode([encoding[, errors]]) -> bytes\n\
7397\n\
7398Encode S using the codec registered for encoding. encoding defaults\n\
7399to the default encoding. errors may be given to set a different error\n\
7400handling scheme. Default is 'strict' meaning that encoding errors raise\n\
7401a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7402'xmlcharrefreplace' as well as any other name registered with\n\
7403codecs.register_error that can handle UnicodeEncodeErrors.");
7404
7405static PyObject *
7406unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
7407{
7408    static char *kwlist[] = {"encoding", "errors", 0};
7409    char *encoding = NULL;
7410    char *errors = NULL;
7411    PyObject *v;
7412
7413    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7414                                     kwlist, &encoding, &errors))
7415        return NULL;
7416    v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
7417    if (v == NULL)
7418        goto onError;
7419    if (!PyBytes_Check(v)) {
7420        PyErr_Format(PyExc_TypeError,
7421                     "encoder did not return a bytes object "
7422                     "(type=%.400s)",
7423                     Py_TYPE(v)->tp_name);
7424        Py_DECREF(v);
7425        return NULL;
7426    }
7427    return v;
7428
7429  onError:
7430    return NULL;
7431}
7432
7433PyDoc_STRVAR(expandtabs__doc__,
7434             "S.expandtabs([tabsize]) -> str\n\
7435\n\
7436Return a copy of S where all tab characters are expanded using spaces.\n\
7437If tabsize is not given, a tab size of 8 characters is assumed.");
7438
7439static PyObject*
7440unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7441{
7442    Py_UNICODE *e;
7443    Py_UNICODE *p;
7444    Py_UNICODE *q;
7445    Py_UNICODE *qe;
7446    Py_ssize_t i, j, incr;
7447    PyUnicodeObject *u;
7448    int tabsize = 8;
7449
7450    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
7451        return NULL;
7452
7453    /* First pass: determine size of output string */
7454    i = 0; /* chars up to and including most recent \n or \r */
7455    j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7456    e = self->str + self->length; /* end of input */
7457    for (p = self->str; p < e; p++)
7458        if (*p == '\t') {
7459            if (tabsize > 0) {
7460                incr = tabsize - (j % tabsize); /* cannot overflow */
7461                if (j > PY_SSIZE_T_MAX - incr)
7462                    goto overflow1;
7463                j += incr;
7464            }
7465        }
7466        else {
7467            if (j > PY_SSIZE_T_MAX - 1)
7468                goto overflow1;
7469            j++;
7470            if (*p == '\n' || *p == '\r') {
7471                if (i > PY_SSIZE_T_MAX - j)
7472                    goto overflow1;
7473                i += j;
7474                j = 0;
7475            }
7476        }
7477
7478    if (i > PY_SSIZE_T_MAX - j)
7479        goto overflow1;
7480
7481    /* Second pass: create output string and fill it */
7482    u = _PyUnicode_New(i + j);
7483    if (!u)
7484        return NULL;
7485
7486    j = 0; /* same as in first pass */
7487    q = u->str; /* next output char */
7488    qe = u->str + u->length; /* end of output */
7489
7490    for (p = self->str; p < e; p++)
7491        if (*p == '\t') {
7492            if (tabsize > 0) {
7493                i = tabsize - (j % tabsize);
7494                j += i;
7495                while (i--) {
7496                    if (q >= qe)
7497                        goto overflow2;
7498                    *q++ = ' ';
7499                }
7500            }
7501        }
7502        else {
7503            if (q >= qe)
7504                goto overflow2;
7505            *q++ = *p;
7506            j++;
7507            if (*p == '\n' || *p == '\r')
7508                j = 0;
7509        }
7510
7511    return (PyObject*) u;
7512
7513  overflow2:
7514    Py_DECREF(u);
7515  overflow1:
7516    PyErr_SetString(PyExc_OverflowError, "new string is too long");
7517    return NULL;
7518}
7519
7520PyDoc_STRVAR(find__doc__,
7521             "S.find(sub[, start[, end]]) -> int\n\
7522\n\
7523Return the lowest index in S where substring sub is found,\n\
7524such that sub is contained within s[start:end].  Optional\n\
7525arguments start and end are interpreted as in slice notation.\n\
7526\n\
7527Return -1 on failure.");
7528
7529static PyObject *
7530unicode_find(PyUnicodeObject *self, PyObject *args)
7531{
7532    PyObject *substring;
7533    Py_ssize_t start;
7534    Py_ssize_t end;
7535    Py_ssize_t result;
7536
7537    if (!_ParseTupleFinds(args, &substring, &start, &end))
7538        return NULL;
7539
7540    result = stringlib_find_slice(
7541        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7542        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7543        start, end
7544        );
7545
7546    Py_DECREF(substring);
7547
7548    return PyLong_FromSsize_t(result);
7549}
7550
7551static PyObject *
7552unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
7553{
7554    if (index < 0 || index >= self->length) {
7555        PyErr_SetString(PyExc_IndexError, "string index out of range");
7556        return NULL;
7557    }
7558
7559    return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7560}
7561
7562/* Believe it or not, this produces the same value for ASCII strings
7563   as string_hash(). */
7564static Py_hash_t
7565unicode_hash(PyUnicodeObject *self)
7566{
7567    Py_ssize_t len;
7568    Py_UNICODE *p;
7569    Py_hash_t x;
7570
7571    if (self->hash != -1)
7572        return self->hash;
7573    len = Py_SIZE(self);
7574    p = self->str;
7575    x = *p << 7;
7576    while (--len >= 0)
7577        x = (1000003*x) ^ *p++;
7578    x ^= Py_SIZE(self);
7579    if (x == -1)
7580        x = -2;
7581    self->hash = x;
7582    return x;
7583}
7584
7585PyDoc_STRVAR(index__doc__,
7586             "S.index(sub[, start[, end]]) -> int\n\
7587\n\
7588Like S.find() but raise ValueError when the substring is not found.");
7589
7590static PyObject *
7591unicode_index(PyUnicodeObject *self, PyObject *args)
7592{
7593    Py_ssize_t result;
7594    PyObject *substring;
7595    Py_ssize_t start;
7596    Py_ssize_t end;
7597
7598    if (!_ParseTupleFinds(args, &substring, &start, &end))
7599        return NULL;
7600
7601    result = stringlib_find_slice(
7602        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7603        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7604        start, end
7605        );
7606
7607    Py_DECREF(substring);
7608
7609    if (result < 0) {
7610        PyErr_SetString(PyExc_ValueError, "substring not found");
7611        return NULL;
7612    }
7613
7614    return PyLong_FromSsize_t(result);
7615}
7616
7617PyDoc_STRVAR(islower__doc__,
7618             "S.islower() -> bool\n\
7619\n\
7620Return True if all cased characters in S are lowercase and there is\n\
7621at least one cased character in S, False otherwise.");
7622
7623static PyObject*
7624unicode_islower(PyUnicodeObject *self)
7625{
7626    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7627    register const Py_UNICODE *e;
7628    int cased;
7629
7630    /* Shortcut for single character strings */
7631    if (PyUnicode_GET_SIZE(self) == 1)
7632        return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
7633
7634    /* Special case for empty strings */
7635    if (PyUnicode_GET_SIZE(self) == 0)
7636        return PyBool_FromLong(0);
7637
7638    e = p + PyUnicode_GET_SIZE(self);
7639    cased = 0;
7640    for (; p < e; p++) {
7641        register const Py_UNICODE ch = *p;
7642
7643        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7644            return PyBool_FromLong(0);
7645        else if (!cased && Py_UNICODE_ISLOWER(ch))
7646            cased = 1;
7647    }
7648    return PyBool_FromLong(cased);
7649}
7650
7651PyDoc_STRVAR(isupper__doc__,
7652             "S.isupper() -> bool\n\
7653\n\
7654Return True if all cased characters in S are uppercase and there is\n\
7655at least one cased character in S, False otherwise.");
7656
7657static PyObject*
7658unicode_isupper(PyUnicodeObject *self)
7659{
7660    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7661    register const Py_UNICODE *e;
7662    int cased;
7663
7664    /* Shortcut for single character strings */
7665    if (PyUnicode_GET_SIZE(self) == 1)
7666        return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
7667
7668    /* Special case for empty strings */
7669    if (PyUnicode_GET_SIZE(self) == 0)
7670        return PyBool_FromLong(0);
7671
7672    e = p + PyUnicode_GET_SIZE(self);
7673    cased = 0;
7674    for (; p < e; p++) {
7675        register const Py_UNICODE ch = *p;
7676
7677        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7678            return PyBool_FromLong(0);
7679        else if (!cased && Py_UNICODE_ISUPPER(ch))
7680            cased = 1;
7681    }
7682    return PyBool_FromLong(cased);
7683}
7684
7685PyDoc_STRVAR(istitle__doc__,
7686             "S.istitle() -> bool\n\
7687\n\
7688Return True if S is a titlecased string and there is at least one\n\
7689character in S, i.e. upper- and titlecase characters may only\n\
7690follow uncased characters and lowercase characters only cased ones.\n\
7691Return False otherwise.");
7692
7693static PyObject*
7694unicode_istitle(PyUnicodeObject *self)
7695{
7696    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7697    register const Py_UNICODE *e;
7698    int cased, previous_is_cased;
7699
7700    /* Shortcut for single character strings */
7701    if (PyUnicode_GET_SIZE(self) == 1)
7702        return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7703                               (Py_UNICODE_ISUPPER(*p) != 0));
7704
7705    /* Special case for empty strings */
7706    if (PyUnicode_GET_SIZE(self) == 0)
7707        return PyBool_FromLong(0);
7708
7709    e = p + PyUnicode_GET_SIZE(self);
7710    cased = 0;
7711    previous_is_cased = 0;
7712    for (; p < e; p++) {
7713        register const Py_UNICODE ch = *p;
7714
7715        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7716            if (previous_is_cased)
7717                return PyBool_FromLong(0);
7718            previous_is_cased = 1;
7719            cased = 1;
7720        }
7721        else if (Py_UNICODE_ISLOWER(ch)) {
7722            if (!previous_is_cased)
7723                return PyBool_FromLong(0);
7724            previous_is_cased = 1;
7725            cased = 1;
7726        }
7727        else
7728            previous_is_cased = 0;
7729    }
7730    return PyBool_FromLong(cased);
7731}
7732
7733PyDoc_STRVAR(isspace__doc__,
7734             "S.isspace() -> bool\n\
7735\n\
7736Return True if all characters in S are whitespace\n\
7737and there is at least one character in S, False otherwise.");
7738
7739static PyObject*
7740unicode_isspace(PyUnicodeObject *self)
7741{
7742    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7743    register const Py_UNICODE *e;
7744
7745    /* Shortcut for single character strings */
7746    if (PyUnicode_GET_SIZE(self) == 1 &&
7747        Py_UNICODE_ISSPACE(*p))
7748        return PyBool_FromLong(1);
7749
7750    /* Special case for empty strings */
7751    if (PyUnicode_GET_SIZE(self) == 0)
7752        return PyBool_FromLong(0);
7753
7754    e = p + PyUnicode_GET_SIZE(self);
7755    for (; p < e; p++) {
7756        if (!Py_UNICODE_ISSPACE(*p))
7757            return PyBool_FromLong(0);
7758    }
7759    return PyBool_FromLong(1);
7760}
7761
7762PyDoc_STRVAR(isalpha__doc__,
7763             "S.isalpha() -> bool\n\
7764\n\
7765Return True if all characters in S are alphabetic\n\
7766and there is at least one character in S, False otherwise.");
7767
7768static PyObject*
7769unicode_isalpha(PyUnicodeObject *self)
7770{
7771    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7772    register const Py_UNICODE *e;
7773
7774    /* Shortcut for single character strings */
7775    if (PyUnicode_GET_SIZE(self) == 1 &&
7776        Py_UNICODE_ISALPHA(*p))
7777        return PyBool_FromLong(1);
7778
7779    /* Special case for empty strings */
7780    if (PyUnicode_GET_SIZE(self) == 0)
7781        return PyBool_FromLong(0);
7782
7783    e = p + PyUnicode_GET_SIZE(self);
7784    for (; p < e; p++) {
7785        if (!Py_UNICODE_ISALPHA(*p))
7786            return PyBool_FromLong(0);
7787    }
7788    return PyBool_FromLong(1);
7789}
7790
7791PyDoc_STRVAR(isalnum__doc__,
7792             "S.isalnum() -> bool\n\
7793\n\
7794Return True if all characters in S are alphanumeric\n\
7795and there is at least one character in S, False otherwise.");
7796
7797static PyObject*
7798unicode_isalnum(PyUnicodeObject *self)
7799{
7800    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7801    register const Py_UNICODE *e;
7802
7803    /* Shortcut for single character strings */
7804    if (PyUnicode_GET_SIZE(self) == 1 &&
7805        Py_UNICODE_ISALNUM(*p))
7806        return PyBool_FromLong(1);
7807
7808    /* Special case for empty strings */
7809    if (PyUnicode_GET_SIZE(self) == 0)
7810        return PyBool_FromLong(0);
7811
7812    e = p + PyUnicode_GET_SIZE(self);
7813    for (; p < e; p++) {
7814        if (!Py_UNICODE_ISALNUM(*p))
7815            return PyBool_FromLong(0);
7816    }
7817    return PyBool_FromLong(1);
7818}
7819
7820PyDoc_STRVAR(isdecimal__doc__,
7821             "S.isdecimal() -> bool\n\
7822\n\
7823Return True if there are only decimal characters in S,\n\
7824False otherwise.");
7825
7826static PyObject*
7827unicode_isdecimal(PyUnicodeObject *self)
7828{
7829    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7830    register const Py_UNICODE *e;
7831
7832    /* Shortcut for single character strings */
7833    if (PyUnicode_GET_SIZE(self) == 1 &&
7834        Py_UNICODE_ISDECIMAL(*p))
7835        return PyBool_FromLong(1);
7836
7837    /* Special case for empty strings */
7838    if (PyUnicode_GET_SIZE(self) == 0)
7839        return PyBool_FromLong(0);
7840
7841    e = p + PyUnicode_GET_SIZE(self);
7842    for (; p < e; p++) {
7843        if (!Py_UNICODE_ISDECIMAL(*p))
7844            return PyBool_FromLong(0);
7845    }
7846    return PyBool_FromLong(1);
7847}
7848
7849PyDoc_STRVAR(isdigit__doc__,
7850             "S.isdigit() -> bool\n\
7851\n\
7852Return True if all characters in S are digits\n\
7853and there is at least one character in S, False otherwise.");
7854
7855static PyObject*
7856unicode_isdigit(PyUnicodeObject *self)
7857{
7858    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7859    register const Py_UNICODE *e;
7860
7861    /* Shortcut for single character strings */
7862    if (PyUnicode_GET_SIZE(self) == 1 &&
7863        Py_UNICODE_ISDIGIT(*p))
7864        return PyBool_FromLong(1);
7865
7866    /* Special case for empty strings */
7867    if (PyUnicode_GET_SIZE(self) == 0)
7868        return PyBool_FromLong(0);
7869
7870    e = p + PyUnicode_GET_SIZE(self);
7871    for (; p < e; p++) {
7872        if (!Py_UNICODE_ISDIGIT(*p))
7873            return PyBool_FromLong(0);
7874    }
7875    return PyBool_FromLong(1);
7876}
7877
7878PyDoc_STRVAR(isnumeric__doc__,
7879             "S.isnumeric() -> bool\n\
7880\n\
7881Return True if there are only numeric characters in S,\n\
7882False otherwise.");
7883
7884static PyObject*
7885unicode_isnumeric(PyUnicodeObject *self)
7886{
7887    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7888    register const Py_UNICODE *e;
7889
7890    /* Shortcut for single character strings */
7891    if (PyUnicode_GET_SIZE(self) == 1 &&
7892        Py_UNICODE_ISNUMERIC(*p))
7893        return PyBool_FromLong(1);
7894
7895    /* Special case for empty strings */
7896    if (PyUnicode_GET_SIZE(self) == 0)
7897        return PyBool_FromLong(0);
7898
7899    e = p + PyUnicode_GET_SIZE(self);
7900    for (; p < e; p++) {
7901        if (!Py_UNICODE_ISNUMERIC(*p))
7902            return PyBool_FromLong(0);
7903    }
7904    return PyBool_FromLong(1);
7905}
7906
7907int
7908PyUnicode_IsIdentifier(PyObject *self)
7909{
7910    register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7911    register const Py_UNICODE *e;
7912
7913    /* Special case for empty strings */
7914    if (PyUnicode_GET_SIZE(self) == 0)
7915        return 0;
7916
7917    /* PEP 3131 says that the first character must be in
7918       XID_Start and subsequent characters in XID_Continue,
7919       and for the ASCII range, the 2.x rules apply (i.e
7920       start with letters and underscore, continue with
7921       letters, digits, underscore). However, given the current
7922       definition of XID_Start and XID_Continue, it is sufficient
7923       to check just for these, except that _ must be allowed
7924       as starting an identifier.  */
7925    if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7926        return 0;
7927
7928    e = p + PyUnicode_GET_SIZE(self);
7929    for (p++; p < e; p++) {
7930        if (!_PyUnicode_IsXidContinue(*p))
7931            return 0;
7932    }
7933    return 1;
7934}
7935
7936PyDoc_STRVAR(isidentifier__doc__,
7937             "S.isidentifier() -> bool\n\
7938\n\
7939Return True if S is a valid identifier according\n\
7940to the language definition.");
7941
7942static PyObject*
7943unicode_isidentifier(PyObject *self)
7944{
7945    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7946}
7947
7948PyDoc_STRVAR(isprintable__doc__,
7949             "S.isprintable() -> bool\n\
7950\n\
7951Return True if all characters in S are considered\n\
7952printable in repr() or S is empty, False otherwise.");
7953
7954static PyObject*
7955unicode_isprintable(PyObject *self)
7956{
7957    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7958    register const Py_UNICODE *e;
7959
7960    /* Shortcut for single character strings */
7961    if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7962        Py_RETURN_TRUE;
7963    }
7964
7965    e = p + PyUnicode_GET_SIZE(self);
7966    for (; p < e; p++) {
7967        if (!Py_UNICODE_ISPRINTABLE(*p)) {
7968            Py_RETURN_FALSE;
7969        }
7970    }
7971    Py_RETURN_TRUE;
7972}
7973
7974PyDoc_STRVAR(join__doc__,
7975             "S.join(iterable) -> str\n\
7976\n\
7977Return a string which is the concatenation of the strings in the\n\
7978iterable.  The separator between elements is S.");
7979
7980static PyObject*
7981unicode_join(PyObject *self, PyObject *data)
7982{
7983    return PyUnicode_Join(self, data);
7984}
7985
7986static Py_ssize_t
7987unicode_length(PyUnicodeObject *self)
7988{
7989    return self->length;
7990}
7991
7992PyDoc_STRVAR(ljust__doc__,
7993             "S.ljust(width[, fillchar]) -> str\n\
7994\n\
7995Return S left-justified in a Unicode string of length width. Padding is\n\
7996done using the specified fill character (default is a space).");
7997
7998static PyObject *
7999unicode_ljust(PyUnicodeObject *self, PyObject *args)
8000{
8001    Py_ssize_t width;
8002    Py_UNICODE fillchar = ' ';
8003
8004    if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
8005        return NULL;
8006
8007    if (self->length >= width && PyUnicode_CheckExact(self)) {
8008        Py_INCREF(self);
8009        return (PyObject*) self;
8010    }
8011
8012    return (PyObject*) pad(self, 0, width - self->length, fillchar);
8013}
8014
8015PyDoc_STRVAR(lower__doc__,
8016             "S.lower() -> str\n\
8017\n\
8018Return a copy of the string S converted to lowercase.");
8019
8020static PyObject*
8021unicode_lower(PyUnicodeObject *self)
8022{
8023    return fixup(self, fixlower);
8024}
8025
/* Strip modes; the values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the strip modes above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for a strip mode: the format string minus its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
8034
8035/* externally visible for str.strip(unicode) */
8036PyObject *
8037_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
8038{
8039    Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8040    Py_ssize_t len = PyUnicode_GET_SIZE(self);
8041    Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
8042    Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
8043    Py_ssize_t i, j;
8044
8045    BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
8046
8047    i = 0;
8048    if (striptype != RIGHTSTRIP) {
8049        while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
8050            i++;
8051        }
8052    }
8053
8054    j = len;
8055    if (striptype != LEFTSTRIP) {
8056        do {
8057            j--;
8058        } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
8059        j++;
8060    }
8061
8062    if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
8063        Py_INCREF(self);
8064        return (PyObject*)self;
8065    }
8066    else
8067        return PyUnicode_FromUnicode(s+i, j-i);
8068}
8069
8070
8071static PyObject *
8072do_strip(PyUnicodeObject *self, int striptype)
8073{
8074    Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8075    Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
8076
8077    i = 0;
8078    if (striptype != RIGHTSTRIP) {
8079        while (i < len && Py_UNICODE_ISSPACE(s[i])) {
8080            i++;
8081        }
8082    }
8083
8084    j = len;
8085    if (striptype != LEFTSTRIP) {
8086        do {
8087            j--;
8088        } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
8089        j++;
8090    }
8091
8092    if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
8093        Py_INCREF(self);
8094        return (PyObject*)self;
8095    }
8096    else
8097        return PyUnicode_FromUnicode(s+i, j-i);
8098}
8099
8100
8101static PyObject *
8102do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
8103{
8104    PyObject *sep = NULL;
8105
8106    if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
8107        return NULL;
8108
8109    if (sep != NULL && sep != Py_None) {
8110        if (PyUnicode_Check(sep))
8111            return _PyUnicode_XStrip(self, striptype, sep);
8112        else {
8113            PyErr_Format(PyExc_TypeError,
8114                         "%s arg must be None or str",
8115                         STRIPNAME(striptype));
8116            return NULL;
8117        }
8118    }
8119
8120    return do_strip(self, striptype);
8121}
8122
8123
8124PyDoc_STRVAR(strip__doc__,
8125             "S.strip([chars]) -> str\n\
8126\n\
8127Return a copy of the string S with leading and trailing\n\
8128whitespace removed.\n\
8129If chars is given and not None, remove characters in chars instead.");
8130
8131static PyObject *
8132unicode_strip(PyUnicodeObject *self, PyObject *args)
8133{
8134    if (PyTuple_GET_SIZE(args) == 0)
8135        return do_strip(self, BOTHSTRIP); /* Common case */
8136    else
8137        return do_argstrip(self, BOTHSTRIP, args);
8138}
8139
8140
8141PyDoc_STRVAR(lstrip__doc__,
8142             "S.lstrip([chars]) -> str\n\
8143\n\
8144Return a copy of the string S with leading whitespace removed.\n\
8145If chars is given and not None, remove characters in chars instead.");
8146
8147static PyObject *
8148unicode_lstrip(PyUnicodeObject *self, PyObject *args)
8149{
8150    if (PyTuple_GET_SIZE(args) == 0)
8151        return do_strip(self, LEFTSTRIP); /* Common case */
8152    else
8153        return do_argstrip(self, LEFTSTRIP, args);
8154}
8155
8156
8157PyDoc_STRVAR(rstrip__doc__,
8158             "S.rstrip([chars]) -> str\n\
8159\n\
8160Return a copy of the string S with trailing whitespace removed.\n\
8161If chars is given and not None, remove characters in chars instead.");
8162
8163static PyObject *
8164unicode_rstrip(PyUnicodeObject *self, PyObject *args)
8165{
8166    if (PyTuple_GET_SIZE(args) == 0)
8167        return do_strip(self, RIGHTSTRIP); /* Common case */
8168    else
8169        return do_argstrip(self, RIGHTSTRIP, args);
8170}
8171
8172
8173static PyObject*
8174unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
8175{
8176    PyUnicodeObject *u;
8177    Py_UNICODE *p;
8178    Py_ssize_t nchars;
8179    size_t nbytes;
8180
8181    if (len < 1) {
8182        Py_INCREF(unicode_empty);
8183        return (PyObject *)unicode_empty;
8184    }
8185
8186    if (len == 1 && PyUnicode_CheckExact(str)) {
8187        /* no repeat, return original string */
8188        Py_INCREF(str);
8189        return (PyObject*) str;
8190    }
8191
8192    /* ensure # of chars needed doesn't overflow int and # of bytes
8193     * needed doesn't overflow size_t
8194     */
8195    nchars = len * str->length;
8196    if (nchars / len != str->length) {
8197        PyErr_SetString(PyExc_OverflowError,
8198                        "repeated string is too long");
8199        return NULL;
8200    }
8201    nbytes = (nchars + 1) * sizeof(Py_UNICODE);
8202    if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
8203        PyErr_SetString(PyExc_OverflowError,
8204                        "repeated string is too long");
8205        return NULL;
8206    }
8207    u = _PyUnicode_New(nchars);
8208    if (!u)
8209        return NULL;
8210
8211    p = u->str;
8212
8213    if (str->length == 1) {
8214        Py_UNICODE_FILL(p, str->str[0], len);
8215    } else {
8216        Py_ssize_t done = str->length; /* number of characters copied this far */
8217        Py_UNICODE_COPY(p, str->str, str->length);
8218        while (done < nchars) {
8219            Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
8220            Py_UNICODE_COPY(p+done, p, n);
8221            done += n;
8222        }
8223    }
8224
8225    return (PyObject*) u;
8226}
8227
8228PyObject *PyUnicode_Replace(PyObject *obj,
8229                            PyObject *subobj,
8230                            PyObject *replobj,
8231                            Py_ssize_t maxcount)
8232{
8233    PyObject *self;
8234    PyObject *str1;
8235    PyObject *str2;
8236    PyObject *result;
8237
8238    self = PyUnicode_FromObject(obj);
8239    if (self == NULL)
8240        return NULL;
8241    str1 = PyUnicode_FromObject(subobj);
8242    if (str1 == NULL) {
8243        Py_DECREF(self);
8244        return NULL;
8245    }
8246    str2 = PyUnicode_FromObject(replobj);
8247    if (str2 == NULL) {
8248        Py_DECREF(self);
8249        Py_DECREF(str1);
8250        return NULL;
8251    }
8252    result = replace((PyUnicodeObject *)self,
8253                     (PyUnicodeObject *)str1,
8254                     (PyUnicodeObject *)str2,
8255                     maxcount);
8256    Py_DECREF(self);
8257    Py_DECREF(str1);
8258    Py_DECREF(str2);
8259    return result;
8260}
8261
8262PyDoc_STRVAR(replace__doc__,
8263             "S.replace(old, new[, count]) -> str\n\
8264\n\
8265Return a copy of S with all occurrences of substring\n\
8266old replaced by new.  If the optional argument count is\n\
8267given, only the first count occurrences are replaced.");
8268
8269static PyObject*
8270unicode_replace(PyUnicodeObject *self, PyObject *args)
8271{
8272    PyUnicodeObject *str1;
8273    PyUnicodeObject *str2;
8274    Py_ssize_t maxcount = -1;
8275    PyObject *result;
8276
8277    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
8278        return NULL;
8279    str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8280    if (str1 == NULL)
8281        return NULL;
8282    str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
8283    if (str2 == NULL) {
8284        Py_DECREF(str1);
8285        return NULL;
8286    }
8287
8288    result = replace(self, str1, str2, maxcount);
8289
8290    Py_DECREF(str1);
8291    Py_DECREF(str2);
8292    return result;
8293}
8294
8295static
8296PyObject *unicode_repr(PyObject *unicode)
8297{
8298    PyObject *repr;
8299    Py_UNICODE *p;
8300    Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8301    Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8302
8303    /* XXX(nnorwitz): rather than over-allocating, it would be
8304       better to choose a different scheme.  Perhaps scan the
8305       first N-chars of the string and allocate based on that size.
8306    */
8307    /* Initial allocation is based on the longest-possible unichr
8308       escape.
8309
8310       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8311       unichr, so in this case it's the longest unichr escape. In
8312       narrow (UTF-16) builds this is five chars per source unichr
8313       since there are two unichrs in the surrogate pair, so in narrow
8314       (UTF-16) builds it's not the longest unichr escape.
8315
8316       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8317       so in the narrow (UTF-16) build case it's the longest unichr
8318       escape.
8319    */
8320
8321    repr = PyUnicode_FromUnicode(NULL,
8322                                 2 /* quotes */
8323#ifdef Py_UNICODE_WIDE
8324                                 + 10*size
8325#else
8326                                 + 6*size
8327#endif
8328                                 + 1);
8329    if (repr == NULL)
8330        return NULL;
8331
8332    p = PyUnicode_AS_UNICODE(repr);
8333
8334    /* Add quote */
8335    *p++ = (findchar(s, size, '\'') &&
8336            !findchar(s, size, '"')) ? '"' : '\'';
8337    while (size-- > 0) {
8338        Py_UNICODE ch = *s++;
8339
8340        /* Escape quotes and backslashes */
8341        if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
8342            *p++ = '\\';
8343            *p++ = ch;
8344            continue;
8345        }
8346
8347        /* Map special whitespace to '\t', \n', '\r' */
8348        if (ch == '\t') {
8349            *p++ = '\\';
8350            *p++ = 't';
8351        }
8352        else if (ch == '\n') {
8353            *p++ = '\\';
8354            *p++ = 'n';
8355        }
8356        else if (ch == '\r') {
8357            *p++ = '\\';
8358            *p++ = 'r';
8359        }
8360
8361        /* Map non-printable US ASCII to '\xhh' */
8362        else if (ch < ' ' || ch == 0x7F) {
8363            *p++ = '\\';
8364            *p++ = 'x';
8365            *p++ = hexdigits[(ch >> 4) & 0x000F];
8366            *p++ = hexdigits[ch & 0x000F];
8367        }
8368
8369        /* Copy ASCII characters as-is */
8370        else if (ch < 0x7F) {
8371            *p++ = ch;
8372        }
8373
8374        /* Non-ASCII characters */
8375        else {
8376            Py_UCS4 ucs = ch;
8377
8378#ifndef Py_UNICODE_WIDE
8379            Py_UNICODE ch2 = 0;
8380            /* Get code point from surrogate pair */
8381            if (size > 0) {
8382                ch2 = *s;
8383                if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
8384                    && ch2 <= 0xDFFF) {
8385                    ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
8386                        + 0x00010000;
8387                    s++;
8388                    size--;
8389                }
8390            }
8391#endif
8392            /* Map Unicode whitespace and control characters
8393               (categories Z* and C* except ASCII space)
8394            */
8395            if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8396                /* Map 8-bit characters to '\xhh' */
8397                if (ucs <= 0xff) {
8398                    *p++ = '\\';
8399                    *p++ = 'x';
8400                    *p++ = hexdigits[(ch >> 4) & 0x000F];
8401                    *p++ = hexdigits[ch & 0x000F];
8402                }
8403                /* Map 21-bit characters to '\U00xxxxxx' */
8404                else if (ucs >= 0x10000) {
8405                    *p++ = '\\';
8406                    *p++ = 'U';
8407                    *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8408                    *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8409                    *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8410                    *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8411                    *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8412                    *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8413                    *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8414                    *p++ = hexdigits[ucs & 0x0000000F];
8415                }
8416                /* Map 16-bit characters to '\uxxxx' */
8417                else {
8418                    *p++ = '\\';
8419                    *p++ = 'u';
8420                    *p++ = hexdigits[(ucs >> 12) & 0x000F];
8421                    *p++ = hexdigits[(ucs >> 8) & 0x000F];
8422                    *p++ = hexdigits[(ucs >> 4) & 0x000F];
8423                    *p++ = hexdigits[ucs & 0x000F];
8424                }
8425            }
8426            /* Copy characters as-is */
8427            else {
8428                *p++ = ch;
8429#ifndef Py_UNICODE_WIDE
8430                if (ucs >= 0x10000)
8431                    *p++ = ch2;
8432#endif
8433            }
8434        }
8435    }
8436    /* Add quote */
8437    *p++ = PyUnicode_AS_UNICODE(repr)[0];
8438
8439    *p = '\0';
8440    PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
8441    return repr;
8442}
8443
8444PyDoc_STRVAR(rfind__doc__,
8445             "S.rfind(sub[, start[, end]]) -> int\n\
8446\n\
8447Return the highest index in S where substring sub is found,\n\
8448such that sub is contained within s[start:end].  Optional\n\
8449arguments start and end are interpreted as in slice notation.\n\
8450\n\
8451Return -1 on failure.");
8452
8453static PyObject *
8454unicode_rfind(PyUnicodeObject *self, PyObject *args)
8455{
8456    PyObject *substring;
8457    Py_ssize_t start;
8458    Py_ssize_t end;
8459    Py_ssize_t result;
8460
8461    if (!_ParseTupleFinds(args, &substring, &start, &end))
8462        return NULL;
8463
8464    result = stringlib_rfind_slice(
8465        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8466        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8467        start, end
8468        );
8469
8470    Py_DECREF(substring);
8471
8472    return PyLong_FromSsize_t(result);
8473}
8474
8475PyDoc_STRVAR(rindex__doc__,
8476             "S.rindex(sub[, start[, end]]) -> int\n\
8477\n\
8478Like S.rfind() but raise ValueError when the substring is not found.");
8479
8480static PyObject *
8481unicode_rindex(PyUnicodeObject *self, PyObject *args)
8482{
8483    PyObject *substring;
8484    Py_ssize_t start;
8485    Py_ssize_t end;
8486    Py_ssize_t result;
8487
8488    if (!_ParseTupleFinds(args, &substring, &start, &end))
8489        return NULL;
8490
8491    result = stringlib_rfind_slice(
8492        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8493        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8494        start, end
8495        );
8496
8497    Py_DECREF(substring);
8498
8499    if (result < 0) {
8500        PyErr_SetString(PyExc_ValueError, "substring not found");
8501        return NULL;
8502    }
8503    return PyLong_FromSsize_t(result);
8504}
8505
8506PyDoc_STRVAR(rjust__doc__,
8507             "S.rjust(width[, fillchar]) -> str\n\
8508\n\
8509Return S right-justified in a string of length width. Padding is\n\
8510done using the specified fill character (default is a space).");
8511
8512static PyObject *
8513unicode_rjust(PyUnicodeObject *self, PyObject *args)
8514{
8515    Py_ssize_t width;
8516    Py_UNICODE fillchar = ' ';
8517
8518    if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
8519        return NULL;
8520
8521    if (self->length >= width && PyUnicode_CheckExact(self)) {
8522        Py_INCREF(self);
8523        return (PyObject*) self;
8524    }
8525
8526    return (PyObject*) pad(self, width - self->length, 0, fillchar);
8527}
8528
8529PyObject *PyUnicode_Split(PyObject *s,
8530                          PyObject *sep,
8531                          Py_ssize_t maxsplit)
8532{
8533    PyObject *result;
8534
8535    s = PyUnicode_FromObject(s);
8536    if (s == NULL)
8537        return NULL;
8538    if (sep != NULL) {
8539        sep = PyUnicode_FromObject(sep);
8540        if (sep == NULL) {
8541            Py_DECREF(s);
8542            return NULL;
8543        }
8544    }
8545
8546    result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8547
8548    Py_DECREF(s);
8549    Py_XDECREF(sep);
8550    return result;
8551}
8552
8553PyDoc_STRVAR(split__doc__,
8554             "S.split([sep[, maxsplit]]) -> list of strings\n\
8555\n\
8556Return a list of the words in S, using sep as the\n\
8557delimiter string.  If maxsplit is given, at most maxsplit\n\
8558splits are done. If sep is not specified or is None, any\n\
8559whitespace string is a separator and empty strings are\n\
8560removed from the result.");
8561
8562static PyObject*
8563unicode_split(PyUnicodeObject *self, PyObject *args)
8564{
8565    PyObject *substring = Py_None;
8566    Py_ssize_t maxcount = -1;
8567
8568    if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
8569        return NULL;
8570
8571    if (substring == Py_None)
8572        return split(self, NULL, maxcount);
8573    else if (PyUnicode_Check(substring))
8574        return split(self, (PyUnicodeObject *)substring, maxcount);
8575    else
8576        return PyUnicode_Split((PyObject *)self, substring, maxcount);
8577}
8578
8579PyObject *
8580PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8581{
8582    PyObject* str_obj;
8583    PyObject* sep_obj;
8584    PyObject* out;
8585
8586    str_obj = PyUnicode_FromObject(str_in);
8587    if (!str_obj)
8588        return NULL;
8589    sep_obj = PyUnicode_FromObject(sep_in);
8590    if (!sep_obj) {
8591        Py_DECREF(str_obj);
8592        return NULL;
8593    }
8594
8595    out = stringlib_partition(
8596        str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8597        sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8598        );
8599
8600    Py_DECREF(sep_obj);
8601    Py_DECREF(str_obj);
8602
8603    return out;
8604}
8605
8606
8607PyObject *
8608PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8609{
8610    PyObject* str_obj;
8611    PyObject* sep_obj;
8612    PyObject* out;
8613
8614    str_obj = PyUnicode_FromObject(str_in);
8615    if (!str_obj)
8616        return NULL;
8617    sep_obj = PyUnicode_FromObject(sep_in);
8618    if (!sep_obj) {
8619        Py_DECREF(str_obj);
8620        return NULL;
8621    }
8622
8623    out = stringlib_rpartition(
8624        str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8625        sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8626        );
8627
8628    Py_DECREF(sep_obj);
8629    Py_DECREF(str_obj);
8630
8631    return out;
8632}
8633
8634PyDoc_STRVAR(partition__doc__,
8635             "S.partition(sep) -> (head, sep, tail)\n\
8636\n\
8637Search for the separator sep in S, and return the part before it,\n\
8638the separator itself, and the part after it.  If the separator is not\n\
8639found, return S and two empty strings.");
8640
8641static PyObject*
8642unicode_partition(PyUnicodeObject *self, PyObject *separator)
8643{
8644    return PyUnicode_Partition((PyObject *)self, separator);
8645}
8646
8647PyDoc_STRVAR(rpartition__doc__,
8648             "S.rpartition(sep) -> (head, sep, tail)\n\
8649\n\
8650Search for the separator sep in S, starting at the end of S, and return\n\
8651the part before it, the separator itself, and the part after it.  If the\n\
8652separator is not found, return two empty strings and S.");
8653
8654static PyObject*
8655unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8656{
8657    return PyUnicode_RPartition((PyObject *)self, separator);
8658}
8659
8660PyObject *PyUnicode_RSplit(PyObject *s,
8661                           PyObject *sep,
8662                           Py_ssize_t maxsplit)
8663{
8664    PyObject *result;
8665
8666    s = PyUnicode_FromObject(s);
8667    if (s == NULL)
8668        return NULL;
8669    if (sep != NULL) {
8670        sep = PyUnicode_FromObject(sep);
8671        if (sep == NULL) {
8672            Py_DECREF(s);
8673            return NULL;
8674        }
8675    }
8676
8677    result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8678
8679    Py_DECREF(s);
8680    Py_XDECREF(sep);
8681    return result;
8682}
8683
8684PyDoc_STRVAR(rsplit__doc__,
8685             "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
8686\n\
8687Return a list of the words in S, using sep as the\n\
8688delimiter string, starting at the end of the string and\n\
8689working to the front.  If maxsplit is given, at most maxsplit\n\
8690splits are done. If sep is not specified, any whitespace string\n\
8691is a separator.");
8692
8693static PyObject*
8694unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8695{
8696    PyObject *substring = Py_None;
8697    Py_ssize_t maxcount = -1;
8698
8699    if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
8700        return NULL;
8701
8702    if (substring == Py_None)
8703        return rsplit(self, NULL, maxcount);
8704    else if (PyUnicode_Check(substring))
8705        return rsplit(self, (PyUnicodeObject *)substring, maxcount);
8706    else
8707        return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
8708}
8709
8710PyDoc_STRVAR(splitlines__doc__,
8711             "S.splitlines([keepends]) -> list of strings\n\
8712\n\
8713Return a list of the lines in S, breaking at line boundaries.\n\
8714Line breaks are not included in the resulting list unless keepends\n\
8715is given and true.");
8716
8717static PyObject*
8718unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8719{
8720    int keepends = 0;
8721
8722    if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
8723        return NULL;
8724
8725    return PyUnicode_Splitlines((PyObject *)self, keepends);
8726}
8727
8728static
8729PyObject *unicode_str(PyObject *self)
8730{
8731    if (PyUnicode_CheckExact(self)) {
8732        Py_INCREF(self);
8733        return self;
8734    } else
8735        /* Subtype -- return genuine unicode string with the same value. */
8736        return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8737                                     PyUnicode_GET_SIZE(self));
8738}
8739
8740PyDoc_STRVAR(swapcase__doc__,
8741             "S.swapcase() -> str\n\
8742\n\
8743Return a copy of S with uppercase characters converted to lowercase\n\
8744and vice versa.");
8745
8746static PyObject*
8747unicode_swapcase(PyUnicodeObject *self)
8748{
8749    return fixup(self, fixswapcase);
8750}
8751
8752PyDoc_STRVAR(maketrans__doc__,
8753             "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
8754\n\
8755Return a translation table usable for str.translate().\n\
8756If there is only one argument, it must be a dictionary mapping Unicode\n\
8757ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
8758Character keys will be then converted to ordinals.\n\
8759If there are two arguments, they must be strings of equal length, and\n\
8760in the resulting dictionary, each character in x will be mapped to the\n\
8761character at the same position in y. If there is a third argument, it\n\
8762must be a string, whose characters will be mapped to None in the result.");
8763
8764static PyObject*
8765unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8766{
8767    PyObject *x, *y = NULL, *z = NULL;
8768    PyObject *new = NULL, *key, *value;
8769    Py_ssize_t i = 0;
8770    int res;
8771
8772    if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8773        return NULL;
8774    new = PyDict_New();
8775    if (!new)
8776        return NULL;
8777    if (y != NULL) {
8778        /* x must be a string too, of equal length */
8779        Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8780        if (!PyUnicode_Check(x)) {
8781            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8782                            "be a string if there is a second argument");
8783            goto err;
8784        }
8785        if (PyUnicode_GET_SIZE(x) != ylen) {
8786            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8787                            "arguments must have equal length");
8788            goto err;
8789        }
8790        /* create entries for translating chars in x to those in y */
8791        for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
8792            key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8793            value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
8794            if (!key || !value)
8795                goto err;
8796            res = PyDict_SetItem(new, key, value);
8797            Py_DECREF(key);
8798            Py_DECREF(value);
8799            if (res < 0)
8800                goto err;
8801        }
8802        /* create entries for deleting chars in z */
8803        if (z != NULL) {
8804            for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
8805                key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
8806                if (!key)
8807                    goto err;
8808                res = PyDict_SetItem(new, key, Py_None);
8809                Py_DECREF(key);
8810                if (res < 0)
8811                    goto err;
8812            }
8813        }
8814    } else {
8815        /* x must be a dict */
8816        if (!PyDict_CheckExact(x)) {
8817            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8818                            "to maketrans it must be a dict");
8819            goto err;
8820        }
8821        /* copy entries into the new dict, converting string keys to int keys */
8822        while (PyDict_Next(x, &i, &key, &value)) {
8823            if (PyUnicode_Check(key)) {
8824                /* convert string keys to integer keys */
8825                PyObject *newkey;
8826                if (PyUnicode_GET_SIZE(key) != 1) {
8827                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
8828                                    "table must be of length 1");
8829                    goto err;
8830                }
8831                newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
8832                if (!newkey)
8833                    goto err;
8834                res = PyDict_SetItem(new, newkey, value);
8835                Py_DECREF(newkey);
8836                if (res < 0)
8837                    goto err;
8838            } else if (PyLong_Check(key)) {
8839                /* just keep integer keys */
8840                if (PyDict_SetItem(new, key, value) < 0)
8841                    goto err;
8842            } else {
8843                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8844                                "be strings or integers");
8845                goto err;
8846            }
8847        }
8848    }
8849    return new;
8850  err:
8851    Py_DECREF(new);
8852    return NULL;
8853}
8854
8855PyDoc_STRVAR(translate__doc__,
8856             "S.translate(table) -> str\n\
8857\n\
8858Return a copy of the string S, where all characters have been mapped\n\
8859through the given translation table, which must be a mapping of\n\
8860Unicode ordinals to Unicode ordinals, strings, or None.\n\
8861Unmapped characters are left untouched. Characters mapped to None\n\
8862are deleted.");
8863
8864static PyObject*
8865unicode_translate(PyUnicodeObject *self, PyObject *table)
8866{
8867    return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
8868}
8869
8870PyDoc_STRVAR(upper__doc__,
8871             "S.upper() -> str\n\
8872\n\
8873Return a copy of S converted to uppercase.");
8874
8875static PyObject*
8876unicode_upper(PyUnicodeObject *self)
8877{
8878    return fixup(self, fixupper);
8879}
8880
8881PyDoc_STRVAR(zfill__doc__,
8882             "S.zfill(width) -> str\n\
8883\n\
8884Pad a numeric string S with zeros on the left, to fill a field\n\
8885of the specified width. The string S is never truncated.");
8886
8887static PyObject *
8888unicode_zfill(PyUnicodeObject *self, PyObject *args)
8889{
8890    Py_ssize_t fill;
8891    PyUnicodeObject *u;
8892
8893    Py_ssize_t width;
8894    if (!PyArg_ParseTuple(args, "n:zfill", &width))
8895        return NULL;
8896
8897    if (self->length >= width) {
8898        if (PyUnicode_CheckExact(self)) {
8899            Py_INCREF(self);
8900            return (PyObject*) self;
8901        }
8902        else
8903            return PyUnicode_FromUnicode(
8904                PyUnicode_AS_UNICODE(self),
8905                PyUnicode_GET_SIZE(self)
8906                );
8907    }
8908
8909    fill = width - self->length;
8910
8911    u = pad(self, fill, 0, '0');
8912
8913    if (u == NULL)
8914        return NULL;
8915
8916    if (u->str[fill] == '+' || u->str[fill] == '-') {
8917        /* move sign to beginning of string */
8918        u->str[0] = u->str[fill];
8919        u->str[fill] = '0';
8920    }
8921
8922    return (PyObject*) u;
8923}
8924
#if 0
/* Compiled out.  Debugging helper that returns the current value of
   numfree as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyLong_FromLong(count);
}
#endif
8932
8933PyDoc_STRVAR(startswith__doc__,
8934             "S.startswith(prefix[, start[, end]]) -> bool\n\
8935\n\
8936Return True if S starts with the specified prefix, False otherwise.\n\
8937With optional start, test S beginning at that position.\n\
8938With optional end, stop comparing S at that position.\n\
8939prefix can also be a tuple of strings to try.");
8940
8941static PyObject *
8942unicode_startswith(PyUnicodeObject *self,
8943                   PyObject *args)
8944{
8945    PyObject *subobj;
8946    PyUnicodeObject *substring;
8947    Py_ssize_t start = 0;
8948    Py_ssize_t end = PY_SSIZE_T_MAX;
8949    int result;
8950
8951    if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
8952                          _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8953        return NULL;
8954    if (PyTuple_Check(subobj)) {
8955        Py_ssize_t i;
8956        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8957            substring = (PyUnicodeObject *)PyUnicode_FromObject(
8958                PyTuple_GET_ITEM(subobj, i));
8959            if (substring == NULL)
8960                return NULL;
8961            result = tailmatch(self, substring, start, end, -1);
8962            Py_DECREF(substring);
8963            if (result) {
8964                Py_RETURN_TRUE;
8965            }
8966        }
8967        /* nothing matched */
8968        Py_RETURN_FALSE;
8969    }
8970    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
8971    if (substring == NULL)
8972        return NULL;
8973    result = tailmatch(self, substring, start, end, -1);
8974    Py_DECREF(substring);
8975    return PyBool_FromLong(result);
8976}
8977
8978
8979PyDoc_STRVAR(endswith__doc__,
8980             "S.endswith(suffix[, start[, end]]) -> bool\n\
8981\n\
8982Return True if S ends with the specified suffix, False otherwise.\n\
8983With optional start, test S beginning at that position.\n\
8984With optional end, stop comparing S at that position.\n\
8985suffix can also be a tuple of strings to try.");
8986
8987static PyObject *
8988unicode_endswith(PyUnicodeObject *self,
8989                 PyObject *args)
8990{
8991    PyObject *subobj;
8992    PyUnicodeObject *substring;
8993    Py_ssize_t start = 0;
8994    Py_ssize_t end = PY_SSIZE_T_MAX;
8995    int result;
8996
8997    if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
8998                          _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8999        return NULL;
9000    if (PyTuple_Check(subobj)) {
9001        Py_ssize_t i;
9002        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9003            substring = (PyUnicodeObject *)PyUnicode_FromObject(
9004                PyTuple_GET_ITEM(subobj, i));
9005            if (substring == NULL)
9006                return NULL;
9007            result = tailmatch(self, substring, start, end, +1);
9008            Py_DECREF(substring);
9009            if (result) {
9010                Py_RETURN_TRUE;
9011            }
9012        }
9013        Py_RETURN_FALSE;
9014    }
9015    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
9016    if (substring == NULL)
9017        return NULL;
9018
9019    result = tailmatch(self, substring, start, end, +1);
9020    Py_DECREF(substring);
9021    return PyBool_FromLong(result);
9022}
9023
9024#include "stringlib/string_format.h"
9025
9026PyDoc_STRVAR(format__doc__,
9027             "S.format(*args, **kwargs) -> str\n\
9028\n\
9029Return a formatted version of S, using substitutions from args and kwargs.\n\
9030The substitutions are identified by braces ('{' and '}').");
9031
9032PyDoc_STRVAR(format_map__doc__,
9033             "S.format_map(mapping) -> str\n\
9034\n\
9035Return a formatted version of S, using substitutions from mapping.\n\
9036The substitutions are identified by braces ('{' and '}').");
9037
9038static PyObject *
9039unicode__format__(PyObject* self, PyObject* args)
9040{
9041    PyObject *format_spec;
9042
9043    if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
9044        return NULL;
9045
9046    return _PyUnicode_FormatAdvanced(self,
9047                                     PyUnicode_AS_UNICODE(format_spec),
9048                                     PyUnicode_GET_SIZE(format_spec));
9049}
9050
9051PyDoc_STRVAR(p_format__doc__,
9052             "S.__format__(format_spec) -> str\n\
9053\n\
9054Return a formatted version of S as described by format_spec.");
9055
9056static PyObject *
9057unicode__sizeof__(PyUnicodeObject *v)
9058{
9059    return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
9060                              sizeof(Py_UNICODE) * (v->length + 1));
9061}
9062
9063PyDoc_STRVAR(sizeof__doc__,
9064             "S.__sizeof__() -> size of S in memory, in bytes");
9065
9066static PyObject *
9067unicode_getnewargs(PyUnicodeObject *v)
9068{
9069    return Py_BuildValue("(u#)", v->str, v->length);
9070}
9071
9072
9073static PyMethodDef unicode_methods[] = {
9074
9075    /* Order is according to common usage: often used methods should
9076       appear first, since lookup is done sequentially. */
9077
9078    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
9079    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
9080    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
9081    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
9082    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
9083    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
9084    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
9085    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
9086    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
9087    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
9088    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
9089    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
9090    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
9091    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
9092    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
9093    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
9094    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
9095    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
9096    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
9097    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
9098    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
9099    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
9100    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
9101    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
9102    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
9103    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
9104    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
9105    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
9106    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
9107    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
9108    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
9109    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
9110    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
9111    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
9112    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
9113    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
9114    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
9115    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
9116    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
9117    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
9118    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
9119    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
9120    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
9121    {"maketrans", (PyCFunction) unicode_maketrans,
9122     METH_VARARGS | METH_STATIC, maketrans__doc__},
9123    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
9124#if 0
9125    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
9126#endif
9127
9128#if 0
9129    /* This one is just used for debugging the implementation. */
9130    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
9131#endif
9132
9133    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
9134    {NULL, NULL}
9135};
9136
9137static PyObject *
9138unicode_mod(PyObject *v, PyObject *w)
9139{
9140    if (!PyUnicode_Check(v)) {
9141        Py_INCREF(Py_NotImplemented);
9142        return Py_NotImplemented;
9143    }
9144    return PyUnicode_Format(v, w);
9145}
9146
9147static PyNumberMethods unicode_as_number = {
9148    0,              /*nb_add*/
9149    0,              /*nb_subtract*/
9150    0,              /*nb_multiply*/
9151    unicode_mod,            /*nb_remainder*/
9152};
9153
9154static PySequenceMethods unicode_as_sequence = {
9155    (lenfunc) unicode_length,       /* sq_length */
9156    PyUnicode_Concat,           /* sq_concat */
9157    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
9158    (ssizeargfunc) unicode_getitem,     /* sq_item */
9159    0,                  /* sq_slice */
9160    0,                  /* sq_ass_item */
9161    0,                  /* sq_ass_slice */
9162    PyUnicode_Contains,         /* sq_contains */
9163};
9164
9165static PyObject*
9166unicode_subscript(PyUnicodeObject* self, PyObject* item)
9167{
9168    if (PyIndex_Check(item)) {
9169        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
9170        if (i == -1 && PyErr_Occurred())
9171            return NULL;
9172        if (i < 0)
9173            i += PyUnicode_GET_SIZE(self);
9174        return unicode_getitem(self, i);
9175    } else if (PySlice_Check(item)) {
9176        Py_ssize_t start, stop, step, slicelength, cur, i;
9177        Py_UNICODE* source_buf;
9178        Py_UNICODE* result_buf;
9179        PyObject* result;
9180
9181        if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
9182                                 &start, &stop, &step, &slicelength) < 0) {
9183            return NULL;
9184        }
9185
9186        if (slicelength <= 0) {
9187            return PyUnicode_FromUnicode(NULL, 0);
9188        } else if (start == 0 && step == 1 && slicelength == self->length &&
9189                   PyUnicode_CheckExact(self)) {
9190            Py_INCREF(self);
9191            return (PyObject *)self;
9192        } else if (step == 1) {
9193            return PyUnicode_FromUnicode(self->str + start, slicelength);
9194        } else {
9195            source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
9196            result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
9197                                                       sizeof(Py_UNICODE));
9198
9199            if (result_buf == NULL)
9200                return PyErr_NoMemory();
9201
9202            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
9203                result_buf[i] = source_buf[cur];
9204            }
9205
9206            result = PyUnicode_FromUnicode(result_buf, slicelength);
9207            PyObject_FREE(result_buf);
9208            return result;
9209        }
9210    } else {
9211        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
9212        return NULL;
9213    }
9214}
9215
9216static PyMappingMethods unicode_as_mapping = {
9217    (lenfunc)unicode_length,        /* mp_length */
9218    (binaryfunc)unicode_subscript,  /* mp_subscript */
9219    (objobjargproc)0,           /* mp_ass_subscript */
9220};
9221
9222
9223/* Helpers for PyUnicode_Format() */
9224
9225static PyObject *
9226getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
9227{
9228    Py_ssize_t argidx = *p_argidx;
9229    if (argidx < arglen) {
9230        (*p_argidx)++;
9231        if (arglen < 0)
9232            return args;
9233        else
9234            return PyTuple_GetItem(args, argidx);
9235    }
9236    PyErr_SetString(PyExc_TypeError,
9237                    "not enough arguments for format string");
9238    return NULL;
9239}
9240
9241/* Returns a new reference to a PyUnicode object, or NULL on failure. */
9242
9243static PyObject *
9244formatfloat(PyObject *v, int flags, int prec, int type)
9245{
9246    char *p;
9247    PyObject *result;
9248    double x;
9249
9250    x = PyFloat_AsDouble(v);
9251    if (x == -1.0 && PyErr_Occurred())
9252        return NULL;
9253
9254    if (prec < 0)
9255        prec = 6;
9256
9257    p = PyOS_double_to_string(x, type, prec,
9258                              (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
9259    if (p == NULL)
9260        return NULL;
9261    result = PyUnicode_FromStringAndSize(p, strlen(p));
9262    PyMem_Free(p);
9263    return result;
9264}
9265
9266static PyObject*
9267formatlong(PyObject *val, int flags, int prec, int type)
9268{
9269    char *buf;
9270    int len;
9271    PyObject *str; /* temporary string object. */
9272    PyObject *result;
9273
9274    str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9275    if (!str)
9276        return NULL;
9277    result = PyUnicode_FromStringAndSize(buf, len);
9278    Py_DECREF(str);
9279    return result;
9280}
9281
9282static int
9283formatchar(Py_UNICODE *buf,
9284           size_t buflen,
9285           PyObject *v)
9286{
9287    /* presume that the buffer is at least 3 characters long */
9288    if (PyUnicode_Check(v)) {
9289        if (PyUnicode_GET_SIZE(v) == 1) {
9290            buf[0] = PyUnicode_AS_UNICODE(v)[0];
9291            buf[1] = '\0';
9292            return 1;
9293        }
9294#ifndef Py_UNICODE_WIDE
9295        if (PyUnicode_GET_SIZE(v) == 2) {
9296            /* Decode a valid surrogate pair */
9297            int c0 = PyUnicode_AS_UNICODE(v)[0];
9298            int c1 = PyUnicode_AS_UNICODE(v)[1];
9299            if (0xD800 <= c0 && c0 <= 0xDBFF &&
9300                0xDC00 <= c1 && c1 <= 0xDFFF) {
9301                buf[0] = c0;
9302                buf[1] = c1;
9303                buf[2] = '\0';
9304                return 2;
9305            }
9306        }
9307#endif
9308        goto onError;
9309    }
9310    else {
9311        /* Integer input truncated to a character */
9312        long x;
9313        x = PyLong_AsLong(v);
9314        if (x == -1 && PyErr_Occurred())
9315            goto onError;
9316
9317        if (x < 0 || x > 0x10ffff) {
9318            PyErr_SetString(PyExc_OverflowError,
9319                            "%c arg not in range(0x110000)");
9320            return -1;
9321        }
9322
9323#ifndef Py_UNICODE_WIDE
9324        if (x > 0xffff) {
9325            x -= 0x10000;
9326            buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9327            buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9328            return 2;
9329        }
9330#endif
9331        buf[0] = (Py_UNICODE) x;
9332        buf[1] = '\0';
9333        return 1;
9334    }
9335
9336  onError:
9337    PyErr_SetString(PyExc_TypeError,
9338                    "%c requires int or char");
9339    return -1;
9340}
9341
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...).
   FORMATBUFLEN is the length, in Py_UNICODE units, of the small scratch
   buffer that PyUnicode_Format() uses for the output of the '%%' and '%c'
   conversions.
*/
#define FORMATBUFLEN (size_t)10
9346
9347PyObject *PyUnicode_Format(PyObject *format,
9348                           PyObject *args)
9349{
9350    Py_UNICODE *fmt, *res;
9351    Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
9352    int args_owned = 0;
9353    PyUnicodeObject *result = NULL;
9354    PyObject *dict = NULL;
9355    PyObject *uformat;
9356
9357    if (format == NULL || args == NULL) {
9358        PyErr_BadInternalCall();
9359        return NULL;
9360    }
9361    uformat = PyUnicode_FromObject(format);
9362    if (uformat == NULL)
9363        return NULL;
9364    fmt = PyUnicode_AS_UNICODE(uformat);
9365    fmtcnt = PyUnicode_GET_SIZE(uformat);
9366
9367    reslen = rescnt = fmtcnt + 100;
9368    result = _PyUnicode_New(reslen);
9369    if (result == NULL)
9370        goto onError;
9371    res = PyUnicode_AS_UNICODE(result);
9372
9373    if (PyTuple_Check(args)) {
9374        arglen = PyTuple_Size(args);
9375        argidx = 0;
9376    }
9377    else {
9378        arglen = -1;
9379        argidx = -2;
9380    }
9381    if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
9382        !PyUnicode_Check(args))
9383        dict = args;
9384
9385    while (--fmtcnt >= 0) {
9386        if (*fmt != '%') {
9387            if (--rescnt < 0) {
9388                rescnt = fmtcnt + 100;
9389                reslen += rescnt;
9390                if (_PyUnicode_Resize(&result, reslen) < 0)
9391                    goto onError;
9392                res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9393                --rescnt;
9394            }
9395            *res++ = *fmt++;
9396        }
9397        else {
9398            /* Got a format specifier */
9399            int flags = 0;
9400            Py_ssize_t width = -1;
9401            int prec = -1;
9402            Py_UNICODE c = '\0';
9403            Py_UNICODE fill;
9404            int isnumok;
9405            PyObject *v = NULL;
9406            PyObject *temp = NULL;
9407            Py_UNICODE *pbuf;
9408            Py_UNICODE sign;
9409            Py_ssize_t len;
9410            Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
9411
9412            fmt++;
9413            if (*fmt == '(') {
9414                Py_UNICODE *keystart;
9415                Py_ssize_t keylen;
9416                PyObject *key;
9417                int pcount = 1;
9418
9419                if (dict == NULL) {
9420                    PyErr_SetString(PyExc_TypeError,
9421                                    "format requires a mapping");
9422                    goto onError;
9423                }
9424                ++fmt;
9425                --fmtcnt;
9426                keystart = fmt;
9427                /* Skip over balanced parentheses */
9428                while (pcount > 0 && --fmtcnt >= 0) {
9429                    if (*fmt == ')')
9430                        --pcount;
9431                    else if (*fmt == '(')
9432                        ++pcount;
9433                    fmt++;
9434                }
9435                keylen = fmt - keystart - 1;
9436                if (fmtcnt < 0 || pcount > 0) {
9437                    PyErr_SetString(PyExc_ValueError,
9438                                    "incomplete format key");
9439                    goto onError;
9440                }
9441#if 0
9442                /* keys are converted to strings using UTF-8 and
9443                   then looked up since Python uses strings to hold
9444                   variables names etc. in its namespaces and we
9445                   wouldn't want to break common idioms. */
9446                key = PyUnicode_EncodeUTF8(keystart,
9447                                           keylen,
9448                                           NULL);
9449#else
9450                key = PyUnicode_FromUnicode(keystart, keylen);
9451#endif
9452                if (key == NULL)
9453                    goto onError;
9454                if (args_owned) {
9455                    Py_DECREF(args);
9456                    args_owned = 0;
9457                }
9458                args = PyObject_GetItem(dict, key);
9459                Py_DECREF(key);
9460                if (args == NULL) {
9461                    goto onError;
9462                }
9463                args_owned = 1;
9464                arglen = -1;
9465                argidx = -2;
9466            }
9467            while (--fmtcnt >= 0) {
9468                switch (c = *fmt++) {
9469                case '-': flags |= F_LJUST; continue;
9470                case '+': flags |= F_SIGN; continue;
9471                case ' ': flags |= F_BLANK; continue;
9472                case '#': flags |= F_ALT; continue;
9473                case '0': flags |= F_ZERO; continue;
9474                }
9475                break;
9476            }
9477            if (c == '*') {
9478                v = getnextarg(args, arglen, &argidx);
9479                if (v == NULL)
9480                    goto onError;
9481                if (!PyLong_Check(v)) {
9482                    PyErr_SetString(PyExc_TypeError,
9483                                    "* wants int");
9484                    goto onError;
9485                }
9486                width = PyLong_AsLong(v);
9487                if (width == -1 && PyErr_Occurred())
9488                    goto onError;
9489                if (width < 0) {
9490                    flags |= F_LJUST;
9491                    width = -width;
9492                }
9493                if (--fmtcnt >= 0)
9494                    c = *fmt++;
9495            }
9496            else if (c >= '0' && c <= '9') {
9497                width = c - '0';
9498                while (--fmtcnt >= 0) {
9499                    c = *fmt++;
9500                    if (c < '0' || c > '9')
9501                        break;
9502                    if ((width*10) / 10 != width) {
9503                        PyErr_SetString(PyExc_ValueError,
9504                                        "width too big");
9505                        goto onError;
9506                    }
9507                    width = width*10 + (c - '0');
9508                }
9509            }
9510            if (c == '.') {
9511                prec = 0;
9512                if (--fmtcnt >= 0)
9513                    c = *fmt++;
9514                if (c == '*') {
9515                    v = getnextarg(args, arglen, &argidx);
9516                    if (v == NULL)
9517                        goto onError;
9518                    if (!PyLong_Check(v)) {
9519                        PyErr_SetString(PyExc_TypeError,
9520                                        "* wants int");
9521                        goto onError;
9522                    }
9523                    prec = PyLong_AsLong(v);
9524                    if (prec == -1 && PyErr_Occurred())
9525                        goto onError;
9526                    if (prec < 0)
9527                        prec = 0;
9528                    if (--fmtcnt >= 0)
9529                        c = *fmt++;
9530                }
9531                else if (c >= '0' && c <= '9') {
9532                    prec = c - '0';
9533                    while (--fmtcnt >= 0) {
9534                        c = *fmt++;
9535                        if (c < '0' || c > '9')
9536                            break;
9537                        if ((prec*10) / 10 != prec) {
9538                            PyErr_SetString(PyExc_ValueError,
9539                                            "prec too big");
9540                            goto onError;
9541                        }
9542                        prec = prec*10 + (c - '0');
9543                    }
9544                }
9545            } /* prec */
9546            if (fmtcnt >= 0) {
9547                if (c == 'h' || c == 'l' || c == 'L') {
9548                    if (--fmtcnt >= 0)
9549                        c = *fmt++;
9550                }
9551            }
9552            if (fmtcnt < 0) {
9553                PyErr_SetString(PyExc_ValueError,
9554                                "incomplete format");
9555                goto onError;
9556            }
9557            if (c != '%') {
9558                v = getnextarg(args, arglen, &argidx);
9559                if (v == NULL)
9560                    goto onError;
9561            }
9562            sign = 0;
9563            fill = ' ';
9564            switch (c) {
9565
9566            case '%':
9567                pbuf = formatbuf;
9568                /* presume that buffer length is at least 1 */
9569                pbuf[0] = '%';
9570                len = 1;
9571                break;
9572
9573            case 's':
9574            case 'r':
9575            case 'a':
9576                if (PyUnicode_CheckExact(v) && c == 's') {
9577                    temp = v;
9578                    Py_INCREF(temp);
9579                }
9580                else {
9581                    if (c == 's')
9582                        temp = PyObject_Str(v);
9583                    else if (c == 'r')
9584                        temp = PyObject_Repr(v);
9585                    else
9586                        temp = PyObject_ASCII(v);
9587                    if (temp == NULL)
9588                        goto onError;
9589                    if (PyUnicode_Check(temp))
9590                        /* nothing to do */;
9591                    else {
9592                        Py_DECREF(temp);
9593                        PyErr_SetString(PyExc_TypeError,
9594                                        "%s argument has non-string str()");
9595                        goto onError;
9596                    }
9597                }
9598                pbuf = PyUnicode_AS_UNICODE(temp);
9599                len = PyUnicode_GET_SIZE(temp);
9600                if (prec >= 0 && len > prec)
9601                    len = prec;
9602                break;
9603
9604            case 'i':
9605            case 'd':
9606            case 'u':
9607            case 'o':
9608            case 'x':
9609            case 'X':
9610                if (c == 'i')
9611                    c = 'd';
9612                isnumok = 0;
9613                if (PyNumber_Check(v)) {
9614                    PyObject *iobj=NULL;
9615
9616                    if (PyLong_Check(v)) {
9617                        iobj = v;
9618                        Py_INCREF(iobj);
9619                    }
9620                    else {
9621                        iobj = PyNumber_Long(v);
9622                    }
9623                    if (iobj!=NULL) {
9624                        if (PyLong_Check(iobj)) {
9625                            isnumok = 1;
9626                            temp = formatlong(iobj, flags, prec, c);
9627                            Py_DECREF(iobj);
9628                            if (!temp)
9629                                goto onError;
9630                            pbuf = PyUnicode_AS_UNICODE(temp);
9631                            len = PyUnicode_GET_SIZE(temp);
9632                            sign = 1;
9633                        }
9634                        else {
9635                            Py_DECREF(iobj);
9636                        }
9637                    }
9638                }
9639                if (!isnumok) {
9640                    PyErr_Format(PyExc_TypeError,
9641                                 "%%%c format: a number is required, "
9642                                 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9643                    goto onError;
9644                }
9645                if (flags & F_ZERO)
9646                    fill = '0';
9647                break;
9648
9649            case 'e':
9650            case 'E':
9651            case 'f':
9652            case 'F':
9653            case 'g':
9654            case 'G':
9655                temp = formatfloat(v, flags, prec, c);
9656                if (!temp)
9657                    goto onError;
9658                pbuf = PyUnicode_AS_UNICODE(temp);
9659                len = PyUnicode_GET_SIZE(temp);
9660                sign = 1;
9661                if (flags & F_ZERO)
9662                    fill = '0';
9663                break;
9664
9665            case 'c':
9666                pbuf = formatbuf;
9667                len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9668                if (len < 0)
9669                    goto onError;
9670                break;
9671
9672            default:
9673                PyErr_Format(PyExc_ValueError,
9674                             "unsupported format character '%c' (0x%x) "
9675                             "at index %zd",
9676                             (31<=c && c<=126) ? (char)c : '?',
9677                             (int)c,
9678                             (Py_ssize_t)(fmt - 1 -
9679                                          PyUnicode_AS_UNICODE(uformat)));
9680                goto onError;
9681            }
9682            if (sign) {
9683                if (*pbuf == '-' || *pbuf == '+') {
9684                    sign = *pbuf++;
9685                    len--;
9686                }
9687                else if (flags & F_SIGN)
9688                    sign = '+';
9689                else if (flags & F_BLANK)
9690                    sign = ' ';
9691                else
9692                    sign = 0;
9693            }
9694            if (width < len)
9695                width = len;
9696            if (rescnt - (sign != 0) < width) {
9697                reslen -= rescnt;
9698                rescnt = width + fmtcnt + 100;
9699                reslen += rescnt;
9700                if (reslen < 0) {
9701                    Py_XDECREF(temp);
9702                    PyErr_NoMemory();
9703                    goto onError;
9704                }
9705                if (_PyUnicode_Resize(&result, reslen) < 0) {
9706                    Py_XDECREF(temp);
9707                    goto onError;
9708                }
9709                res = PyUnicode_AS_UNICODE(result)
9710                    + reslen - rescnt;
9711            }
9712            if (sign) {
9713                if (fill != ' ')
9714                    *res++ = sign;
9715                rescnt--;
9716                if (width > len)
9717                    width--;
9718            }
9719            if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9720                assert(pbuf[0] == '0');
9721                assert(pbuf[1] == c);
9722                if (fill != ' ') {
9723                    *res++ = *pbuf++;
9724                    *res++ = *pbuf++;
9725                }
9726                rescnt -= 2;
9727                width -= 2;
9728                if (width < 0)
9729                    width = 0;
9730                len -= 2;
9731            }
9732            if (width > len && !(flags & F_LJUST)) {
9733                do {
9734                    --rescnt;
9735                    *res++ = fill;
9736                } while (--width > len);
9737            }
9738            if (fill == ' ') {
9739                if (sign)
9740                    *res++ = sign;
9741                if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9742                    assert(pbuf[0] == '0');
9743                    assert(pbuf[1] == c);
9744                    *res++ = *pbuf++;
9745                    *res++ = *pbuf++;
9746                }
9747            }
9748            Py_UNICODE_COPY(res, pbuf, len);
9749            res += len;
9750            rescnt -= len;
9751            while (--width >= len) {
9752                --rescnt;
9753                *res++ = ' ';
9754            }
9755            if (dict && (argidx < arglen) && c != '%') {
9756                PyErr_SetString(PyExc_TypeError,
9757                                "not all arguments converted during string formatting");
9758                Py_XDECREF(temp);
9759                goto onError;
9760            }
9761            Py_XDECREF(temp);
9762        } /* '%' */
9763    } /* until end */
9764    if (argidx < arglen && !dict) {
9765        PyErr_SetString(PyExc_TypeError,
9766                        "not all arguments converted during string formatting");
9767        goto onError;
9768    }
9769
9770    if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
9771        goto onError;
9772    if (args_owned) {
9773        Py_DECREF(args);
9774    }
9775    Py_DECREF(uformat);
9776    return (PyObject *)result;
9777
9778  onError:
9779    Py_XDECREF(result);
9780    Py_DECREF(uformat);
9781    if (args_owned) {
9782        Py_DECREF(args);
9783    }
9784    return NULL;
9785}
9786
9787static PyObject *
9788unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9789
9790static PyObject *
9791unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9792{
9793    PyObject *x = NULL;
9794    static char *kwlist[] = {"object", "encoding", "errors", 0};
9795    char *encoding = NULL;
9796    char *errors = NULL;
9797
9798    if (type != &PyUnicode_Type)
9799        return unicode_subtype_new(type, args, kwds);
9800    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
9801                                     kwlist, &x, &encoding, &errors))
9802        return NULL;
9803    if (x == NULL)
9804        return (PyObject *)_PyUnicode_New(0);
9805    if (encoding == NULL && errors == NULL)
9806        return PyObject_Str(x);
9807    else
9808        return PyUnicode_FromEncodedObject(x, encoding, errors);
9809}
9810
9811static PyObject *
9812unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9813{
9814    PyUnicodeObject *tmp, *pnew;
9815    Py_ssize_t n;
9816
9817    assert(PyType_IsSubtype(type, &PyUnicode_Type));
9818    tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9819    if (tmp == NULL)
9820        return NULL;
9821    assert(PyUnicode_Check(tmp));
9822    pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9823    if (pnew == NULL) {
9824        Py_DECREF(tmp);
9825        return NULL;
9826    }
9827    pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9828    if (pnew->str == NULL) {
9829        _Py_ForgetReference((PyObject *)pnew);
9830        PyObject_Del(pnew);
9831        Py_DECREF(tmp);
9832        return PyErr_NoMemory();
9833    }
9834    Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9835    pnew->length = n;
9836    pnew->hash = tmp->hash;
9837    Py_DECREF(tmp);
9838    return (PyObject *)pnew;
9839}
9840
/* Docstring of the str type; referenced by the tp_doc slot of
   PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
             "str(string[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
9847
9848static PyObject *unicode_iter(PyObject *seq);
9849
/* Type object for str.  Slots that are 0 are left unset here; the type is
   readied with PyType_Ready() in _PyUnicode_Init(). */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash*/
    0,                  /* tp_call*/
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    unicode_iter,           /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
9893
9894/* Initialize the Unicode implementation */
9895
9896void _PyUnicode_Init(void)
9897{
9898    int i;
9899
9900    /* XXX - move this array to unicodectype.c ? */
9901    Py_UNICODE linebreak[] = {
9902        0x000A, /* LINE FEED */
9903        0x000D, /* CARRIAGE RETURN */
9904        0x001C, /* FILE SEPARATOR */
9905        0x001D, /* GROUP SEPARATOR */
9906        0x001E, /* RECORD SEPARATOR */
9907        0x0085, /* NEXT LINE */
9908        0x2028, /* LINE SEPARATOR */
9909        0x2029, /* PARAGRAPH SEPARATOR */
9910    };
9911
9912    /* Init the implementation */
9913    free_list = NULL;
9914    numfree = 0;
9915    unicode_empty = _PyUnicode_New(0);
9916    if (!unicode_empty)
9917        return;
9918
9919    for (i = 0; i < 256; i++)
9920        unicode_latin1[i] = NULL;
9921    if (PyType_Ready(&PyUnicode_Type) < 0)
9922        Py_FatalError("Can't initialize 'unicode'");
9923
9924    /* initialize the linebreak bloom filter */
9925    bloom_linebreak = make_bloom_mask(
9926        linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9927        );
9928
9929    PyType_Ready(&EncodingMapType);
9930}
9931
9932/* Finalize the Unicode implementation */
9933
9934int
9935PyUnicode_ClearFreeList(void)
9936{
9937    int freelist_size = numfree;
9938    PyUnicodeObject *u;
9939
9940    for (u = free_list; u != NULL;) {
9941        PyUnicodeObject *v = u;
9942        u = *(PyUnicodeObject **)u;
9943        if (v->str)
9944            PyObject_DEL(v->str);
9945        Py_XDECREF(v->defenc);
9946        PyObject_Del(v);
9947        numfree--;
9948    }
9949    free_list = NULL;
9950    assert(numfree == 0);
9951    return freelist_size;
9952}
9953
9954void
9955_PyUnicode_Fini(void)
9956{
9957    int i;
9958
9959    Py_XDECREF(unicode_empty);
9960    unicode_empty = NULL;
9961
9962    for (i = 0; i < 256; i++) {
9963        if (unicode_latin1[i]) {
9964            Py_DECREF(unicode_latin1[i]);
9965            unicode_latin1[i] = NULL;
9966        }
9967    }
9968    (void)PyUnicode_ClearFreeList();
9969}
9970
9971void
9972PyUnicode_InternInPlace(PyObject **p)
9973{
9974    register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9975    PyObject *t;
9976    if (s == NULL || !PyUnicode_Check(s))
9977        Py_FatalError(
9978            "PyUnicode_InternInPlace: unicode strings only please!");
9979    /* If it's a subclass, we don't really know what putting
9980       it in the interned dict might do. */
9981    if (!PyUnicode_CheckExact(s))
9982        return;
9983    if (PyUnicode_CHECK_INTERNED(s))
9984        return;
9985    if (interned == NULL) {
9986        interned = PyDict_New();
9987        if (interned == NULL) {
9988            PyErr_Clear(); /* Don't leave an exception */
9989            return;
9990        }
9991    }
9992    /* It might be that the GetItem call fails even
9993       though the key is present in the dictionary,
9994       namely when this happens during a stack overflow. */
9995    Py_ALLOW_RECURSION
9996        t = PyDict_GetItem(interned, (PyObject *)s);
9997    Py_END_ALLOW_RECURSION
9998
9999        if (t) {
10000            Py_INCREF(t);
10001            Py_DECREF(*p);
10002            *p = t;
10003            return;
10004        }
10005
10006    PyThreadState_GET()->recursion_critical = 1;
10007    if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
10008        PyErr_Clear();
10009        PyThreadState_GET()->recursion_critical = 0;
10010        return;
10011    }
10012    PyThreadState_GET()->recursion_critical = 0;
10013    /* The two references in interned are not counted by refcnt.
10014       The deallocator will take care of this */
10015    Py_REFCNT(s) -= 2;
10016    PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
10017}
10018
10019void
10020PyUnicode_InternImmortal(PyObject **p)
10021{
10022    PyUnicode_InternInPlace(p);
10023    if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
10024        PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
10025        Py_INCREF(*p);
10026    }
10027}
10028
10029PyObject *
10030PyUnicode_InternFromString(const char *cp)
10031{
10032    PyObject *s = PyUnicode_FromString(cp);
10033    if (s == NULL)
10034        return NULL;
10035    PyUnicode_InternInPlace(&s);
10036    return s;
10037}
10038
10039void _Py_ReleaseInternedUnicodeStrings(void)
10040{
10041    PyObject *keys;
10042    PyUnicodeObject *s;
10043    Py_ssize_t i, n;
10044    Py_ssize_t immortal_size = 0, mortal_size = 0;
10045
10046    if (interned == NULL || !PyDict_Check(interned))
10047        return;
10048    keys = PyDict_Keys(interned);
10049    if (keys == NULL || !PyList_Check(keys)) {
10050        PyErr_Clear();
10051        return;
10052    }
10053
10054    /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
10055       detector, interned unicode strings are not forcibly deallocated;
10056       rather, we give them their stolen references back, and then clear
10057       and DECREF the interned dict. */
10058
10059    n = PyList_GET_SIZE(keys);
10060    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
10061            n);
10062    for (i = 0; i < n; i++) {
10063        s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
10064        switch (s->state) {
10065        case SSTATE_NOT_INTERNED:
10066            /* XXX Shouldn't happen */
10067            break;
10068        case SSTATE_INTERNED_IMMORTAL:
10069            Py_REFCNT(s) += 1;
10070            immortal_size += s->length;
10071            break;
10072        case SSTATE_INTERNED_MORTAL:
10073            Py_REFCNT(s) += 2;
10074            mortal_size += s->length;
10075            break;
10076        default:
10077            Py_FatalError("Inconsistent interned string state.");
10078        }
10079        s->state = SSTATE_NOT_INTERNED;
10080    }
10081    fprintf(stderr, "total size of all interned strings: "
10082            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
10083            "mortal/immortal\n", mortal_size, immortal_size);
10084    Py_DECREF(keys);
10085    PyDict_Clear(interned);
10086    Py_DECREF(interned);
10087    interned = NULL;
10088}
10089
10090
10091/********************* Unicode Iterator **************************/
10092
/* State of a str iterator: the string being iterated over and the
   position of the next element to hand out. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index; /* index into it_seq of the next item to return */
    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
10098
10099static void
10100unicodeiter_dealloc(unicodeiterobject *it)
10101{
10102    _PyObject_GC_UNTRACK(it);
10103    Py_XDECREF(it->it_seq);
10104    PyObject_GC_Del(it);
10105}
10106
10107static int
10108unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
10109{
10110    Py_VISIT(it->it_seq);
10111    return 0;
10112}
10113
10114static PyObject *
10115unicodeiter_next(unicodeiterobject *it)
10116{
10117    PyUnicodeObject *seq;
10118    PyObject *item;
10119
10120    assert(it != NULL);
10121    seq = it->it_seq;
10122    if (seq == NULL)
10123        return NULL;
10124    assert(PyUnicode_Check(seq));
10125
10126    if (it->it_index < PyUnicode_GET_SIZE(seq)) {
10127        item = PyUnicode_FromUnicode(
10128            PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
10129        if (item != NULL)
10130            ++it->it_index;
10131        return item;
10132    }
10133
10134    Py_DECREF(seq);
10135    it->it_seq = NULL;
10136    return NULL;
10137}
10138
10139static PyObject *
10140unicodeiter_len(unicodeiterobject *it)
10141{
10142    Py_ssize_t len = 0;
10143    if (it->it_seq)
10144        len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
10145    return PyLong_FromSsize_t(len);
10146}
10147
/* Docstring for the __length_hint__ method below. */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table of the str iterator type; __length_hint__ takes no
   arguments and is implemented by unicodeiter_len(). */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
10155
/* Type object of the iterator created by unicode_iter().  It has the
   Py_TPFLAGS_HAVE_GC flag set and supplies a traverse function, and it is
   its own iterator (tp_iter is PyObject_SelfIter). */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",         /* tp_name */
    sizeof(unicodeiterobject),      /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    0,                  /* tp_repr */
    0,                  /* tp_as_number */
    0,                  /* tp_as_sequence */
    0,                  /* tp_as_mapping */
    0,                  /* tp_hash */
    0,                  /* tp_call */
    0,                  /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                  /* tp_clear */
    0,                  /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    PyObject_SelfIter,          /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,            /* tp_methods */
    0,                  /* tp_members */
};
10188
10189static PyObject *
10190unicode_iter(PyObject *seq)
10191{
10192    unicodeiterobject *it;
10193
10194    if (!PyUnicode_Check(seq)) {
10195        PyErr_BadInternalCall();
10196        return NULL;
10197    }
10198    it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
10199    if (it == NULL)
10200        return NULL;
10201    it->it_index = 0;
10202    Py_INCREF(seq);
10203    it->it_seq = (PyUnicodeObject *)seq;
10204    _PyObject_GC_TRACK(it);
10205    return (PyObject *)it;
10206}
10207
10208size_t
10209Py_UNICODE_strlen(const Py_UNICODE *u)
10210{
10211    int res = 0;
10212    while(*u++)
10213        res++;
10214    return res;
10215}
10216
10217Py_UNICODE*
10218Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
10219{
10220    Py_UNICODE *u = s1;
10221    while ((*u++ = *s2++));
10222    return s1;
10223}
10224
10225Py_UNICODE*
10226Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10227{
10228    Py_UNICODE *u = s1;
10229    while ((*u++ = *s2++))
10230        if (n-- == 0)
10231            break;
10232    return s1;
10233}
10234
10235Py_UNICODE*
10236Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
10237{
10238    Py_UNICODE *u1 = s1;
10239    u1 += Py_UNICODE_strlen(u1);
10240    Py_UNICODE_strcpy(u1, s2);
10241    return s1;
10242}
10243
10244int
10245Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
10246{
10247    while (*s1 && *s2 && *s1 == *s2)
10248        s1++, s2++;
10249    if (*s1 && *s2)
10250        return (*s1 < *s2) ? -1 : +1;
10251    if (*s1)
10252        return 1;
10253    if (*s2)
10254        return -1;
10255    return 0;
10256}
10257
10258int
10259Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10260{
10261    register Py_UNICODE u1, u2;
10262    for (; n != 0; n--) {
10263        u1 = *s1;
10264        u2 = *s2;
10265        if (u1 != u2)
10266            return (u1 < u2) ? -1 : +1;
10267        if (u1 == '\0')
10268            return 0;
10269        s1++;
10270        s2++;
10271    }
10272    return 0;
10273}
10274
10275Py_UNICODE*
10276Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
10277{
10278    const Py_UNICODE *p;
10279    for (p = s; *p; p++)
10280        if (*p == c)
10281            return (Py_UNICODE*)p;
10282    return NULL;
10283}
10284
10285Py_UNICODE*
10286Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
10287{
10288    const Py_UNICODE *p;
10289    p = s + Py_UNICODE_strlen(s);
10290    while (p != s) {
10291        p--;
10292        if (*p == c)
10293            return (Py_UNICODE*)p;
10294    }
10295    return NULL;
10296}
10297
10298Py_UNICODE*
10299PyUnicode_AsUnicodeCopy(PyObject *object)
10300{
10301    PyUnicodeObject *unicode = (PyUnicodeObject *)object;
10302    Py_UNICODE *copy;
10303    Py_ssize_t size;
10304
10305    /* Ensure we won't overflow the size. */
10306    if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
10307        PyErr_NoMemory();
10308        return NULL;
10309    }
10310    size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
10311    size *= sizeof(Py_UNICODE);
10312    copy = PyMem_Malloc(size);
10313    if (copy == NULL) {
10314        PyErr_NoMemory();
10315        return NULL;
10316    }
10317    memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
10318    return copy;
10319}
10320
10321/* A _string module, to export formatter_parser and formatter_field_name_split
10322   to the string.Formatter class implemented in Python. */
10323
/* Functions exported by the _string module.  Both entries are registered
   with METH_O, i.e. each takes exactly one argument. */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL} /* sentinel */
};
10331
/* Module definition for "_string", passed to PyModule_Create() by
   PyInit__string().  The four trailing slots are left NULL. */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string", /* m_name */
    PyDoc_STR("string helper module"), /* m_doc */
    0, /* m_size */
    _string_methods, /* m_methods */
    NULL,
    NULL,
    NULL,
    NULL
};
10343
10344PyMODINIT_FUNC
10345PyInit__string(void)
10346{
10347    return PyModule_Create(&_string_module);
10348}
10349
10350
10351#ifdef __cplusplus
10352}
10353#endif
10354