unicodeobject.c revision e94c679df0b632bc929936ca54f0de006e1a6dc2
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5Unicode Integration Proposal (see file Misc/unicode.txt).
6
7Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
10Copyright (c) Corporation for National Research Initiatives.
11
12--------------------------------------------------------------------
13The original string type implementation is:
14
15  Copyright (c) 1999 by Secret Labs AB
16  Copyright (c) 1999 by Fredrik Lundh
17
18By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
41
42#define PY_SSIZE_T_CLEAN
43#include "Python.h"
44#include "bytes_methods.h"
45
46#include "unicodeobject.h"
47#include "ucnhash.h"
48
49#ifdef MS_WINDOWS
50#include <windows.h>
51#endif
52
/* Upper bound on the number of dead Unicode objects that are parked on
   the free list for later reuse. */

#define PyUnicode_MAXFREELIST 1024
56
/* Threshold for the free list "keep-alive" optimization.

   When a Unicode object is put on the free list, its character buffer
   is retained only if the string length is strictly below this limit;
   buffers of longer strings are released immediately.  Reusing the
   retained buffers saves malloc() calls for small Unicode objects.

   In the worst case this leaves roughly PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT * sizeof(Py_UNICODE) +
   malloc()-overhead) bytes parked but unused.

   A limit of 0 effectively switches the optimization off.

   Note: this is an experimental feature!  If Unicode objects cause core
   dumps, try turning it off.

*/

#define KEEPALIVE_SIZE_LIMIT 9
75
/* Byte-order selection: big endian when WORDS_BIGENDIAN is defined,
   little endian otherwise. */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
83
84/* --- Globals ------------------------------------------------------------
85
86   The globals are initialized by the _PyUnicode_Init() API and should
87   not be used before calling that API.
88
89*/
90
91
92#ifdef __cplusplus
93extern "C" {
94#endif
95
96/* This dictionary holds all interned unicode strings.  Note that references
97   to strings in this dictionary are *not* counted in the string's ob_refcnt.
98   When the interned string reaches a refcnt of 0 the string deallocation
99   function will delete the reference from this dictionary.
100
101   Another way to look at this is that to say that the actual reference
102   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
103*/
104static PyObject *interned;
105
106/* Free list for Unicode objects */
107static PyUnicodeObject *free_list;
108static int numfree;
109
110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114   shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
117/* Default encoding to use and assume when NULL is passed as encoding
118   parameter; it is fixed to "utf-8".  Always use the
119   PyUnicode_GetDefaultEncoding() API to access this global.
120
121   Don't forget to alter Py_FileSystemDefaultEncoding if you change the
122   hard coded default!
123*/
124static const char unicode_default_encoding[] = "utf-8";
125
/* Lookup table for fast detection of the most frequent whitespace
   characters: indexed by character value 0..127, 1 means whitespace. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 HORIZONTAL TABULATION, 0x0A LINE FEED,
       0x0B VERTICAL TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
156
/* Lookup table for fast detection of linebreak characters in the ASCII
   range: indexed by character value 0..127, 1 means linebreak.  The
   table is only ever read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 0, 0, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
182
183
184Py_UNICODE
185PyUnicode_GetMax(void)
186{
187#ifdef Py_UNICODE_WIDE
188    return 0x10FFFF;
189#else
190    /* This is actually an illegal character, so it should
191       not be passed to unichr. */
192    return 0xFFFF;
193#endif
194}
195
196/* --- Bloom Filters ----------------------------------------------------- */
197
/* Support for simple "bloom filters" over Unicode characters.
   To keep things simple, a single bitmask is used, with the lowest 5
   bits of each Unicode character serving as the bit index. */
201
202/* the linebreak mask is set up by Unicode_Init below */
203
/* Integer type holding a bloom bitmask.  BLOOM() addresses bits 0..31
   (a 5-bit index), which unsigned long is guaranteed to provide. */
#define BLOOM_MASK unsigned long

/* Bloom mask used by BLOOM_LINEBREAK() for characters >= 128. */
static BLOOM_MASK bloom_linebreak;

/* Nonzero if the bit selected by the low 5 bits of ch is set in mask.
   The shifted constant is 1UL rather than 1: shifting a plain int into
   bit 31 overflows a 32-bit signed int, which is undefined behaviour.
   mask is parenthesized so that expression arguments bind correctly. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Nonzero if ch is a linebreak: table lookup for ch < 128, otherwise
   the bloom mask is consulted first and Py_UNICODE_ISLINEBREAK() is
   only evaluated when the mask bit is set. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
213
214Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
215{
216    /* calculate simple bloom-style bitmask for a given unicode string */
217
218    long mask;
219    Py_ssize_t i;
220
221    mask = 0;
222    for (i = 0; i < len; i++)
223        mask |= (1 << (ptr[i] & 0x1F));
224
225    return mask;
226}
227
228Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
229{
230    Py_ssize_t i;
231
232    for (i = 0; i < setlen; i++)
233        if (set[i] == chr)
234            return 1;
235
236    return 0;
237}
238
/* Nonzero if chr is a member of set: the cheap bloom mask test runs
   first and the linear unicode_member() scan only when the mask bit is
   set.  The expansion is fully parenthesized so the macro can be negated
   or combined with other operators safely. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
241
242/* --- Unicode Object ----------------------------------------------------- */
243
244static
245int unicode_resize(register PyUnicodeObject *unicode,
246                   Py_ssize_t length)
247{
248    void *oldstr;
249
250    /* Shortcut if there's nothing much to do. */
251    if (unicode->length == length)
252        goto reset;
253
254    /* Resizing shared object (unicode_empty or single character
255       objects) in-place is not allowed. Use PyUnicode_Resize()
256       instead ! */
257
258    if (unicode == unicode_empty ||
259        (unicode->length == 1 &&
260         unicode->str[0] < 256U &&
261         unicode_latin1[unicode->str[0]] == unicode)) {
262        PyErr_SetString(PyExc_SystemError,
263                        "can't resize shared str objects");
264        return -1;
265    }
266
267    /* We allocate one more byte to make sure the string is Ux0000 terminated.
268       The overallocation is also used by fastsearch, which assumes that it's
269       safe to look at str[length] (without making any assumptions about what
270       it contains). */
271
272    oldstr = unicode->str;
273    unicode->str = PyObject_REALLOC(unicode->str,
274                                    sizeof(Py_UNICODE) * (length + 1));
275    if (!unicode->str) {
276        unicode->str = (Py_UNICODE *)oldstr;
277        PyErr_NoMemory();
278        return -1;
279    }
280    unicode->str[length] = 0;
281    unicode->length = length;
282
283  reset:
284    /* Reset the object caches */
285    if (unicode->defenc) {
286        Py_DECREF(unicode->defenc);
287        unicode->defenc = NULL;
288    }
289    unicode->hash = -1;
290
291    return 0;
292}
293
294/* We allocate one more byte to make sure the string is
295   Ux0000 terminated; some code (e.g. new_identifier)
296   relies on that.
297
298   XXX This allocator could further be enhanced by assuring that the
299   free list never reduces its size below 1.
300
301*/
302
303static
304PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
305{
306    register PyUnicodeObject *unicode;
307
308    /* Optimization for empty strings */
309    if (length == 0 && unicode_empty != NULL) {
310        Py_INCREF(unicode_empty);
311        return unicode_empty;
312    }
313
314    /* Ensure we won't overflow the size. */
315    if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
316        return (PyUnicodeObject *)PyErr_NoMemory();
317    }
318
319    /* Unicode freelist & memory allocation */
320    if (free_list) {
321        unicode = free_list;
322        free_list = *(PyUnicodeObject **)unicode;
323        numfree--;
324        if (unicode->str) {
325            /* Keep-Alive optimization: we only upsize the buffer,
326               never downsize it. */
327            if ((unicode->length < length) &&
328                unicode_resize(unicode, length) < 0) {
329                PyObject_DEL(unicode->str);
330                unicode->str = NULL;
331            }
332        }
333        else {
334            size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
335            unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
336        }
337        PyObject_INIT(unicode, &PyUnicode_Type);
338    }
339    else {
340        size_t new_size;
341        unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
342        if (unicode == NULL)
343            return NULL;
344        new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
345        unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
346    }
347
348    if (!unicode->str) {
349        PyErr_NoMemory();
350        goto onError;
351    }
352    /* Initialize the first element to guard against cases where
353     * the caller fails before initializing str -- unicode_resize()
354     * reads str[0], and the Keep-Alive optimization can keep memory
355     * allocated for str alive across a call to unicode_dealloc(unicode).
356     * We don't want unicode_resize to read uninitialized memory in
357     * that case.
358     */
359    unicode->str[0] = 0;
360    unicode->str[length] = 0;
361    unicode->length = length;
362    unicode->hash = -1;
363    unicode->state = 0;
364    unicode->defenc = NULL;
365    return unicode;
366
367  onError:
368    /* XXX UNREF/NEWREF interface should be more symmetrical */
369    _Py_DEC_REFTOTAL;
370    _Py_ForgetReference((PyObject *)unicode);
371    PyObject_Del(unicode);
372    return NULL;
373}
374
375static
376void unicode_dealloc(register PyUnicodeObject *unicode)
377{
378    switch (PyUnicode_CHECK_INTERNED(unicode)) {
379    case SSTATE_NOT_INTERNED:
380        break;
381
382    case SSTATE_INTERNED_MORTAL:
383        /* revive dead object temporarily for DelItem */
384        Py_REFCNT(unicode) = 3;
385        if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
386            Py_FatalError(
387                "deletion of interned string failed");
388        break;
389
390    case SSTATE_INTERNED_IMMORTAL:
391        Py_FatalError("Immortal interned string died.");
392
393    default:
394        Py_FatalError("Inconsistent interned string state.");
395    }
396
397    if (PyUnicode_CheckExact(unicode) &&
398        numfree < PyUnicode_MAXFREELIST) {
399        /* Keep-Alive optimization */
400        if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
401            PyObject_DEL(unicode->str);
402            unicode->str = NULL;
403            unicode->length = 0;
404        }
405        if (unicode->defenc) {
406            Py_DECREF(unicode->defenc);
407            unicode->defenc = NULL;
408        }
409        /* Add to free list */
410        *(PyUnicodeObject **)unicode = free_list;
411        free_list = unicode;
412        numfree++;
413    }
414    else {
415        PyObject_DEL(unicode->str);
416        Py_XDECREF(unicode->defenc);
417        Py_TYPE(unicode)->tp_free((PyObject *)unicode);
418    }
419}
420
421static
422int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
423{
424    register PyUnicodeObject *v;
425
426    /* Argument checks */
427    if (unicode == NULL) {
428        PyErr_BadInternalCall();
429        return -1;
430    }
431    v = *unicode;
432    if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
433        PyErr_BadInternalCall();
434        return -1;
435    }
436
437    /* Resizing unicode_empty and single character objects is not
438       possible since these are being shared. We simply return a fresh
439       copy with the same Unicode content. */
440    if (v->length != length &&
441        (v == unicode_empty || v->length == 1)) {
442        PyUnicodeObject *w = _PyUnicode_New(length);
443        if (w == NULL)
444            return -1;
445        Py_UNICODE_COPY(w->str, v->str,
446                        length < v->length ? length : v->length);
447        Py_DECREF(*unicode);
448        *unicode = w;
449        return 0;
450    }
451
452    /* Note that we don't have to modify *unicode for unshared Unicode
453       objects, since we can modify them in-place. */
454    return unicode_resize(v, length);
455}
456
457int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
458{
459    return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
460}
461
462PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
463                                Py_ssize_t size)
464{
465    PyUnicodeObject *unicode;
466
467    /* If the Unicode data is known at construction time, we can apply
468       some optimizations which share commonly used objects. */
469    if (u != NULL) {
470
471        /* Optimization for empty strings */
472        if (size == 0 && unicode_empty != NULL) {
473            Py_INCREF(unicode_empty);
474            return (PyObject *)unicode_empty;
475        }
476
477        /* Single character Unicode objects in the Latin-1 range are
478           shared when using this constructor */
479        if (size == 1 && *u < 256) {
480            unicode = unicode_latin1[*u];
481            if (!unicode) {
482                unicode = _PyUnicode_New(1);
483                if (!unicode)
484                    return NULL;
485                unicode->str[0] = *u;
486                unicode_latin1[*u] = unicode;
487            }
488            Py_INCREF(unicode);
489            return (PyObject *)unicode;
490        }
491    }
492
493    unicode = _PyUnicode_New(size);
494    if (!unicode)
495        return NULL;
496
497    /* Copy the Unicode data into the new object */
498    if (u != NULL)
499        Py_UNICODE_COPY(unicode->str, u, size);
500
501    return (PyObject *)unicode;
502}
503
504PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
505{
506    PyUnicodeObject *unicode;
507
508    if (size < 0) {
509        PyErr_SetString(PyExc_SystemError,
510                        "Negative size passed to PyUnicode_FromStringAndSize");
511        return NULL;
512    }
513
514    /* If the Unicode data is known at construction time, we can apply
515       some optimizations which share commonly used objects.
516       Also, this means the input must be UTF-8, so fall back to the
517       UTF-8 decoder at the end. */
518    if (u != NULL) {
519
520        /* Optimization for empty strings */
521        if (size == 0 && unicode_empty != NULL) {
522            Py_INCREF(unicode_empty);
523            return (PyObject *)unicode_empty;
524        }
525
526        /* Single characters are shared when using this constructor.
527           Restrict to ASCII, since the input must be UTF-8. */
528        if (size == 1 && Py_CHARMASK(*u) < 128) {
529            unicode = unicode_latin1[Py_CHARMASK(*u)];
530            if (!unicode) {
531                unicode = _PyUnicode_New(1);
532                if (!unicode)
533                    return NULL;
534                unicode->str[0] = Py_CHARMASK(*u);
535                unicode_latin1[Py_CHARMASK(*u)] = unicode;
536            }
537            Py_INCREF(unicode);
538            return (PyObject *)unicode;
539        }
540
541        return PyUnicode_DecodeUTF8(u, size, NULL);
542    }
543
544    unicode = _PyUnicode_New(size);
545    if (!unicode)
546        return NULL;
547
548    return (PyObject *)unicode;
549}
550
551PyObject *PyUnicode_FromString(const char *u)
552{
553    size_t size = strlen(u);
554    if (size > PY_SSIZE_T_MAX) {
555        PyErr_SetString(PyExc_OverflowError, "input too long");
556        return NULL;
557    }
558
559    return PyUnicode_FromStringAndSize(u, size);
560}
561
562#ifdef HAVE_WCHAR_H
563
564PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
565                                 Py_ssize_t size)
566{
567    PyUnicodeObject *unicode;
568
569    if (w == NULL) {
570        if (size == 0)
571            return PyUnicode_FromStringAndSize(NULL, 0);
572        PyErr_BadInternalCall();
573        return NULL;
574    }
575
576    if (size == -1) {
577        size = wcslen(w);
578    }
579
580    unicode = _PyUnicode_New(size);
581    if (!unicode)
582        return NULL;
583
584    /* Copy the wchar_t data into the new object */
585#ifdef HAVE_USABLE_WCHAR_T
586    memcpy(unicode->str, w, size * sizeof(wchar_t));
587#else
588    {
589        register Py_UNICODE *u;
590        register Py_ssize_t i;
591        u = PyUnicode_AS_UNICODE(unicode);
592        for (i = size; i > 0; i--)
593            *u++ = *w++;
594    }
595#endif
596
597    return (PyObject *)unicode;
598}
599
600static void
601makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
602{
603    *fmt++ = '%';
604    if (width) {
605        if (zeropad)
606            *fmt++ = '0';
607        fmt += sprintf(fmt, "%d", width);
608    }
609    if (precision)
610        fmt += sprintf(fmt, ".%d", precision);
611    if (longflag)
612        *fmt++ = 'l';
613    else if (size_tflag) {
614        char *f = PY_FORMAT_SIZE_T;
615        while (*f)
616            *fmt++ = *f++;
617    }
618    *fmt++ = c;
619    *fmt = '\0';
620}
621
/* Append the NUL-terminated string `string` to the output, one element
   at a time.  Relies on two variables of the enclosing function: `copy`
   (scratch read cursor) and `s` (output write pointer, advanced past the
   appended data).  Wrapped in do { } while (0) so that the macro behaves
   as a single statement, e.g. in an unbraced if/else. */
#define appendstring(string) \
    do { for (copy = (string); *copy; ) *s++ = *copy++; } while (0)
623
624PyObject *
625PyUnicode_FromFormatV(const char *format, va_list vargs)
626{
627    va_list count;
628    Py_ssize_t callcount = 0;
629    PyObject **callresults = NULL;
630    PyObject **callresult = NULL;
631    Py_ssize_t n = 0;
632    int width = 0;
633    int precision = 0;
634    int zeropad;
635    const char* f;
636    Py_UNICODE *s;
637    PyObject *string;
638    /* used by sprintf */
639    char buffer[21];
640    /* use abuffer instead of buffer, if we need more space
641     * (which can happen if there's a format specifier with width). */
642    char *abuffer = NULL;
643    char *realbuffer;
644    Py_ssize_t abuffersize = 0;
645    char fmt[60]; /* should be enough for %0width.precisionld */
646    const char *copy;
647
648#ifdef VA_LIST_IS_ARRAY
649    Py_MEMCPY(count, vargs, sizeof(va_list));
650#else
651#ifdef  __va_copy
652    __va_copy(count, vargs);
653#else
654    count = vargs;
655#endif
656#endif
657    /* step 1: count the number of %S/%R/%A format specifications
658     * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII() for
659     * these objects once during step 3 and put the result in
660     an array) */
661    for (f = format; *f; f++) {
662        if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A'))
663            ++callcount;
664    }
665    /* step 2: allocate memory for the results of
666     * PyObject_Str()/PyObject_Repr() calls */
667    if (callcount) {
668        callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
669        if (!callresults) {
670            PyErr_NoMemory();
671            return NULL;
672        }
673        callresult = callresults;
674    }
675    /* step 3: figure out how large a buffer we need */
676    for (f = format; *f; f++) {
677        if (*f == '%') {
678            const char* p = f;
679            width = 0;
680            while (ISDIGIT((unsigned)*f))
681                width = (width*10) + *f++ - '0';
682            while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
683                ;
684
685            /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
686             * they don't affect the amount of space we reserve.
687             */
688            if ((*f == 'l' || *f == 'z') &&
689                (f[1] == 'd' || f[1] == 'u'))
690                ++f;
691
692            switch (*f) {
693            case 'c':
694                (void)va_arg(count, int);
695                /* fall through... */
696            case '%':
697                n++;
698                break;
699            case 'd': case 'u': case 'i': case 'x':
700                (void) va_arg(count, int);
701                /* 20 bytes is enough to hold a 64-bit
702                   integer.  Decimal takes the most space.
703                   This isn't enough for octal.
704                   If a width is specified we need more
705                   (which we allocate later). */
706                if (width < 20)
707                    width = 20;
708                n += width;
709                if (abuffersize < width)
710                    abuffersize = width;
711                break;
712            case 's':
713            {
714                /* UTF-8 */
715                unsigned char*s;
716                s = va_arg(count, unsigned char*);
717                while (*s) {
718                    if (*s < 128) {
719                        n++; s++;
720                    } else if (*s < 0xc0) {
721                        /* invalid UTF-8 */
722                        n++; s++;
723                    } else if (*s < 0xc0) {
724                        n++;
725                        s++; if(!*s)break;
726                        s++;
727                    } else if (*s < 0xe0) {
728                        n++;
729                        s++; if(!*s)break;
730                        s++; if(!*s)break;
731                        s++;
732                    } else {
733#ifdef Py_UNICODE_WIDE
734                        n++;
735#else
736                        n+=2;
737#endif
738                        s++; if(!*s)break;
739                        s++; if(!*s)break;
740                        s++; if(!*s)break;
741                        s++;
742                    }
743                }
744                break;
745            }
746            case 'U':
747            {
748                PyObject *obj = va_arg(count, PyObject *);
749                assert(obj && PyUnicode_Check(obj));
750                n += PyUnicode_GET_SIZE(obj);
751                break;
752            }
753            case 'V':
754            {
755                PyObject *obj = va_arg(count, PyObject *);
756                const char *str = va_arg(count, const char *);
757                assert(obj || str);
758                assert(!obj || PyUnicode_Check(obj));
759                if (obj)
760                    n += PyUnicode_GET_SIZE(obj);
761                else
762                    n += strlen(str);
763                break;
764            }
765            case 'S':
766            {
767                PyObject *obj = va_arg(count, PyObject *);
768                PyObject *str;
769                assert(obj);
770                str = PyObject_Str(obj);
771                if (!str)
772                    goto fail;
773                n += PyUnicode_GET_SIZE(str);
774                /* Remember the str and switch to the next slot */
775                *callresult++ = str;
776                break;
777            }
778            case 'R':
779            {
780                PyObject *obj = va_arg(count, PyObject *);
781                PyObject *repr;
782                assert(obj);
783                repr = PyObject_Repr(obj);
784                if (!repr)
785                    goto fail;
786                n += PyUnicode_GET_SIZE(repr);
787                /* Remember the repr and switch to the next slot */
788                *callresult++ = repr;
789                break;
790            }
791            case 'A':
792            {
793                PyObject *obj = va_arg(count, PyObject *);
794                PyObject *ascii;
795                assert(obj);
796                ascii = PyObject_ASCII(obj);
797                if (!ascii)
798                    goto fail;
799                n += PyUnicode_GET_SIZE(ascii);
800                /* Remember the repr and switch to the next slot */
801                *callresult++ = ascii;
802                break;
803            }
804            case 'p':
805                (void) va_arg(count, int);
806                /* maximum 64-bit pointer representation:
807                 * 0xffffffffffffffff
808                 * so 19 characters is enough.
809                 * XXX I count 18 -- what's the extra for?
810                 */
811                n += 19;
812                break;
813            default:
814                /* if we stumble upon an unknown
815                   formatting code, copy the rest of
816                   the format string to the output
817                   string. (we cannot just skip the
818                   code, since there's no way to know
819                   what's in the argument list) */
820                n += strlen(p);
821                goto expand;
822            }
823        } else
824            n++;
825    }
826  expand:
827    if (abuffersize > 20) {
828        abuffer = PyObject_Malloc(abuffersize);
829        if (!abuffer) {
830            PyErr_NoMemory();
831            goto fail;
832        }
833        realbuffer = abuffer;
834    }
835    else
836        realbuffer = buffer;
837    /* step 4: fill the buffer */
838    /* Since we've analyzed how much space we need for the worst case,
839       we don't have to resize the string.
840       There can be no errors beyond this point. */
841    string = PyUnicode_FromUnicode(NULL, n);
842    if (!string)
843        goto fail;
844
845    s = PyUnicode_AS_UNICODE(string);
846    callresult = callresults;
847
848    for (f = format; *f; f++) {
849        if (*f == '%') {
850            const char* p = f++;
851            int longflag = 0;
852            int size_tflag = 0;
853            zeropad = (*f == '0');
854            /* parse the width.precision part */
855            width = 0;
856            while (ISDIGIT((unsigned)*f))
857                width = (width*10) + *f++ - '0';
858            precision = 0;
859            if (*f == '.') {
860                f++;
861                while (ISDIGIT((unsigned)*f))
862                    precision = (precision*10) + *f++ - '0';
863            }
864            /* handle the long flag, but only for %ld and %lu.
865               others can be added when necessary. */
866            if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
867                longflag = 1;
868                ++f;
869            }
870            /* handle the size_t flag. */
871            if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
872                size_tflag = 1;
873                ++f;
874            }
875
876            switch (*f) {
877            case 'c':
878                *s++ = va_arg(vargs, int);
879                break;
880            case 'd':
881                makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
882                if (longflag)
883                    sprintf(realbuffer, fmt, va_arg(vargs, long));
884                else if (size_tflag)
885                    sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
886                else
887                    sprintf(realbuffer, fmt, va_arg(vargs, int));
888                appendstring(realbuffer);
889                break;
890            case 'u':
891                makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
892                if (longflag)
893                    sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
894                else if (size_tflag)
895                    sprintf(realbuffer, fmt, va_arg(vargs, size_t));
896                else
897                    sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
898                appendstring(realbuffer);
899                break;
900            case 'i':
901                makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
902                sprintf(realbuffer, fmt, va_arg(vargs, int));
903                appendstring(realbuffer);
904                break;
905            case 'x':
906                makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
907                sprintf(realbuffer, fmt, va_arg(vargs, int));
908                appendstring(realbuffer);
909                break;
910            case 's':
911            {
912                /* Parameter must be UTF-8 encoded.
913                   In case of encoding errors, use
914                   the replacement character. */
915                PyObject *u;
916                p = va_arg(vargs, char*);
917                u = PyUnicode_DecodeUTF8(p, strlen(p),
918                                         "replace");
919                if (!u)
920                    goto fail;
921                Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
922                                PyUnicode_GET_SIZE(u));
923                s += PyUnicode_GET_SIZE(u);
924                Py_DECREF(u);
925                break;
926            }
927            case 'U':
928            {
929                PyObject *obj = va_arg(vargs, PyObject *);
930                Py_ssize_t size = PyUnicode_GET_SIZE(obj);
931                Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
932                s += size;
933                break;
934            }
935            case 'V':
936            {
937                PyObject *obj = va_arg(vargs, PyObject *);
938                const char *str = va_arg(vargs, const char *);
939                if (obj) {
940                    Py_ssize_t size = PyUnicode_GET_SIZE(obj);
941                    Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
942                    s += size;
943                } else {
944                    appendstring(str);
945                }
946                break;
947            }
948            case 'S':
949            case 'R':
950            {
951                Py_UNICODE *ucopy;
952                Py_ssize_t usize;
953                Py_ssize_t upos;
954                /* unused, since we already have the result */
955                (void) va_arg(vargs, PyObject *);
956                ucopy = PyUnicode_AS_UNICODE(*callresult);
957                usize = PyUnicode_GET_SIZE(*callresult);
958                for (upos = 0; upos<usize;)
959                    *s++ = ucopy[upos++];
960                /* We're done with the unicode()/repr() => forget it */
961                Py_DECREF(*callresult);
962                /* switch to next unicode()/repr() result */
963                ++callresult;
964                break;
965            }
966            case 'p':
967                sprintf(buffer, "%p", va_arg(vargs, void*));
968                /* %p is ill-defined:  ensure leading 0x. */
969                if (buffer[1] == 'X')
970                    buffer[1] = 'x';
971                else if (buffer[1] != 'x') {
972                    memmove(buffer+2, buffer, strlen(buffer)+1);
973                    buffer[0] = '0';
974                    buffer[1] = 'x';
975                }
976                appendstring(buffer);
977                break;
978            case '%':
979                *s++ = '%';
980                break;
981            default:
982                appendstring(p);
983                goto end;
984            }
985        } else
986            *s++ = *f;
987    }
988
989  end:
990    if (callresults)
991        PyObject_Free(callresults);
992    if (abuffer)
993        PyObject_Free(abuffer);
994    PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
995    return string;
996  fail:
997    if (callresults) {
998        PyObject **callresult2 = callresults;
999        while (callresult2 < callresult) {
1000            Py_DECREF(*callresult2);
1001            ++callresult2;
1002        }
1003        PyObject_Free(callresults);
1004    }
1005    if (abuffer)
1006        PyObject_Free(abuffer);
1007    return NULL;
1008}
1009
1010#undef appendstring
1011
1012PyObject *
1013PyUnicode_FromFormat(const char *format, ...)
1014{
1015    PyObject* ret;
1016    va_list vargs;
1017
1018#ifdef HAVE_STDARG_PROTOTYPES
1019    va_start(vargs, format);
1020#else
1021    va_start(vargs);
1022#endif
1023    ret = PyUnicode_FromFormatV(format, vargs);
1024    va_end(vargs);
1025    return ret;
1026}
1027
1028Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1029                                wchar_t *w,
1030                                Py_ssize_t size)
1031{
1032    if (unicode == NULL) {
1033        PyErr_BadInternalCall();
1034        return -1;
1035    }
1036
1037    /* If possible, try to copy the 0-termination as well */
1038    if (size > PyUnicode_GET_SIZE(unicode))
1039        size = PyUnicode_GET_SIZE(unicode) + 1;
1040
1041#ifdef HAVE_USABLE_WCHAR_T
1042    memcpy(w, unicode->str, size * sizeof(wchar_t));
1043#else
1044    {
1045        register Py_UNICODE *u;
1046        register Py_ssize_t i;
1047        u = PyUnicode_AS_UNICODE(unicode);
1048        for (i = size; i > 0; i--)
1049            *w++ = *u++;
1050    }
1051#endif
1052
1053    if (size > PyUnicode_GET_SIZE(unicode))
1054        return PyUnicode_GET_SIZE(unicode);
1055    else
1056        return size;
1057}
1058
1059#endif
1060
1061PyObject *PyUnicode_FromOrdinal(int ordinal)
1062{
1063    Py_UNICODE s[2];
1064
1065    if (ordinal < 0 || ordinal > 0x10ffff) {
1066        PyErr_SetString(PyExc_ValueError,
1067                        "chr() arg not in range(0x110000)");
1068        return NULL;
1069    }
1070
1071#ifndef Py_UNICODE_WIDE
1072    if (ordinal > 0xffff) {
1073        ordinal -= 0x10000;
1074        s[0] = 0xD800 | (ordinal >> 10);
1075        s[1] = 0xDC00 | (ordinal & 0x3FF);
1076        return PyUnicode_FromUnicode(s, 2);
1077    }
1078#endif
1079
1080    s[0] = (Py_UNICODE)ordinal;
1081    return PyUnicode_FromUnicode(s, 1);
1082}
1083
1084PyObject *PyUnicode_FromObject(register PyObject *obj)
1085{
1086    /* XXX Perhaps we should make this API an alias of
1087       PyObject_Str() instead ?! */
1088    if (PyUnicode_CheckExact(obj)) {
1089        Py_INCREF(obj);
1090        return obj;
1091    }
1092    if (PyUnicode_Check(obj)) {
1093        /* For a Unicode subtype that's not a Unicode object,
1094           return a true Unicode object with the same data. */
1095        return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1096                                     PyUnicode_GET_SIZE(obj));
1097    }
1098    PyErr_Format(PyExc_TypeError,
1099                 "Can't convert '%.100s' object to str implicitly",
1100                 Py_TYPE(obj)->tp_name);
1101    return NULL;
1102}
1103
1104PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1105                                      const char *encoding,
1106                                      const char *errors)
1107{
1108    const char *s = NULL;
1109    Py_ssize_t len;
1110    PyObject *v;
1111
1112    if (obj == NULL) {
1113        PyErr_BadInternalCall();
1114        return NULL;
1115    }
1116
1117    if (PyUnicode_Check(obj)) {
1118        PyErr_SetString(PyExc_TypeError,
1119                        "decoding str is not supported");
1120        return NULL;
1121    }
1122
1123    /* Coerce object */
1124    if (PyBytes_Check(obj)) {
1125        s = PyBytes_AS_STRING(obj);
1126        len = PyBytes_GET_SIZE(obj);
1127    }
1128    else if (PyByteArray_Check(obj)) {
1129        s = PyByteArray_AS_STRING(obj);
1130        len = PyByteArray_GET_SIZE(obj);
1131    }
1132    else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1133        /* Overwrite the error message with something more useful in
1134           case of a TypeError. */
1135        if (PyErr_ExceptionMatches(PyExc_TypeError))
1136            PyErr_Format(PyExc_TypeError,
1137                         "coercing to str: need string or buffer, "
1138                         "%.80s found",
1139                         Py_TYPE(obj)->tp_name);
1140        goto onError;
1141    }
1142
1143    /* Convert to Unicode */
1144    if (len == 0) {
1145        Py_INCREF(unicode_empty);
1146        v = (PyObject *)unicode_empty;
1147    }
1148    else
1149        v = PyUnicode_Decode(s, len, encoding, errors);
1150
1151    return v;
1152
1153  onError:
1154    return NULL;
1155}
1156
1157PyObject *PyUnicode_Decode(const char *s,
1158                           Py_ssize_t size,
1159                           const char *encoding,
1160                           const char *errors)
1161{
1162    PyObject *buffer = NULL, *unicode;
1163    Py_buffer info;
1164    char lower[20];  /* Enough for any encoding name we recognize */
1165    char *l;
1166    const char *e;
1167
1168    if (encoding == NULL)
1169        encoding = PyUnicode_GetDefaultEncoding();
1170
1171    /* Convert encoding to lower case and replace '_' with '-' in order to
1172       catch e.g. UTF_8 */
1173    e = encoding;
1174    l = lower;
1175    while (*e && l < &lower[(sizeof lower) - 2]) {
1176        if (ISUPPER(*e)) {
1177            *l++ = TOLOWER(*e++);
1178        }
1179        else if (*e == '_') {
1180            *l++ = '-';
1181            e++;
1182        }
1183        else {
1184            *l++ = *e++;
1185        }
1186    }
1187    *l = '\0';
1188
1189    /* Shortcuts for common default encodings */
1190    if (strcmp(lower, "utf-8") == 0)
1191        return PyUnicode_DecodeUTF8(s, size, errors);
1192    else if ((strcmp(lower, "latin-1") == 0) ||
1193             (strcmp(lower, "iso-8859-1") == 0))
1194        return PyUnicode_DecodeLatin1(s, size, errors);
1195#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1196    else if (strcmp(lower, "mbcs") == 0)
1197        return PyUnicode_DecodeMBCS(s, size, errors);
1198#endif
1199    else if (strcmp(lower, "ascii") == 0)
1200        return PyUnicode_DecodeASCII(s, size, errors);
1201    else if (strcmp(lower, "utf-16") == 0)
1202        return PyUnicode_DecodeUTF16(s, size, errors, 0);
1203    else if (strcmp(lower, "utf-32") == 0)
1204        return PyUnicode_DecodeUTF32(s, size, errors, 0);
1205
1206    /* Decode via the codec registry */
1207    buffer = NULL;
1208    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
1209        goto onError;
1210    buffer = PyMemoryView_FromBuffer(&info);
1211    if (buffer == NULL)
1212        goto onError;
1213    unicode = PyCodec_Decode(buffer, encoding, errors);
1214    if (unicode == NULL)
1215        goto onError;
1216    if (!PyUnicode_Check(unicode)) {
1217        PyErr_Format(PyExc_TypeError,
1218                     "decoder did not return a str object (type=%.400s)",
1219                     Py_TYPE(unicode)->tp_name);
1220        Py_DECREF(unicode);
1221        goto onError;
1222    }
1223    Py_DECREF(buffer);
1224    return unicode;
1225
1226  onError:
1227    Py_XDECREF(buffer);
1228    return NULL;
1229}
1230
1231PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1232                                    const char *encoding,
1233                                    const char *errors)
1234{
1235    PyObject *v;
1236
1237    if (!PyUnicode_Check(unicode)) {
1238        PyErr_BadArgument();
1239        goto onError;
1240    }
1241
1242    if (encoding == NULL)
1243        encoding = PyUnicode_GetDefaultEncoding();
1244
1245    /* Decode via the codec registry */
1246    v = PyCodec_Decode(unicode, encoding, errors);
1247    if (v == NULL)
1248        goto onError;
1249    return v;
1250
1251  onError:
1252    return NULL;
1253}
1254
1255PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1256                                     const char *encoding,
1257                                     const char *errors)
1258{
1259    PyObject *v;
1260
1261    if (!PyUnicode_Check(unicode)) {
1262        PyErr_BadArgument();
1263        goto onError;
1264    }
1265
1266    if (encoding == NULL)
1267        encoding = PyUnicode_GetDefaultEncoding();
1268
1269    /* Decode via the codec registry */
1270    v = PyCodec_Decode(unicode, encoding, errors);
1271    if (v == NULL)
1272        goto onError;
1273    if (!PyUnicode_Check(v)) {
1274        PyErr_Format(PyExc_TypeError,
1275                     "decoder did not return a str object (type=%.400s)",
1276                     Py_TYPE(v)->tp_name);
1277        Py_DECREF(v);
1278        goto onError;
1279    }
1280    return v;
1281
1282  onError:
1283    return NULL;
1284}
1285
1286PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1287                           Py_ssize_t size,
1288                           const char *encoding,
1289                           const char *errors)
1290{
1291    PyObject *v, *unicode;
1292
1293    unicode = PyUnicode_FromUnicode(s, size);
1294    if (unicode == NULL)
1295        return NULL;
1296    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1297    Py_DECREF(unicode);
1298    return v;
1299}
1300
1301PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1302                                    const char *encoding,
1303                                    const char *errors)
1304{
1305    PyObject *v;
1306
1307    if (!PyUnicode_Check(unicode)) {
1308        PyErr_BadArgument();
1309        goto onError;
1310    }
1311
1312    if (encoding == NULL)
1313        encoding = PyUnicode_GetDefaultEncoding();
1314
1315    /* Encode via the codec registry */
1316    v = PyCodec_Encode(unicode, encoding, errors);
1317    if (v == NULL)
1318        goto onError;
1319    return v;
1320
1321  onError:
1322    return NULL;
1323}
1324
1325PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1326                                    const char *encoding,
1327                                    const char *errors)
1328{
1329    PyObject *v;
1330
1331    if (!PyUnicode_Check(unicode)) {
1332        PyErr_BadArgument();
1333        return NULL;
1334    }
1335
1336    if (encoding == NULL)
1337        encoding = PyUnicode_GetDefaultEncoding();
1338
1339    /* Shortcuts for common default encodings */
1340    if (errors == NULL) {
1341        if (strcmp(encoding, "utf-8") == 0)
1342            return PyUnicode_AsUTF8String(unicode);
1343        else if (strcmp(encoding, "latin-1") == 0)
1344            return PyUnicode_AsLatin1String(unicode);
1345#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1346        else if (strcmp(encoding, "mbcs") == 0)
1347            return PyUnicode_AsMBCSString(unicode);
1348#endif
1349        else if (strcmp(encoding, "ascii") == 0)
1350            return PyUnicode_AsASCIIString(unicode);
1351        /* During bootstrap, we may need to find the encodings
1352           package, to load the file system encoding, and require the
1353           file system encoding in order to load the encodings
1354           package.
1355
1356           Break out of this dependency by assuming that the path to
1357           the encodings module is ASCII-only.  XXX could try wcstombs
1358           instead, if the file system encoding is the locale's
1359           encoding. */
1360        else if (Py_FileSystemDefaultEncoding &&
1361                 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1362                 !PyThreadState_GET()->interp->codecs_initialized)
1363            return PyUnicode_AsASCIIString(unicode);
1364    }
1365
1366    /* Encode via the codec registry */
1367    v = PyCodec_Encode(unicode, encoding, errors);
1368    if (v == NULL)
1369        return NULL;
1370
1371    /* The normal path */
1372    if (PyBytes_Check(v))
1373        return v;
1374
1375    /* If the codec returns a buffer, raise a warning and convert to bytes */
1376    if (PyByteArray_Check(v)) {
1377        char msg[100];
1378        PyObject *b;
1379        PyOS_snprintf(msg, sizeof(msg),
1380                      "encoder %s returned buffer instead of bytes",
1381                      encoding);
1382        if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
1383            Py_DECREF(v);
1384            return NULL;
1385        }
1386
1387        b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1388        Py_DECREF(v);
1389        return b;
1390    }
1391
1392    PyErr_Format(PyExc_TypeError,
1393                 "encoder did not return a bytes object (type=%.400s)",
1394                 Py_TYPE(v)->tp_name);
1395    Py_DECREF(v);
1396    return NULL;
1397}
1398
1399PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1400                                     const char *encoding,
1401                                     const char *errors)
1402{
1403    PyObject *v;
1404
1405    if (!PyUnicode_Check(unicode)) {
1406        PyErr_BadArgument();
1407        goto onError;
1408    }
1409
1410    if (encoding == NULL)
1411        encoding = PyUnicode_GetDefaultEncoding();
1412
1413    /* Encode via the codec registry */
1414    v = PyCodec_Encode(unicode, encoding, errors);
1415    if (v == NULL)
1416        goto onError;
1417    if (!PyUnicode_Check(v)) {
1418        PyErr_Format(PyExc_TypeError,
1419                     "encoder did not return an str object (type=%.400s)",
1420                     Py_TYPE(v)->tp_name);
1421        Py_DECREF(v);
1422        goto onError;
1423    }
1424    return v;
1425
1426  onError:
1427    return NULL;
1428}
1429
1430PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1431                                            const char *errors)
1432{
1433    PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1434    if (v)
1435        return v;
1436    if (errors != NULL)
1437        Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
1438    v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1439                             PyUnicode_GET_SIZE(unicode),
1440                             NULL);
1441    if (!v)
1442        return NULL;
1443    ((PyUnicodeObject *)unicode)->defenc = v;
1444    return v;
1445}
1446
1447PyObject*
1448PyUnicode_DecodeFSDefault(const char *s) {
1449    Py_ssize_t size = (Py_ssize_t)strlen(s);
1450    return PyUnicode_DecodeFSDefaultAndSize(s, size);
1451}
1452
1453PyObject*
1454PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1455{
1456    /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1457       can be undefined. If it is case, decode using UTF-8. The following assumes
1458       that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1459       bootstrapping process where the codecs aren't ready yet.
1460    */
1461    if (Py_FileSystemDefaultEncoding) {
1462#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1463        if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
1464            return PyUnicode_DecodeMBCS(s, size, "replace");
1465        }
1466#elif defined(__APPLE__)
1467        if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
1468            return PyUnicode_DecodeUTF8(s, size, "replace");
1469        }
1470#endif
1471        return PyUnicode_Decode(s, size,
1472                                Py_FileSystemDefaultEncoding,
1473                                "replace");
1474    }
1475    else {
1476        return PyUnicode_DecodeUTF8(s, size, "replace");
1477    }
1478}
1479
1480char*
1481_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
1482{
1483    PyObject *bytes;
1484    if (!PyUnicode_Check(unicode)) {
1485        PyErr_BadArgument();
1486        return NULL;
1487    }
1488    bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1489    if (bytes == NULL)
1490        return NULL;
1491    if (psize != NULL)
1492        *psize = PyBytes_GET_SIZE(bytes);
1493    return PyBytes_AS_STRING(bytes);
1494}
1495
1496char*
1497_PyUnicode_AsString(PyObject *unicode)
1498{
1499    return _PyUnicode_AsStringAndSize(unicode, NULL);
1500}
1501
1502Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1503{
1504    if (!PyUnicode_Check(unicode)) {
1505        PyErr_BadArgument();
1506        goto onError;
1507    }
1508    return PyUnicode_AS_UNICODE(unicode);
1509
1510  onError:
1511    return NULL;
1512}
1513
1514Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1515{
1516    if (!PyUnicode_Check(unicode)) {
1517        PyErr_BadArgument();
1518        goto onError;
1519    }
1520    return PyUnicode_GET_SIZE(unicode);
1521
1522  onError:
1523    return -1;
1524}
1525
1526const char *PyUnicode_GetDefaultEncoding(void)
1527{
1528    return unicode_default_encoding;
1529}
1530
1531int PyUnicode_SetDefaultEncoding(const char *encoding)
1532{
1533    if (strcmp(encoding, unicode_default_encoding) != 0) {
1534        PyErr_Format(PyExc_ValueError,
1535                     "Can only set default encoding to %s",
1536                     unicode_default_encoding);
1537        return -1;
1538    }
1539    return 0;
1540}
1541
1542/* error handling callback helper:
1543   build arguments, call the callback and check the arguments,
1544   if no exception occurred, copy the replacement to the output
1545   and adjust various state variables.
1546   return 0 on success, -1 on error
1547*/
1548
1549static
1550int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1551                                     const char *encoding, const char *reason,
1552                                     const char **input, const char **inend, Py_ssize_t *startinpos,
1553                                     Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1554                                     PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
1555{
1556    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
1557
1558    PyObject *restuple = NULL;
1559    PyObject *repunicode = NULL;
1560    Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1561    Py_ssize_t insize;
1562    Py_ssize_t requiredsize;
1563    Py_ssize_t newpos;
1564    Py_UNICODE *repptr;
1565    PyObject *inputobj = NULL;
1566    Py_ssize_t repsize;
1567    int res = -1;
1568
1569    if (*errorHandler == NULL) {
1570        *errorHandler = PyCodec_LookupError(errors);
1571        if (*errorHandler == NULL)
1572            goto onError;
1573    }
1574
1575    if (*exceptionObject == NULL) {
1576        *exceptionObject = PyUnicodeDecodeError_Create(
1577            encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
1578        if (*exceptionObject == NULL)
1579            goto onError;
1580    }
1581    else {
1582        if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1583            goto onError;
1584        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1585            goto onError;
1586        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1587            goto onError;
1588    }
1589
1590    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1591    if (restuple == NULL)
1592        goto onError;
1593    if (!PyTuple_Check(restuple)) {
1594        PyErr_Format(PyExc_TypeError, &argparse[4]);
1595        goto onError;
1596    }
1597    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1598        goto onError;
1599
1600    /* Copy back the bytes variables, which might have been modified by the
1601       callback */
1602    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1603    if (!inputobj)
1604        goto onError;
1605    if (!PyBytes_Check(inputobj)) {
1606        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1607    }
1608    *input = PyBytes_AS_STRING(inputobj);
1609    insize = PyBytes_GET_SIZE(inputobj);
1610    *inend = *input + insize;
1611    /* we can DECREF safely, as the exception has another reference,
1612       so the object won't go away. */
1613    Py_DECREF(inputobj);
1614
1615    if (newpos<0)
1616        newpos = insize+newpos;
1617    if (newpos<0 || newpos>insize) {
1618        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1619        goto onError;
1620    }
1621
1622    /* need more space? (at least enough for what we
1623       have+the replacement+the rest of the string (starting
1624       at the new input position), so we won't have to check space
1625       when there are no errors in the rest of the string) */
1626    repptr = PyUnicode_AS_UNICODE(repunicode);
1627    repsize = PyUnicode_GET_SIZE(repunicode);
1628    requiredsize = *outpos + repsize + insize-newpos;
1629    if (requiredsize > outsize) {
1630        if (requiredsize<2*outsize)
1631            requiredsize = 2*outsize;
1632        if (_PyUnicode_Resize(output, requiredsize) < 0)
1633            goto onError;
1634        *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1635    }
1636    *endinpos = newpos;
1637    *inptr = *input + newpos;
1638    Py_UNICODE_COPY(*outptr, repptr, repsize);
1639    *outptr += repsize;
1640    *outpos += repsize;
1641
1642    /* we made it! */
1643    res = 0;
1644
1645  onError:
1646    Py_XDECREF(restuple);
1647    return res;
1648}
1649
1650/* --- UTF-7 Codec -------------------------------------------------------- */
1651
1652/* see RFC2152 for details */
1653
/* Classification table for the 128 ASCII code points, indexed by
   character code; consulted by the SPECIAL() macro.  Each row below
   covers 16 consecutive code points. */
static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
    /* 0x00 - 0x0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
1672
/* SPECIAL(c, encodeO, encodeWS) is true when character code `c` is
   treated as special: c is outside 1..127, or its utf7_special entry is
   1, or the entry is 2 and `encodeWS` is set, or the entry is 3 and
   `encodeO` is set.  The argument `c` is evaluated several times.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true  */

#define SPECIAL(c, encodeO, encodeWS)                   \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) ||          \
     (encodeO && (utf7_special[(c)] == 3)))
1682
/* B64(n): the base64 alphabet character for the low 6 bits of `n`. */
#define B64(n)                                                          \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* B64CHAR(c): true if `c` is alphanumeric (per ISALNUM), '+' or '/'. */
#define B64CHAR(c)                              \
    (ISALNUM(c) || (c) == '+' || (c) == '/')
/* UB64(c): inverse of B64() for a character of the alphabet:
   '+' -> 62, '/' -> 63, 'a'.. -> c - 71 (26 and up), 'A'.. -> c - 65
   (0 and up), anything else -> c + 4 (so '0' maps to 52).  No validation
   is performed here; the argument is evaluated several times. */
#define UB64(c)                                         \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )
1690
/* ENCODE(out, ch, bits): while at least 6 bits are pending in `ch`
   (`bits` holds the count), write the base64 character for the topmost
   6 pending bits to *out, advance `out` and reduce `bits` by 6.
   `out` and `bits` are modified in place; `ch` is left unchanged.
   Expands to a bare while statement (not wrapped in do { } while (0)). */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }
1696
/* DECODE(out, ch, bits, surrogate): while at least 16 bits are pending
   in `ch` (`bits` holds the count), extract the topmost 16 pending bits
   as one Py_UNICODE and reduce `bits` by 16.  Then:
     - if `surrogate` is set, the unit is dropped and the flag cleared;
     - else if the unit lies in 0xDC00..0xDFFF, `surrogate` is set,
       `errmsg` is assigned and control jumps to the `utf7Error` label;
     - otherwise the unit is stored through `out`, which is advanced.
   The expanding function must therefore provide a variable named
   `errmsg` and a label named `utf7Error`.
   NOTE(review): the inner comments speak of a "high surrogate" and a
   "surrogate pair", while the range actually tested is 0xDC00..0xDFFF. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
1715
1716PyObject *PyUnicode_DecodeUTF7(const char *s,
1717                               Py_ssize_t size,
1718                               const char *errors)
1719{
1720    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1721}
1722
/* Decode `size` bytes at `s` as UTF-7 (see RFC2152).

   s        - input bytes
   size     - number of input bytes
   errors   - name of the error handler, forwarded to
              unicode_decode_call_errorhandler()
   consumed - if not NULL, an input that ends inside a shift sequence is
              not reported as an error; *consumed receives `startinpos`
              when decoding ended inside a shift sequence and the number
              of bytes processed otherwise.

   The output object is created with room for `size` characters and is
   resized to the number of characters actually written before being
   returned.  Decoding errors are routed through
   unicode_decode_call_errorhandler() with the encoding name "utf7".
   Returns a new unicode object, or NULL with an exception set. */
PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
                                       Py_ssize_t size,
                                       const char *errors,
                                       Py_ssize_t *consumed)
{
    const char *starts = s;         /* start of the input; may be replaced by the error handler helper */
    Py_ssize_t startinpos;          /* set when a '+' or a special character is seen outside a shift sequence */
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;                  /* one past the last input byte */
    PyUnicodeObject *unicode;
    Py_UNICODE *p;                  /* write position in the output */
    const char *errmsg = "";        /* also assigned by the DECODE macro */
    int inShift = 0;                /* non-zero while inside a '+' shift sequence */
    unsigned int bitsleft = 0;      /* number of pending bits held in charsleft */
    unsigned long charsleft = 0;    /* accumulator for base64-decoded bits */
    int surrogate = 0;              /* state flag used by the DECODE macro */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    p = unicode->str;
    e = s + size;

    while (s < e) {
        Py_UNICODE ch;
        /* re-entry point used after the "unterminated shift sequence"
           handler below moved `s` back inside the input */
      restart:
        ch = (unsigned char) *s;

        if (inShift) {
            if ((ch == '-') || !B64CHAR(ch)) {
                /* '-' or any non-base64 character ends the shift sequence */
                inShift = 0;
                s++;

                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
                if (bitsleft >= 6) {
                    /* The shift sequence has a partial character in it. If
                       bitsleft < 6 then we could just classify it as padding
                       but that is not the case here */

                    errmsg = "partial character in shift sequence";
                    goto utf7Error;
                }
                /* According to RFC2152 the remaining bits should be zero. We
                   choose to signal an error/insert a replacement character
                   here to indicate the potential of a misencoded character. */

                /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
                if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
                    errmsg = "non-zero padding bits in shift sequence";
                    goto utf7Error;
                }

                if (ch == '-') {
                    /* a second '-' right after the terminator emits a
                       literal '-' and re-enters the shift state without
                       consuming it here */
                    if ((s < e) && (*(s) == '-')) {
                        *p++ = '-';
                        inShift = 1;
                    }
                } else if (SPECIAL(ch,0,0)) {
                    errmsg = "unexpected special character";
                    goto utf7Error;
                } else  {
                    /* the terminating character is itself copied to the output */
                    *p++ = ch;
                }
            } else {
                /* base64 character: append its 6 bits and emit any
                   complete 16-bit units */
                charsleft = (charsleft << 6) | UB64(ch);
                bitsleft += 6;
                s++;
                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
            }
        }
        else if ( ch == '+' ) {
            startinpos = s-starts;
            s++;
            if (s < e && *s == '-') {
                /* "+-" stands for a literal '+' */
                s++;
                *p++ = '+';
            } else
            {
                inShift = 1;
                bitsleft = 0;
            }
        }
        else if (SPECIAL(ch,0,0)) {
            startinpos = s-starts;
            errmsg = "unexpected special character";
            s++;
            goto utf7Error;
        }
        else {
            /* directly encoded character */
            *p++ = ch;
            s++;
        }
        continue;
      utf7Error:
        /* reached only via goto (also from inside the DECODE macro);
           the helper may replace starts/e/s and grow the output */
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &unicode, &outpos, &p))
            goto onError;
    }

    /* input ended inside a shift sequence: an error unless the caller
       asked for the number of consumed bytes */
    if (inShift && !consumed) {
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = size;
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf7", "unterminated shift sequence",
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &unicode, &outpos, &p))
            goto onError;
        if (s < e)
            goto restart;
    }
    if (consumed) {
        if(inShift)
            *consumed = startinpos;
        else
            *consumed = s-starts;
    }

    /* trim the output to the number of characters actually written */
    if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
1868
1869
1870PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1871                               Py_ssize_t size,
1872                               int encodeSetO,
1873                               int encodeWhiteSpace,
1874                               const char *errors)
1875{
1876    PyObject *v;
1877    /* It might be possible to tighten this worst case */
1878    Py_ssize_t cbAllocated = 5 * size;
1879    int inShift = 0;
1880    Py_ssize_t i = 0;
1881    unsigned int bitsleft = 0;
1882    unsigned long charsleft = 0;
1883    char * out;
1884    char * start;
1885
1886    if (size == 0)
1887        return PyBytes_FromStringAndSize(NULL, 0);
1888
1889    if (cbAllocated / 5 != size)
1890        return PyErr_NoMemory();
1891
1892    v = PyBytes_FromStringAndSize(NULL, cbAllocated);
1893    if (v == NULL)
1894        return NULL;
1895
1896    start = out = PyBytes_AS_STRING(v);
1897    for (;i < size; ++i) {
1898        Py_UNICODE ch = s[i];
1899
1900        if (!inShift) {
1901            if (ch == '+') {
1902                *out++ = '+';
1903                *out++ = '-';
1904            } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1905                charsleft = ch;
1906                bitsleft = 16;
1907                *out++ = '+';
1908                /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1909                inShift = bitsleft > 0;
1910            } else {
1911                *out++ = (char) ch;
1912            }
1913        } else {
1914            if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1915                *out++ = B64(charsleft << (6-bitsleft));
1916                charsleft = 0;
1917                bitsleft = 0;
1918                /* Characters not in the BASE64 set implicitly unshift the sequence
1919                   so no '-' is required, except if the character is itself a '-' */
1920                if (B64CHAR(ch) || ch == '-') {
1921                    *out++ = '-';
1922                }
1923                inShift = 0;
1924                *out++ = (char) ch;
1925            } else {
1926                bitsleft += 16;
1927                charsleft = (charsleft << 16) | ch;
1928                /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1929
1930                /* If the next character is special then we dont' need to terminate
1931                   the shift sequence. If the next character is not a BASE64 character
1932                   or '-' then the shift sequence will be terminated implicitly and we
1933                   don't have to insert a '-'. */
1934
1935                if (bitsleft == 0) {
1936                    if (i + 1 < size) {
1937                        Py_UNICODE ch2 = s[i+1];
1938
1939                        if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
1940
1941                        } else if (B64CHAR(ch2) || ch2 == '-') {
1942                            *out++ = '-';
1943                            inShift = 0;
1944                        } else {
1945                            inShift = 0;
1946                        }
1947
1948                    }
1949                    else {
1950                        *out++ = '-';
1951                        inShift = 0;
1952                    }
1953                }
1954            }
1955        }
1956    }
1957    if (bitsleft) {
1958        *out++= B64(charsleft << (6-bitsleft) );
1959        *out++ = '-';
1960    }
1961    if (_PyBytes_Resize(&v, out - start) < 0)
1962        return NULL;
1963    return v;
1964}
1965
1966#undef SPECIAL
1967#undef B64
1968#undef B64CHAR
1969#undef UB64
1970#undef ENCODE
1971#undef DECODE
1972
1973/* --- UTF-8 Codec -------------------------------------------------------- */
1974
/* Map a UTF-8 lead (prefix) byte to the total length in bytes of the
   sequence it introduces.  Zero means the byte is not a legal prefix
   (0x80..0xBF, 0xFE and 0xFF).  The 5- and 6-byte entries
   follow the original UTF-8 definition in RFC 2279.

   The table is only ever read, so it is declared const. */
static
const char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: not a legal prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF: four-, five-, six-byte sequences, then illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1996
1997PyObject *PyUnicode_DecodeUTF8(const char *s,
1998                               Py_ssize_t size,
1999                               const char *errors)
2000{
2001    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2002}
2003
2004/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2005#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2006
2007/* Mask to quickly check whether a C 'long' contains a
2008   non-ASCII, UTF8-encoded char. */
2009#if (SIZEOF_LONG == 8)
2010# define ASCII_CHAR_MASK 0x8080808080808080L
2011#elif (SIZEOF_LONG == 4)
2012# define ASCII_CHAR_MASK 0x80808080L
2013#else
2014# error C 'long' size should be either 4 or 8!
2015#endif
2016
2017PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
2018                                       Py_ssize_t size,
2019                                       const char *errors,
2020                                       Py_ssize_t *consumed)
2021{
2022    const char *starts = s;
2023    int n;
2024    Py_ssize_t startinpos;
2025    Py_ssize_t endinpos;
2026    Py_ssize_t outpos;
2027    const char *e, *aligned_end;
2028    PyUnicodeObject *unicode;
2029    Py_UNICODE *p;
2030    const char *errmsg = "";
2031    PyObject *errorHandler = NULL;
2032    PyObject *exc = NULL;
2033
2034    /* Note: size will always be longer than the resulting Unicode
2035       character count */
2036    unicode = _PyUnicode_New(size);
2037    if (!unicode)
2038        return NULL;
2039    if (size == 0) {
2040        if (consumed)
2041            *consumed = 0;
2042        return (PyObject *)unicode;
2043    }
2044
2045    /* Unpack UTF-8 encoded data */
2046    p = unicode->str;
2047    e = s + size;
2048    aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
2049
2050    while (s < e) {
2051        Py_UCS4 ch = (unsigned char)*s;
2052
2053        if (ch < 0x80) {
2054            /* Fast path for runs of ASCII characters. Given that common UTF-8
2055               input will consist of an overwhelming majority of ASCII
2056               characters, we try to optimize for this case by checking
2057               as many characters as a C 'long' can contain.
2058               First, check if we can do an aligned read, as most CPUs have
2059               a penalty for unaligned reads.
2060            */
2061            if (!((size_t) s & LONG_PTR_MASK)) {
2062                /* Help register allocation */
2063                register const char *_s = s;
2064                register Py_UNICODE *_p = p;
2065                while (_s < aligned_end) {
2066                    /* Read a whole long at a time (either 4 or 8 bytes),
2067                       and do a fast unrolled copy if it only contains ASCII
2068                       characters. */
2069                    unsigned long data = *(unsigned long *) _s;
2070                    if (data & ASCII_CHAR_MASK)
2071                        break;
2072                    _p[0] = (unsigned char) _s[0];
2073                    _p[1] = (unsigned char) _s[1];
2074                    _p[2] = (unsigned char) _s[2];
2075                    _p[3] = (unsigned char) _s[3];
2076#if (SIZEOF_LONG == 8)
2077                    _p[4] = (unsigned char) _s[4];
2078                    _p[5] = (unsigned char) _s[5];
2079                    _p[6] = (unsigned char) _s[6];
2080                    _p[7] = (unsigned char) _s[7];
2081#endif
2082                    _s += SIZEOF_LONG;
2083                    _p += SIZEOF_LONG;
2084                }
2085                s = _s;
2086                p = _p;
2087                if (s == e)
2088                    break;
2089                ch = (unsigned char)*s;
2090            }
2091        }
2092
2093        if (ch < 0x80) {
2094            *p++ = (Py_UNICODE)ch;
2095            s++;
2096            continue;
2097        }
2098
2099        n = utf8_code_length[ch];
2100
2101        if (s + n > e) {
2102            if (consumed)
2103                break;
2104            else {
2105                errmsg = "unexpected end of data";
2106                startinpos = s-starts;
2107                endinpos = size;
2108                goto utf8Error;
2109            }
2110        }
2111
2112        switch (n) {
2113
2114        case 0:
2115            errmsg = "unexpected code byte";
2116            startinpos = s-starts;
2117            endinpos = startinpos+1;
2118            goto utf8Error;
2119
2120        case 1:
2121            errmsg = "internal error";
2122            startinpos = s-starts;
2123            endinpos = startinpos+1;
2124            goto utf8Error;
2125
2126        case 2:
2127            if ((s[1] & 0xc0) != 0x80) {
2128                errmsg = "invalid data";
2129                startinpos = s-starts;
2130                endinpos = startinpos+2;
2131                goto utf8Error;
2132            }
2133            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
2134            if (ch < 0x80) {
2135                startinpos = s-starts;
2136                endinpos = startinpos+2;
2137                errmsg = "illegal encoding";
2138                goto utf8Error;
2139            }
2140            else
2141                *p++ = (Py_UNICODE)ch;
2142            break;
2143
2144        case 3:
2145            if ((s[1] & 0xc0) != 0x80 ||
2146                (s[2] & 0xc0) != 0x80) {
2147                errmsg = "invalid data";
2148                startinpos = s-starts;
2149                endinpos = startinpos+3;
2150                goto utf8Error;
2151            }
2152            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
2153            if (ch < 0x0800) {
2154                /* Note: UTF-8 encodings of surrogates are considered
2155                   legal UTF-8 sequences;
2156
2157                   XXX For wide builds (UCS-4) we should probably try
2158                   to recombine the surrogates into a single code
2159                   unit.
2160                */
2161                errmsg = "illegal encoding";
2162                startinpos = s-starts;
2163                endinpos = startinpos+3;
2164                goto utf8Error;
2165            }
2166            else
2167                *p++ = (Py_UNICODE)ch;
2168            break;
2169
2170        case 4:
2171            if ((s[1] & 0xc0) != 0x80 ||
2172                (s[2] & 0xc0) != 0x80 ||
2173                (s[3] & 0xc0) != 0x80) {
2174                errmsg = "invalid data";
2175                startinpos = s-starts;
2176                endinpos = startinpos+4;
2177                goto utf8Error;
2178            }
2179            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2180                ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2181            /* validate and convert to UTF-16 */
2182            if ((ch < 0x10000)        /* minimum value allowed for 4
2183                                         byte encoding */
2184                || (ch > 0x10ffff))   /* maximum value allowed for
2185                                         UTF-16 */
2186            {
2187                errmsg = "illegal encoding";
2188                startinpos = s-starts;
2189                endinpos = startinpos+4;
2190                goto utf8Error;
2191            }
2192#ifdef Py_UNICODE_WIDE
2193            *p++ = (Py_UNICODE)ch;
2194#else
2195            /*  compute and append the two surrogates: */
2196
2197            /*  translate from 10000..10FFFF to 0..FFFF */
2198            ch -= 0x10000;
2199
2200            /*  high surrogate = top 10 bits added to D800 */
2201            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
2202
2203            /*  low surrogate = bottom 10 bits added to DC00 */
2204            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
2205#endif
2206            break;
2207
2208        default:
2209            /* Other sizes are only needed for UCS-4 */
2210            errmsg = "unsupported Unicode code range";
2211            startinpos = s-starts;
2212            endinpos = startinpos+n;
2213            goto utf8Error;
2214        }
2215        s += n;
2216        continue;
2217
2218      utf8Error:
2219        outpos = p-PyUnicode_AS_UNICODE(unicode);
2220        if (unicode_decode_call_errorhandler(
2221                errors, &errorHandler,
2222                "utf8", errmsg,
2223                &starts, &e, &startinpos, &endinpos, &exc, &s,
2224                &unicode, &outpos, &p))
2225            goto onError;
2226        aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
2227    }
2228    if (consumed)
2229        *consumed = s-starts;
2230
2231    /* Adjust length */
2232    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2233        goto onError;
2234
2235    Py_XDECREF(errorHandler);
2236    Py_XDECREF(exc);
2237    return (PyObject *)unicode;
2238
2239  onError:
2240    Py_XDECREF(errorHandler);
2241    Py_XDECREF(exc);
2242    Py_DECREF(unicode);
2243    return NULL;
2244}
2245
2246#undef ASCII_CHAR_MASK
2247
2248
2249/* Allocation strategy:  if the string is short, convert into a stack buffer
2250   and allocate exactly as much space needed at the end.  Else allocate the
2251   maximum possible needed (4 result bytes per Unicode character), and return
2252   the excess memory at the end.
2253*/
2254PyObject *
2255PyUnicode_EncodeUTF8(const Py_UNICODE *s,
2256                     Py_ssize_t size,
2257                     const char *errors)
2258{
2259#define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
2260
2261    Py_ssize_t i;                /* index into s of next input byte */
2262    PyObject *result;            /* result string object */
2263    char *p;                     /* next free byte in output buffer */
2264    Py_ssize_t nallocated;      /* number of result bytes allocated */
2265    Py_ssize_t nneeded;            /* number of result bytes needed */
2266    char stackbuf[MAX_SHORT_UNICHARS * 4];
2267
2268    assert(s != NULL);
2269    assert(size >= 0);
2270
2271    if (size <= MAX_SHORT_UNICHARS) {
2272        /* Write into the stack buffer; nallocated can't overflow.
2273         * At the end, we'll allocate exactly as much heap space as it
2274         * turns out we need.
2275         */
2276        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2277        result = NULL;   /* will allocate after we're done */
2278        p = stackbuf;
2279    }
2280    else {
2281        /* Overallocate on the heap, and give the excess back at the end. */
2282        nallocated = size * 4;
2283        if (nallocated / 4 != size)  /* overflow! */
2284            return PyErr_NoMemory();
2285        result = PyBytes_FromStringAndSize(NULL, nallocated);
2286        if (result == NULL)
2287            return NULL;
2288        p = PyBytes_AS_STRING(result);
2289    }
2290
2291    for (i = 0; i < size;) {
2292        Py_UCS4 ch = s[i++];
2293
2294        if (ch < 0x80)
2295            /* Encode ASCII */
2296            *p++ = (char) ch;
2297
2298        else if (ch < 0x0800) {
2299            /* Encode Latin-1 */
2300            *p++ = (char)(0xc0 | (ch >> 6));
2301            *p++ = (char)(0x80 | (ch & 0x3f));
2302        }
2303        else {
2304            /* Encode UCS2 Unicode ordinals */
2305            if (ch < 0x10000) {
2306                /* Special case: check for high surrogate */
2307                if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2308                    Py_UCS4 ch2 = s[i];
2309                    /* Check for low surrogate and combine the two to
2310                       form a UCS4 value */
2311                    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2312                        ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2313                        i++;
2314                        goto encodeUCS4;
2315                    }
2316                    /* Fall through: handles isolated high surrogates */
2317                }
2318                *p++ = (char)(0xe0 | (ch >> 12));
2319                *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2320                *p++ = (char)(0x80 | (ch & 0x3f));
2321                continue;
2322            }
2323          encodeUCS4:
2324            /* Encode UCS4 Unicode ordinals */
2325            *p++ = (char)(0xf0 | (ch >> 18));
2326            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2327            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2328            *p++ = (char)(0x80 | (ch & 0x3f));
2329        }
2330    }
2331
2332    if (result == NULL) {
2333        /* This was stack allocated. */
2334        nneeded = p - stackbuf;
2335        assert(nneeded <= nallocated);
2336        result = PyBytes_FromStringAndSize(stackbuf, nneeded);
2337    }
2338    else {
2339        /* Cut back to size actually needed. */
2340        nneeded = p - PyBytes_AS_STRING(result);
2341        assert(nneeded <= nallocated);
2342        _PyBytes_Resize(&result, nneeded);
2343    }
2344    return result;
2345
2346#undef MAX_SHORT_UNICHARS
2347}
2348
2349PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2350{
2351    if (!PyUnicode_Check(unicode)) {
2352        PyErr_BadArgument();
2353        return NULL;
2354    }
2355    return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2356                                PyUnicode_GET_SIZE(unicode),
2357                                NULL);
2358}
2359
2360/* --- UTF-32 Codec ------------------------------------------------------- */
2361
2362PyObject *
2363PyUnicode_DecodeUTF32(const char *s,
2364                      Py_ssize_t size,
2365                      const char *errors,
2366                      int *byteorder)
2367{
2368    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2369}
2370
2371PyObject *
2372PyUnicode_DecodeUTF32Stateful(const char *s,
2373                              Py_ssize_t size,
2374                              const char *errors,
2375                              int *byteorder,
2376                              Py_ssize_t *consumed)
2377{
2378    const char *starts = s;
2379    Py_ssize_t startinpos;
2380    Py_ssize_t endinpos;
2381    Py_ssize_t outpos;
2382    PyUnicodeObject *unicode;
2383    Py_UNICODE *p;
2384#ifndef Py_UNICODE_WIDE
2385    int i, pairs;
2386#else
2387    const int pairs = 0;
2388#endif
2389    const unsigned char *q, *e;
2390    int bo = 0;       /* assume native ordering by default */
2391    const char *errmsg = "";
2392    /* Offsets from q for retrieving bytes in the right order. */
2393#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2394    int iorder[] = {0, 1, 2, 3};
2395#else
2396    int iorder[] = {3, 2, 1, 0};
2397#endif
2398    PyObject *errorHandler = NULL;
2399    PyObject *exc = NULL;
2400    /* On narrow builds we split characters outside the BMP into two
2401       codepoints => count how much extra space we need. */
2402#ifndef Py_UNICODE_WIDE
2403    for (i = pairs = 0; i < size/4; i++)
2404        if (((Py_UCS4 *)s)[i] >= 0x10000)
2405            pairs++;
2406#endif
2407
2408    /* This might be one to much, because of a BOM */
2409    unicode = _PyUnicode_New((size+3)/4+pairs);
2410    if (!unicode)
2411        return NULL;
2412    if (size == 0)
2413        return (PyObject *)unicode;
2414
2415    /* Unpack UTF-32 encoded data */
2416    p = unicode->str;
2417    q = (unsigned char *)s;
2418    e = q + size;
2419
2420    if (byteorder)
2421        bo = *byteorder;
2422
2423    /* Check for BOM marks (U+FEFF) in the input and adjust current
2424       byte order setting accordingly. In native mode, the leading BOM
2425       mark is skipped, in all other modes, it is copied to the output
2426       stream as-is (giving a ZWNBSP character). */
2427    if (bo == 0) {
2428        if (size >= 4) {
2429            const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2430                (q[iorder[1]] << 8) | q[iorder[0]];
2431#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2432            if (bom == 0x0000FEFF) {
2433                q += 4;
2434                bo = -1;
2435            }
2436            else if (bom == 0xFFFE0000) {
2437                q += 4;
2438                bo = 1;
2439            }
2440#else
2441            if (bom == 0x0000FEFF) {
2442                q += 4;
2443                bo = 1;
2444            }
2445            else if (bom == 0xFFFE0000) {
2446                q += 4;
2447                bo = -1;
2448            }
2449#endif
2450        }
2451    }
2452
2453    if (bo == -1) {
2454        /* force LE */
2455        iorder[0] = 0;
2456        iorder[1] = 1;
2457        iorder[2] = 2;
2458        iorder[3] = 3;
2459    }
2460    else if (bo == 1) {
2461        /* force BE */
2462        iorder[0] = 3;
2463        iorder[1] = 2;
2464        iorder[2] = 1;
2465        iorder[3] = 0;
2466    }
2467
2468    while (q < e) {
2469        Py_UCS4 ch;
2470        /* remaining bytes at the end? (size should be divisible by 4) */
2471        if (e-q<4) {
2472            if (consumed)
2473                break;
2474            errmsg = "truncated data";
2475            startinpos = ((const char *)q)-starts;
2476            endinpos = ((const char *)e)-starts;
2477            goto utf32Error;
2478            /* The remaining input chars are ignored if the callback
2479               chooses to skip the input */
2480        }
2481        ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2482            (q[iorder[1]] << 8) | q[iorder[0]];
2483
2484        if (ch >= 0x110000)
2485        {
2486            errmsg = "codepoint not in range(0x110000)";
2487            startinpos = ((const char *)q)-starts;
2488            endinpos = startinpos+4;
2489            goto utf32Error;
2490        }
2491#ifndef Py_UNICODE_WIDE
2492        if (ch >= 0x10000)
2493        {
2494            *p++ = 0xD800 | ((ch-0x10000) >> 10);
2495            *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2496        }
2497        else
2498#endif
2499            *p++ = ch;
2500        q += 4;
2501        continue;
2502      utf32Error:
2503        outpos = p-PyUnicode_AS_UNICODE(unicode);
2504        if (unicode_decode_call_errorhandler(
2505                errors, &errorHandler,
2506                "utf32", errmsg,
2507                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2508                &unicode, &outpos, &p))
2509            goto onError;
2510    }
2511
2512    if (byteorder)
2513        *byteorder = bo;
2514
2515    if (consumed)
2516        *consumed = (const char *)q-starts;
2517
2518    /* Adjust length */
2519    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2520        goto onError;
2521
2522    Py_XDECREF(errorHandler);
2523    Py_XDECREF(exc);
2524    return (PyObject *)unicode;
2525
2526  onError:
2527    Py_DECREF(unicode);
2528    Py_XDECREF(errorHandler);
2529    Py_XDECREF(exc);
2530    return NULL;
2531}
2532
2533PyObject *
2534PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2535                      Py_ssize_t size,
2536                      const char *errors,
2537                      int byteorder)
2538{
2539    PyObject *v;
2540    unsigned char *p;
2541    Py_ssize_t nsize, bytesize;
2542#ifndef Py_UNICODE_WIDE
2543    Py_ssize_t i, pairs;
2544#else
2545    const int pairs = 0;
2546#endif
2547    /* Offsets from p for storing byte pairs in the right order. */
2548#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2549    int iorder[] = {0, 1, 2, 3};
2550#else
2551    int iorder[] = {3, 2, 1, 0};
2552#endif
2553
2554#define STORECHAR(CH)                           \
2555    do {                                        \
2556        p[iorder[3]] = ((CH) >> 24) & 0xff;     \
2557        p[iorder[2]] = ((CH) >> 16) & 0xff;     \
2558        p[iorder[1]] = ((CH) >> 8) & 0xff;      \
2559        p[iorder[0]] = (CH) & 0xff;             \
2560        p += 4;                                 \
2561    } while(0)
2562
2563    /* In narrow builds we can output surrogate pairs as one codepoint,
2564       so we need less space. */
2565#ifndef Py_UNICODE_WIDE
2566    for (i = pairs = 0; i < size-1; i++)
2567        if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2568            0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2569            pairs++;
2570#endif
2571    nsize = (size - pairs + (byteorder == 0));
2572    bytesize = nsize * 4;
2573    if (bytesize / 4 != nsize)
2574        return PyErr_NoMemory();
2575    v = PyBytes_FromStringAndSize(NULL, bytesize);
2576    if (v == NULL)
2577        return NULL;
2578
2579    p = (unsigned char *)PyBytes_AS_STRING(v);
2580    if (byteorder == 0)
2581        STORECHAR(0xFEFF);
2582    if (size == 0)
2583        goto done;
2584
2585    if (byteorder == -1) {
2586        /* force LE */
2587        iorder[0] = 0;
2588        iorder[1] = 1;
2589        iorder[2] = 2;
2590        iorder[3] = 3;
2591    }
2592    else if (byteorder == 1) {
2593        /* force BE */
2594        iorder[0] = 3;
2595        iorder[1] = 2;
2596        iorder[2] = 1;
2597        iorder[3] = 0;
2598    }
2599
2600    while (size-- > 0) {
2601        Py_UCS4 ch = *s++;
2602#ifndef Py_UNICODE_WIDE
2603        if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2604            Py_UCS4 ch2 = *s;
2605            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2606                ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2607                s++;
2608                size--;
2609            }
2610        }
2611#endif
2612        STORECHAR(ch);
2613    }
2614
2615  done:
2616    return v;
2617#undef STORECHAR
2618}
2619
2620PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2621{
2622    if (!PyUnicode_Check(unicode)) {
2623        PyErr_BadArgument();
2624        return NULL;
2625    }
2626    return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2627                                 PyUnicode_GET_SIZE(unicode),
2628                                 NULL,
2629                                 0);
2630}
2631
2632/* --- UTF-16 Codec ------------------------------------------------------- */
2633
2634PyObject *
2635PyUnicode_DecodeUTF16(const char *s,
2636                      Py_ssize_t size,
2637                      const char *errors,
2638                      int *byteorder)
2639{
2640    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2641}
2642
2643/* Two masks for fast checking of whether a C 'long' may contain
2644   UTF16-encoded surrogate characters. This is an efficient heuristic,
2645   assuming that non-surrogate characters with a code point >= 0x8000 are
2646   rare in most input.
2647   FAST_CHAR_MASK is used when the input is in native byte ordering,
2648   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
2649*/
2650#if (SIZEOF_LONG == 8)
2651# define FAST_CHAR_MASK         0x8000800080008000L
2652# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
2653#elif (SIZEOF_LONG == 4)
2654# define FAST_CHAR_MASK         0x80008000L
2655# define SWAPPED_FAST_CHAR_MASK 0x00800080L
2656#else
2657# error C 'long' size should be either 4 or 8!
2658#endif
2659
2660PyObject *
2661PyUnicode_DecodeUTF16Stateful(const char *s,
2662                              Py_ssize_t size,
2663                              const char *errors,
2664                              int *byteorder,
2665                              Py_ssize_t *consumed)
2666{
2667    const char *starts = s;
2668    Py_ssize_t startinpos;
2669    Py_ssize_t endinpos;
2670    Py_ssize_t outpos;
2671    PyUnicodeObject *unicode;
2672    Py_UNICODE *p;
2673    const unsigned char *q, *e, *aligned_end;
2674    int bo = 0;       /* assume native ordering by default */
2675    int native_ordering = 0;
2676    const char *errmsg = "";
2677    /* Offsets from q for retrieving byte pairs in the right order. */
2678#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2679    int ihi = 1, ilo = 0;
2680#else
2681    int ihi = 0, ilo = 1;
2682#endif
2683    PyObject *errorHandler = NULL;
2684    PyObject *exc = NULL;
2685
2686    /* Note: size will always be longer than the resulting Unicode
2687       character count */
2688    unicode = _PyUnicode_New(size);
2689    if (!unicode)
2690        return NULL;
2691    if (size == 0)
2692        return (PyObject *)unicode;
2693
2694    /* Unpack UTF-16 encoded data */
2695    p = unicode->str;
2696    q = (unsigned char *)s;
2697    e = q + size - 1;
2698
2699    if (byteorder)
2700        bo = *byteorder;
2701
2702    /* Check for BOM marks (U+FEFF) in the input and adjust current
2703       byte order setting accordingly. In native mode, the leading BOM
2704       mark is skipped, in all other modes, it is copied to the output
2705       stream as-is (giving a ZWNBSP character). */
2706    if (bo == 0) {
2707        if (size >= 2) {
2708            const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
2709#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2710            if (bom == 0xFEFF) {
2711                q += 2;
2712                bo = -1;
2713            }
2714            else if (bom == 0xFFFE) {
2715                q += 2;
2716                bo = 1;
2717            }
2718#else
2719            if (bom == 0xFEFF) {
2720                q += 2;
2721                bo = 1;
2722            }
2723            else if (bom == 0xFFFE) {
2724                q += 2;
2725                bo = -1;
2726            }
2727#endif
2728        }
2729    }
2730
2731    if (bo == -1) {
2732        /* force LE */
2733        ihi = 1;
2734        ilo = 0;
2735    }
2736    else if (bo == 1) {
2737        /* force BE */
2738        ihi = 0;
2739        ilo = 1;
2740    }
2741#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2742    native_ordering = ilo < ihi;
2743#else
2744    native_ordering = ilo > ihi;
2745#endif
2746
2747    aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
2748    while (q < e) {
2749        Py_UNICODE ch;
2750        /* First check for possible aligned read of a C 'long'. Unaligned
2751           reads are more expensive, better to defer to another iteration. */
2752        if (!((size_t) q & LONG_PTR_MASK)) {
2753            /* Fast path for runs of non-surrogate chars. */
2754            register const unsigned char *_q = q;
2755            Py_UNICODE *_p = p;
2756            if (native_ordering) {
2757                /* Native ordering is simple: as long as the input cannot
2758                   possibly contain a surrogate char, do an unrolled copy
2759                   of several 16-bit code points to the target object.
2760                   The non-surrogate check is done on several input bytes
2761                   at a time (as many as a C 'long' can contain). */
2762                while (_q < aligned_end) {
2763                    unsigned long data = * (unsigned long *) _q;
2764                    if (data & FAST_CHAR_MASK)
2765                        break;
2766                    _p[0] = ((unsigned short *) _q)[0];
2767                    _p[1] = ((unsigned short *) _q)[1];
2768#if (SIZEOF_LONG == 8)
2769                    _p[2] = ((unsigned short *) _q)[2];
2770                    _p[3] = ((unsigned short *) _q)[3];
2771#endif
2772                    _q += SIZEOF_LONG;
2773                    _p += SIZEOF_LONG / 2;
2774                }
2775            }
2776            else {
2777                /* Byteswapped ordering is similar, but we must decompose
2778                   the copy bytewise, and take care of zero'ing out the
2779                   upper bytes if the target object is in 32-bit units
2780                   (that is, in UCS-4 builds). */
2781                while (_q < aligned_end) {
2782                    unsigned long data = * (unsigned long *) _q;
2783                    if (data & SWAPPED_FAST_CHAR_MASK)
2784                        break;
2785                    /* Zero upper bytes in UCS-4 builds */
2786#if (Py_UNICODE_SIZE > 2)
2787                    _p[0] = 0;
2788                    _p[1] = 0;
2789#if (SIZEOF_LONG == 8)
2790                    _p[2] = 0;
2791                    _p[3] = 0;
2792#endif
2793#endif
2794                    /* Issue #4916; UCS-4 builds on big endian machines must
2795                       fill the two last bytes of each 4-byte unit. */
2796#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
2797# define OFF 2
2798#else
2799# define OFF 0
2800#endif
2801                    ((unsigned char *) _p)[OFF + 1] = _q[0];
2802                    ((unsigned char *) _p)[OFF + 0] = _q[1];
2803                    ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
2804                    ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
2805#if (SIZEOF_LONG == 8)
2806                    ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
2807                    ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
2808                    ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
2809                    ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
2810#endif
2811#undef OFF
2812                    _q += SIZEOF_LONG;
2813                    _p += SIZEOF_LONG / 2;
2814                }
2815            }
2816            p = _p;
2817            q = _q;
2818            if (q >= e)
2819                break;
2820        }
2821        ch = (q[ihi] << 8) | q[ilo];
2822
2823        q += 2;
2824
2825        if (ch < 0xD800 || ch > 0xDFFF) {
2826            *p++ = ch;
2827            continue;
2828        }
2829
2830        /* UTF-16 code pair: */
2831        if (q > e) {
2832            errmsg = "unexpected end of data";
2833            startinpos = (((const char *)q) - 2) - starts;
2834            endinpos = ((const char *)e) + 1 - starts;
2835            goto utf16Error;
2836        }
2837        if (0xD800 <= ch && ch <= 0xDBFF) {
2838            Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2839            q += 2;
2840            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2841#ifndef Py_UNICODE_WIDE
2842                *p++ = ch;
2843                *p++ = ch2;
2844#else
2845                *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2846#endif
2847                continue;
2848            }
2849            else {
2850                errmsg = "illegal UTF-16 surrogate";
2851                startinpos = (((const char *)q)-4)-starts;
2852                endinpos = startinpos+2;
2853                goto utf16Error;
2854            }
2855
2856        }
2857        errmsg = "illegal encoding";
2858        startinpos = (((const char *)q)-2)-starts;
2859        endinpos = startinpos+2;
2860        /* Fall through to report the error */
2861
2862      utf16Error:
2863        outpos = p - PyUnicode_AS_UNICODE(unicode);
2864        if (unicode_decode_call_errorhandler(
2865                errors,
2866                &errorHandler,
2867                "utf16", errmsg,
2868                &starts,
2869                (const char **)&e,
2870                &startinpos,
2871                &endinpos,
2872                &exc,
2873                (const char **)&q,
2874                &unicode,
2875                &outpos,
2876                &p))
2877            goto onError;
2878    }
2879    /* remaining byte at the end? (size should be even) */
2880    if (e == q) {
2881        if (!consumed) {
2882            errmsg = "truncated data";
2883            startinpos = ((const char *)q) - starts;
2884            endinpos = ((const char *)e) + 1 - starts;
2885            outpos = p - PyUnicode_AS_UNICODE(unicode);
2886            if (unicode_decode_call_errorhandler(
2887                    errors,
2888                    &errorHandler,
2889                    "utf16", errmsg,
2890                    &starts,
2891                    (const char **)&e,
2892                    &startinpos,
2893                    &endinpos,
2894                    &exc,
2895                    (const char **)&q,
2896                    &unicode,
2897                    &outpos,
2898                    &p))
2899                goto onError;
2900            /* The remaining input chars are ignored if the callback
2901               chooses to skip the input */
2902        }
2903    }
2904
2905    if (byteorder)
2906        *byteorder = bo;
2907
2908    if (consumed)
2909        *consumed = (const char *)q-starts;
2910
2911    /* Adjust length */
2912    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2913        goto onError;
2914
2915    Py_XDECREF(errorHandler);
2916    Py_XDECREF(exc);
2917    return (PyObject *)unicode;
2918
2919  onError:
2920    Py_DECREF(unicode);
2921    Py_XDECREF(errorHandler);
2922    Py_XDECREF(exc);
2923    return NULL;
2924}
2925
2926#undef FAST_CHAR_MASK
2927#undef SWAPPED_FAST_CHAR_MASK
2928
2929PyObject *
2930PyUnicode_EncodeUTF16(const Py_UNICODE *s,
2931                      Py_ssize_t size,
2932                      const char *errors,
2933                      int byteorder)
2934{
2935    PyObject *v;
2936    unsigned char *p;
2937    Py_ssize_t nsize, bytesize;
2938#ifdef Py_UNICODE_WIDE
2939    Py_ssize_t i, pairs;
2940#else
2941    const int pairs = 0;
2942#endif
2943    /* Offsets from p for storing byte pairs in the right order. */
2944#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2945    int ihi = 1, ilo = 0;
2946#else
2947    int ihi = 0, ilo = 1;
2948#endif
2949
2950#define STORECHAR(CH)                           \
2951    do {                                        \
2952        p[ihi] = ((CH) >> 8) & 0xff;            \
2953        p[ilo] = (CH) & 0xff;                   \
2954        p += 2;                                 \
2955    } while(0)
2956
2957#ifdef Py_UNICODE_WIDE
2958    for (i = pairs = 0; i < size; i++)
2959        if (s[i] >= 0x10000)
2960            pairs++;
2961#endif
2962    /* 2 * (size + pairs + (byteorder == 0)) */
2963    if (size > PY_SSIZE_T_MAX ||
2964        size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
2965        return PyErr_NoMemory();
2966    nsize = size + pairs + (byteorder == 0);
2967    bytesize = nsize * 2;
2968    if (bytesize / 2 != nsize)
2969        return PyErr_NoMemory();
2970    v = PyBytes_FromStringAndSize(NULL, bytesize);
2971    if (v == NULL)
2972        return NULL;
2973
2974    p = (unsigned char *)PyBytes_AS_STRING(v);
2975    if (byteorder == 0)
2976        STORECHAR(0xFEFF);
2977    if (size == 0)
2978        goto done;
2979
2980    if (byteorder == -1) {
2981        /* force LE */
2982        ihi = 1;
2983        ilo = 0;
2984    }
2985    else if (byteorder == 1) {
2986        /* force BE */
2987        ihi = 0;
2988        ilo = 1;
2989    }
2990
2991    while (size-- > 0) {
2992        Py_UNICODE ch = *s++;
2993        Py_UNICODE ch2 = 0;
2994#ifdef Py_UNICODE_WIDE
2995        if (ch >= 0x10000) {
2996            ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2997            ch  = 0xD800 | ((ch-0x10000) >> 10);
2998        }
2999#endif
3000        STORECHAR(ch);
3001        if (ch2)
3002            STORECHAR(ch2);
3003    }
3004
3005  done:
3006    return v;
3007#undef STORECHAR
3008}
3009
3010PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3011{
3012    if (!PyUnicode_Check(unicode)) {
3013        PyErr_BadArgument();
3014        return NULL;
3015    }
3016    return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
3017                                 PyUnicode_GET_SIZE(unicode),
3018                                 NULL,
3019                                 0);
3020}
3021
3022/* --- Unicode Escape Codec ----------------------------------------------- */
3023
3024static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
3025
3026PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
3027                                        Py_ssize_t size,
3028                                        const char *errors)
3029{
3030    const char *starts = s;
3031    Py_ssize_t startinpos;
3032    Py_ssize_t endinpos;
3033    Py_ssize_t outpos;
3034    int i;
3035    PyUnicodeObject *v;
3036    Py_UNICODE *p;
3037    const char *end;
3038    char* message;
3039    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
3040    PyObject *errorHandler = NULL;
3041    PyObject *exc = NULL;
3042
3043    /* Escaped strings will always be longer than the resulting
3044       Unicode string, so we start with size here and then reduce the
3045       length after conversion to the true value.
3046       (but if the error callback returns a long replacement string
3047       we'll have to allocate more space) */
3048    v = _PyUnicode_New(size);
3049    if (v == NULL)
3050        goto onError;
3051    if (size == 0)
3052        return (PyObject *)v;
3053
3054    p = PyUnicode_AS_UNICODE(v);
3055    end = s + size;
3056
3057    while (s < end) {
3058        unsigned char c;
3059        Py_UNICODE x;
3060        int digits;
3061
3062        /* Non-escape characters are interpreted as Unicode ordinals */
3063        if (*s != '\\') {
3064            *p++ = (unsigned char) *s++;
3065            continue;
3066        }
3067
3068        startinpos = s-starts;
3069        /* \ - Escapes */
3070        s++;
3071        c = *s++;
3072        if (s > end)
3073            c = '\0'; /* Invalid after \ */
3074        switch (c) {
3075
3076            /* \x escapes */
3077        case '\n': break;
3078        case '\\': *p++ = '\\'; break;
3079        case '\'': *p++ = '\''; break;
3080        case '\"': *p++ = '\"'; break;
3081        case 'b': *p++ = '\b'; break;
3082        case 'f': *p++ = '\014'; break; /* FF */
3083        case 't': *p++ = '\t'; break;
3084        case 'n': *p++ = '\n'; break;
3085        case 'r': *p++ = '\r'; break;
3086        case 'v': *p++ = '\013'; break; /* VT */
3087        case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3088
3089            /* \OOO (octal) escapes */
3090        case '0': case '1': case '2': case '3':
3091        case '4': case '5': case '6': case '7':
3092            x = s[-1] - '0';
3093            if (s < end && '0' <= *s && *s <= '7') {
3094                x = (x<<3) + *s++ - '0';
3095                if (s < end && '0' <= *s && *s <= '7')
3096                    x = (x<<3) + *s++ - '0';
3097            }
3098            *p++ = x;
3099            break;
3100
3101            /* hex escapes */
3102            /* \xXX */
3103        case 'x':
3104            digits = 2;
3105            message = "truncated \\xXX escape";
3106            goto hexescape;
3107
3108            /* \uXXXX */
3109        case 'u':
3110            digits = 4;
3111            message = "truncated \\uXXXX escape";
3112            goto hexescape;
3113
3114            /* \UXXXXXXXX */
3115        case 'U':
3116            digits = 8;
3117            message = "truncated \\UXXXXXXXX escape";
3118        hexescape:
3119            chr = 0;
3120            outpos = p-PyUnicode_AS_UNICODE(v);
3121            if (s+digits>end) {
3122                endinpos = size;
3123                if (unicode_decode_call_errorhandler(
3124                        errors, &errorHandler,
3125                        "unicodeescape", "end of string in escape sequence",
3126                        &starts, &end, &startinpos, &endinpos, &exc, &s,
3127                        &v, &outpos, &p))
3128                    goto onError;
3129                goto nextByte;
3130            }
3131            for (i = 0; i < digits; ++i) {
3132                c = (unsigned char) s[i];
3133                if (!ISXDIGIT(c)) {
3134                    endinpos = (s+i+1)-starts;
3135                    if (unicode_decode_call_errorhandler(
3136                            errors, &errorHandler,
3137                            "unicodeescape", message,
3138                            &starts, &end, &startinpos, &endinpos, &exc, &s,
3139                            &v, &outpos, &p))
3140                        goto onError;
3141                    goto nextByte;
3142                }
3143                chr = (chr<<4) & ~0xF;
3144                if (c >= '0' && c <= '9')
3145                    chr += c - '0';
3146                else if (c >= 'a' && c <= 'f')
3147                    chr += 10 + c - 'a';
3148                else
3149                    chr += 10 + c - 'A';
3150            }
3151            s += i;
3152            if (chr == 0xffffffff && PyErr_Occurred())
3153                /* _decoding_error will have already written into the
3154                   target buffer. */
3155                break;
3156        store:
3157            /* when we get here, chr is a 32-bit unicode character */
3158            if (chr <= 0xffff)
3159                /* UCS-2 character */
3160                *p++ = (Py_UNICODE) chr;
3161            else if (chr <= 0x10ffff) {
3162                /* UCS-4 character. Either store directly, or as
3163                   surrogate pair. */
3164#ifdef Py_UNICODE_WIDE
3165                *p++ = chr;
3166#else
3167                chr -= 0x10000L;
3168                *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
3169                *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
3170#endif
3171            } else {
3172                endinpos = s-starts;
3173                outpos = p-PyUnicode_AS_UNICODE(v);
3174                if (unicode_decode_call_errorhandler(
3175                        errors, &errorHandler,
3176                        "unicodeescape", "illegal Unicode character",
3177                        &starts, &end, &startinpos, &endinpos, &exc, &s,
3178                        &v, &outpos, &p))
3179                    goto onError;
3180            }
3181            break;
3182
3183            /* \N{name} */
3184        case 'N':
3185            message = "malformed \\N character escape";
3186            if (ucnhash_CAPI == NULL) {
3187                /* load the unicode data module */
3188                PyObject *m, *api;
3189                m = PyImport_ImportModuleNoBlock("unicodedata");
3190                if (m == NULL)
3191                    goto ucnhashError;
3192                api = PyObject_GetAttrString(m, "ucnhash_CAPI");
3193                Py_DECREF(m);
3194                if (api == NULL)
3195                    goto ucnhashError;
3196                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
3197                Py_DECREF(api);
3198                if (ucnhash_CAPI == NULL)
3199                    goto ucnhashError;
3200            }
3201            if (*s == '{') {
3202                const char *start = s+1;
3203                /* look for the closing brace */
3204                while (*s != '}' && s < end)
3205                    s++;
3206                if (s > start && s < end && *s == '}') {
3207                    /* found a name.  look it up in the unicode database */
3208                    message = "unknown Unicode character name";
3209                    s++;
3210                    if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
3211                        goto store;
3212                }
3213            }
3214            endinpos = s-starts;
3215            outpos = p-PyUnicode_AS_UNICODE(v);
3216            if (unicode_decode_call_errorhandler(
3217                    errors, &errorHandler,
3218                    "unicodeescape", message,
3219                    &starts, &end, &startinpos, &endinpos, &exc, &s,
3220                    &v, &outpos, &p))
3221                goto onError;
3222            break;
3223
3224        default:
3225            if (s > end) {
3226                message = "\\ at end of string";
3227                s--;
3228                endinpos = s-starts;
3229                outpos = p-PyUnicode_AS_UNICODE(v);
3230                if (unicode_decode_call_errorhandler(
3231                        errors, &errorHandler,
3232                        "unicodeescape", message,
3233                        &starts, &end, &startinpos, &endinpos, &exc, &s,
3234                        &v, &outpos, &p))
3235                    goto onError;
3236            }
3237            else {
3238                *p++ = '\\';
3239                *p++ = (unsigned char)s[-1];
3240            }
3241            break;
3242        }
3243      nextByte:
3244        ;
3245    }
3246    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3247        goto onError;
3248    Py_XDECREF(errorHandler);
3249    Py_XDECREF(exc);
3250    return (PyObject *)v;
3251
3252  ucnhashError:
3253    PyErr_SetString(
3254        PyExc_UnicodeError,
3255        "\\N escapes not supported (can't load unicodedata module)"
3256        );
3257    Py_XDECREF(v);
3258    Py_XDECREF(errorHandler);
3259    Py_XDECREF(exc);
3260    return NULL;
3261
3262  onError:
3263    Py_XDECREF(v);
3264    Py_XDECREF(errorHandler);
3265    Py_XDECREF(exc);
3266    return NULL;
3267}
3268
3269/* Return a Unicode-Escape string version of the Unicode object.
3270
3271   If quotes is true, the string is enclosed in u"" or u'' quotes as
3272   appropriate.
3273
3274*/
3275
3276Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
3277                                             Py_ssize_t size,
3278                                             Py_UNICODE ch)
3279{
3280    /* like wcschr, but doesn't stop at NULL characters */
3281
3282    while (size-- > 0) {
3283        if (*s == ch)
3284            return s;
3285        s++;
3286    }
3287
3288    return NULL;
3289}
3290
3291static const char *hexdigits = "0123456789abcdef";
3292
3293PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3294                                        Py_ssize_t size)
3295{
3296    PyObject *repr;
3297    char *p;
3298
3299#ifdef Py_UNICODE_WIDE
3300    const Py_ssize_t expandsize = 10;
3301#else
3302    const Py_ssize_t expandsize = 6;
3303#endif
3304
3305    /* XXX(nnorwitz): rather than over-allocating, it would be
3306       better to choose a different scheme.  Perhaps scan the
3307       first N-chars of the string and allocate based on that size.
3308    */
3309    /* Initial allocation is based on the longest-possible unichr
3310       escape.
3311
3312       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3313       unichr, so in this case it's the longest unichr escape. In
3314       narrow (UTF-16) builds this is five chars per source unichr
3315       since there are two unichrs in the surrogate pair, so in narrow
3316       (UTF-16) builds it's not the longest unichr escape.
3317
3318       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3319       so in the narrow (UTF-16) build case it's the longest unichr
3320       escape.
3321    */
3322
3323    if (size == 0)
3324        return PyBytes_FromStringAndSize(NULL, 0);
3325
3326    if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
3327        return PyErr_NoMemory();
3328
3329    repr = PyBytes_FromStringAndSize(NULL,
3330                                     2
3331                                     + expandsize*size
3332                                     + 1);
3333    if (repr == NULL)
3334        return NULL;
3335
3336    p = PyBytes_AS_STRING(repr);
3337
3338    while (size-- > 0) {
3339        Py_UNICODE ch = *s++;
3340
3341        /* Escape backslashes */
3342        if (ch == '\\') {
3343            *p++ = '\\';
3344            *p++ = (char) ch;
3345            continue;
3346        }
3347
3348#ifdef Py_UNICODE_WIDE
3349        /* Map 21-bit characters to '\U00xxxxxx' */
3350        else if (ch >= 0x10000) {
3351            *p++ = '\\';
3352            *p++ = 'U';
3353            *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3354            *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3355            *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3356            *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3357            *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3358            *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3359            *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3360            *p++ = hexdigits[ch & 0x0000000F];
3361            continue;
3362        }
3363#else
3364        /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3365        else if (ch >= 0xD800 && ch < 0xDC00) {
3366            Py_UNICODE ch2;
3367            Py_UCS4 ucs;
3368
3369            ch2 = *s++;
3370            size--;
3371            if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3372                ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3373                *p++ = '\\';
3374                *p++ = 'U';
3375                *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3376                *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3377                *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3378                *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3379                *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3380                *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3381                *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3382                *p++ = hexdigits[ucs & 0x0000000F];
3383                continue;
3384            }
3385            /* Fall through: isolated surrogates are copied as-is */
3386            s--;
3387            size++;
3388        }
3389#endif
3390
3391        /* Map 16-bit characters to '\uxxxx' */
3392        if (ch >= 256) {
3393            *p++ = '\\';
3394            *p++ = 'u';
3395            *p++ = hexdigits[(ch >> 12) & 0x000F];
3396            *p++ = hexdigits[(ch >> 8) & 0x000F];
3397            *p++ = hexdigits[(ch >> 4) & 0x000F];
3398            *p++ = hexdigits[ch & 0x000F];
3399        }
3400
3401        /* Map special whitespace to '\t', \n', '\r' */
3402        else if (ch == '\t') {
3403            *p++ = '\\';
3404            *p++ = 't';
3405        }
3406        else if (ch == '\n') {
3407            *p++ = '\\';
3408            *p++ = 'n';
3409        }
3410        else if (ch == '\r') {
3411            *p++ = '\\';
3412            *p++ = 'r';
3413        }
3414
3415        /* Map non-printable US ASCII to '\xhh' */
3416        else if (ch < ' ' || ch >= 0x7F) {
3417            *p++ = '\\';
3418            *p++ = 'x';
3419            *p++ = hexdigits[(ch >> 4) & 0x000F];
3420            *p++ = hexdigits[ch & 0x000F];
3421        }
3422
3423        /* Copy everything else as-is */
3424        else
3425            *p++ = (char) ch;
3426    }
3427
3428    assert(p - PyBytes_AS_STRING(repr) > 0);
3429    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3430        return NULL;
3431    return repr;
3432}
3433
3434PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3435{
3436    PyObject *s;
3437    if (!PyUnicode_Check(unicode)) {
3438        PyErr_BadArgument();
3439        return NULL;
3440    }
3441    s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3442                                      PyUnicode_GET_SIZE(unicode));
3443    return s;
3444}
3445
3446/* --- Raw Unicode Escape Codec ------------------------------------------- */
3447
3448PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
3449                                           Py_ssize_t size,
3450                                           const char *errors)
3451{
3452    const char *starts = s;
3453    Py_ssize_t startinpos;
3454    Py_ssize_t endinpos;
3455    Py_ssize_t outpos;
3456    PyUnicodeObject *v;
3457    Py_UNICODE *p;
3458    const char *end;
3459    const char *bs;
3460    PyObject *errorHandler = NULL;
3461    PyObject *exc = NULL;
3462
3463    /* Escaped strings will always be longer than the resulting
3464       Unicode string, so we start with size here and then reduce the
3465       length after conversion to the true value. (But decoding error
3466       handler might have to resize the string) */
3467    v = _PyUnicode_New(size);
3468    if (v == NULL)
3469        goto onError;
3470    if (size == 0)
3471        return (PyObject *)v;
3472    p = PyUnicode_AS_UNICODE(v);
3473    end = s + size;
3474    while (s < end) {
3475        unsigned char c;
3476        Py_UCS4 x;
3477        int i;
3478        int count;
3479
3480        /* Non-escape characters are interpreted as Unicode ordinals */
3481        if (*s != '\\') {
3482            *p++ = (unsigned char)*s++;
3483            continue;
3484        }
3485        startinpos = s-starts;
3486
3487        /* \u-escapes are only interpreted iff the number of leading
3488           backslashes if odd */
3489        bs = s;
3490        for (;s < end;) {
3491            if (*s != '\\')
3492                break;
3493            *p++ = (unsigned char)*s++;
3494        }
3495        if (((s - bs) & 1) == 0 ||
3496            s >= end ||
3497            (*s != 'u' && *s != 'U')) {
3498            continue;
3499        }
3500        p--;
3501        count = *s=='u' ? 4 : 8;
3502        s++;
3503
3504        /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3505        outpos = p-PyUnicode_AS_UNICODE(v);
3506        for (x = 0, i = 0; i < count; ++i, ++s) {
3507            c = (unsigned char)*s;
3508            if (!ISXDIGIT(c)) {
3509                endinpos = s-starts;
3510                if (unicode_decode_call_errorhandler(
3511                        errors, &errorHandler,
3512                        "rawunicodeescape", "truncated \\uXXXX",
3513                        &starts, &end, &startinpos, &endinpos, &exc, &s,
3514                        &v, &outpos, &p))
3515                    goto onError;
3516                goto nextByte;
3517            }
3518            x = (x<<4) & ~0xF;
3519            if (c >= '0' && c <= '9')
3520                x += c - '0';
3521            else if (c >= 'a' && c <= 'f')
3522                x += 10 + c - 'a';
3523            else
3524                x += 10 + c - 'A';
3525        }
3526        if (x <= 0xffff)
3527            /* UCS-2 character */
3528            *p++ = (Py_UNICODE) x;
3529        else if (x <= 0x10ffff) {
3530            /* UCS-4 character. Either store directly, or as
3531               surrogate pair. */
3532#ifdef Py_UNICODE_WIDE
3533            *p++ = (Py_UNICODE) x;
3534#else
3535            x -= 0x10000L;
3536            *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3537            *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3538#endif
3539        } else {
3540            endinpos = s-starts;
3541            outpos = p-PyUnicode_AS_UNICODE(v);
3542            if (unicode_decode_call_errorhandler(
3543                    errors, &errorHandler,
3544                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
3545                    &starts, &end, &startinpos, &endinpos, &exc, &s,
3546                    &v, &outpos, &p))
3547                goto onError;
3548        }
3549      nextByte:
3550        ;
3551    }
3552    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3553        goto onError;
3554    Py_XDECREF(errorHandler);
3555    Py_XDECREF(exc);
3556    return (PyObject *)v;
3557
3558  onError:
3559    Py_XDECREF(v);
3560    Py_XDECREF(errorHandler);
3561    Py_XDECREF(exc);
3562    return NULL;
3563}
3564
3565PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
3566                                           Py_ssize_t size)
3567{
3568    PyObject *repr;
3569    char *p;
3570    char *q;
3571
3572#ifdef Py_UNICODE_WIDE
3573    const Py_ssize_t expandsize = 10;
3574#else
3575    const Py_ssize_t expandsize = 6;
3576#endif
3577
3578    if (size > PY_SSIZE_T_MAX / expandsize)
3579        return PyErr_NoMemory();
3580
3581    repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
3582    if (repr == NULL)
3583        return NULL;
3584    if (size == 0)
3585        return repr;
3586
3587    p = q = PyBytes_AS_STRING(repr);
3588    while (size-- > 0) {
3589        Py_UNICODE ch = *s++;
3590#ifdef Py_UNICODE_WIDE
3591        /* Map 32-bit characters to '\Uxxxxxxxx' */
3592        if (ch >= 0x10000) {
3593            *p++ = '\\';
3594            *p++ = 'U';
3595            *p++ = hexdigits[(ch >> 28) & 0xf];
3596            *p++ = hexdigits[(ch >> 24) & 0xf];
3597            *p++ = hexdigits[(ch >> 20) & 0xf];
3598            *p++ = hexdigits[(ch >> 16) & 0xf];
3599            *p++ = hexdigits[(ch >> 12) & 0xf];
3600            *p++ = hexdigits[(ch >> 8) & 0xf];
3601            *p++ = hexdigits[(ch >> 4) & 0xf];
3602            *p++ = hexdigits[ch & 15];
3603        }
3604        else
3605#else
3606            /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3607            if (ch >= 0xD800 && ch < 0xDC00) {
3608                Py_UNICODE ch2;
3609                Py_UCS4 ucs;
3610
3611                ch2 = *s++;
3612                size--;
3613                if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3614                    ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3615                    *p++ = '\\';
3616                    *p++ = 'U';
3617                    *p++ = hexdigits[(ucs >> 28) & 0xf];
3618                    *p++ = hexdigits[(ucs >> 24) & 0xf];
3619                    *p++ = hexdigits[(ucs >> 20) & 0xf];
3620                    *p++ = hexdigits[(ucs >> 16) & 0xf];
3621                    *p++ = hexdigits[(ucs >> 12) & 0xf];
3622                    *p++ = hexdigits[(ucs >> 8) & 0xf];
3623                    *p++ = hexdigits[(ucs >> 4) & 0xf];
3624                    *p++ = hexdigits[ucs & 0xf];
3625                    continue;
3626                }
3627                /* Fall through: isolated surrogates are copied as-is */
3628                s--;
3629                size++;
3630            }
3631#endif
3632        /* Map 16-bit characters to '\uxxxx' */
3633        if (ch >= 256) {
3634            *p++ = '\\';
3635            *p++ = 'u';
3636            *p++ = hexdigits[(ch >> 12) & 0xf];
3637            *p++ = hexdigits[(ch >> 8) & 0xf];
3638            *p++ = hexdigits[(ch >> 4) & 0xf];
3639            *p++ = hexdigits[ch & 15];
3640        }
3641        /* Copy everything else as-is */
3642        else
3643            *p++ = (char) ch;
3644    }
3645    size = p - q;
3646
3647    assert(size > 0);
3648    if (_PyBytes_Resize(&repr, size) < 0)
3649        return NULL;
3650    return repr;
3651}
3652
3653PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3654{
3655    PyObject *s;
3656    if (!PyUnicode_Check(unicode)) {
3657        PyErr_BadArgument();
3658        return NULL;
3659    }
3660    s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3661                                         PyUnicode_GET_SIZE(unicode));
3662
3663    return s;
3664}
3665
3666/* --- Unicode Internal Codec ------------------------------------------- */
3667
3668PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
3669                                           Py_ssize_t size,
3670                                           const char *errors)
3671{
3672    const char *starts = s;
3673    Py_ssize_t startinpos;
3674    Py_ssize_t endinpos;
3675    Py_ssize_t outpos;
3676    PyUnicodeObject *v;
3677    Py_UNICODE *p;
3678    const char *end;
3679    const char *reason;
3680    PyObject *errorHandler = NULL;
3681    PyObject *exc = NULL;
3682
3683#ifdef Py_UNICODE_WIDE
3684    Py_UNICODE unimax = PyUnicode_GetMax();
3685#endif
3686
3687    /* XXX overflow detection missing */
3688    v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3689    if (v == NULL)
3690        goto onError;
3691    if (PyUnicode_GetSize((PyObject *)v) == 0)
3692        return (PyObject *)v;
3693    p = PyUnicode_AS_UNICODE(v);
3694    end = s + size;
3695
3696    while (s < end) {
3697        memcpy(p, s, sizeof(Py_UNICODE));
3698        /* We have to sanity check the raw data, otherwise doom looms for
3699           some malformed UCS-4 data. */
3700        if (
3701#ifdef Py_UNICODE_WIDE
3702            *p > unimax || *p < 0 ||
3703#endif
3704            end-s < Py_UNICODE_SIZE
3705            )
3706        {
3707            startinpos = s - starts;
3708            if (end-s < Py_UNICODE_SIZE) {
3709                endinpos = end-starts;
3710                reason = "truncated input";
3711            }
3712            else {
3713                endinpos = s - starts + Py_UNICODE_SIZE;
3714                reason = "illegal code point (> 0x10FFFF)";
3715            }
3716            outpos = p - PyUnicode_AS_UNICODE(v);
3717            if (unicode_decode_call_errorhandler(
3718                    errors, &errorHandler,
3719                    "unicode_internal", reason,
3720                    &starts, &end, &startinpos, &endinpos, &exc, &s,
3721                    &v, &outpos, &p)) {
3722                goto onError;
3723            }
3724        }
3725        else {
3726            p++;
3727            s += Py_UNICODE_SIZE;
3728        }
3729    }
3730
3731    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3732        goto onError;
3733    Py_XDECREF(errorHandler);
3734    Py_XDECREF(exc);
3735    return (PyObject *)v;
3736
3737  onError:
3738    Py_XDECREF(v);
3739    Py_XDECREF(errorHandler);
3740    Py_XDECREF(exc);
3741    return NULL;
3742}
3743
3744/* --- Latin-1 Codec ------------------------------------------------------ */
3745
3746PyObject *PyUnicode_DecodeLatin1(const char *s,
3747                                 Py_ssize_t size,
3748                                 const char *errors)
3749{
3750    PyUnicodeObject *v;
3751    Py_UNICODE *p;
3752    const char *e, *unrolled_end;
3753
3754    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3755    if (size == 1) {
3756        Py_UNICODE r = *(unsigned char*)s;
3757        return PyUnicode_FromUnicode(&r, 1);
3758    }
3759
3760    v = _PyUnicode_New(size);
3761    if (v == NULL)
3762        goto onError;
3763    if (size == 0)
3764        return (PyObject *)v;
3765    p = PyUnicode_AS_UNICODE(v);
3766    e = s + size;
3767    /* Unrolling the copy makes it much faster by reducing the looping
3768       overhead. This is similar to what many memcpy() implementations do. */
3769    unrolled_end = e - 4;
3770    while (s < unrolled_end) {
3771        p[0] = (unsigned char) s[0];
3772        p[1] = (unsigned char) s[1];
3773        p[2] = (unsigned char) s[2];
3774        p[3] = (unsigned char) s[3];
3775        s += 4;
3776        p += 4;
3777    }
3778    while (s < e)
3779        *p++ = (unsigned char) *s++;
3780    return (PyObject *)v;
3781
3782  onError:
3783    Py_XDECREF(v);
3784    return NULL;
3785}
3786
3787/* create or adjust a UnicodeEncodeError */
3788static void make_encode_exception(PyObject **exceptionObject,
3789                                  const char *encoding,
3790                                  const Py_UNICODE *unicode, Py_ssize_t size,
3791                                  Py_ssize_t startpos, Py_ssize_t endpos,
3792                                  const char *reason)
3793{
3794    if (*exceptionObject == NULL) {
3795        *exceptionObject = PyUnicodeEncodeError_Create(
3796            encoding, unicode, size, startpos, endpos, reason);
3797    }
3798    else {
3799        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3800            goto onError;
3801        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3802            goto onError;
3803        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3804            goto onError;
3805        return;
3806      onError:
3807        Py_DECREF(*exceptionObject);
3808        *exceptionObject = NULL;
3809    }
3810}
3811
3812/* raises a UnicodeEncodeError */
3813static void raise_encode_exception(PyObject **exceptionObject,
3814                                   const char *encoding,
3815                                   const Py_UNICODE *unicode, Py_ssize_t size,
3816                                   Py_ssize_t startpos, Py_ssize_t endpos,
3817                                   const char *reason)
3818{
3819    make_encode_exception(exceptionObject,
3820                          encoding, unicode, size, startpos, endpos, reason);
3821    if (*exceptionObject != NULL)
3822        PyCodec_StrictErrors(*exceptionObject);
3823}
3824
3825/* error handling callback helper:
3826   build arguments, call the callback and check the arguments,
3827   put the result into newpos and return the replacement string, which
3828   has to be freed by the caller */
3829static PyObject *unicode_encode_call_errorhandler(const char *errors,
3830                                                  PyObject **errorHandler,
3831                                                  const char *encoding, const char *reason,
3832                                                  const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3833                                                  Py_ssize_t startpos, Py_ssize_t endpos,
3834                                                  Py_ssize_t *newpos)
3835{
3836    static char *argparse = "O!n;encoding error handler must return (str, int) tuple";
3837
3838    PyObject *restuple;
3839    PyObject *resunicode;
3840
3841    if (*errorHandler == NULL) {
3842        *errorHandler = PyCodec_LookupError(errors);
3843        if (*errorHandler == NULL)
3844            return NULL;
3845    }
3846
3847    make_encode_exception(exceptionObject,
3848                          encoding, unicode, size, startpos, endpos, reason);
3849    if (*exceptionObject == NULL)
3850        return NULL;
3851
3852    restuple = PyObject_CallFunctionObjArgs(
3853        *errorHandler, *exceptionObject, NULL);
3854    if (restuple == NULL)
3855        return NULL;
3856    if (!PyTuple_Check(restuple)) {
3857        PyErr_Format(PyExc_TypeError, &argparse[4]);
3858        Py_DECREF(restuple);
3859        return NULL;
3860    }
3861    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3862                          &resunicode, newpos)) {
3863        Py_DECREF(restuple);
3864        return NULL;
3865    }
3866    if (*newpos<0)
3867        *newpos = size+*newpos;
3868    if (*newpos<0 || *newpos>size) {
3869        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3870        Py_DECREF(restuple);
3871        return NULL;
3872    }
3873    Py_INCREF(resunicode);
3874    Py_DECREF(restuple);
3875    return resunicode;
3876}
3877
3878static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
3879                                     Py_ssize_t size,
3880                                     const char *errors,
3881                                     int limit)
3882{
3883    /* output object */
3884    PyObject *res;
3885    /* pointers to the beginning and end+1 of input */
3886    const Py_UNICODE *startp = p;
3887    const Py_UNICODE *endp = p + size;
3888    /* pointer to the beginning of the unencodable characters */
3889    /* const Py_UNICODE *badp = NULL; */
3890    /* pointer into the output */
3891    char *str;
3892    /* current output position */
3893    Py_ssize_t ressize;
3894    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3895    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3896    PyObject *errorHandler = NULL;
3897    PyObject *exc = NULL;
3898    /* the following variable is used for caching string comparisons
3899     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3900    int known_errorHandler = -1;
3901
3902    /* allocate enough for a simple encoding without
3903       replacements, if we need more, we'll resize */
3904    if (size == 0)
3905        return PyBytes_FromStringAndSize(NULL, 0);
3906    res = PyBytes_FromStringAndSize(NULL, size);
3907    if (res == NULL)
3908        return NULL;
3909    str = PyBytes_AS_STRING(res);
3910    ressize = size;
3911
3912    while (p<endp) {
3913        Py_UNICODE c = *p;
3914
3915        /* can we encode this? */
3916        if (c<limit) {
3917            /* no overflow check, because we know that the space is enough */
3918            *str++ = (char)c;
3919            ++p;
3920        }
3921        else {
3922            Py_ssize_t unicodepos = p-startp;
3923            Py_ssize_t requiredsize;
3924            PyObject *repunicode;
3925            Py_ssize_t repsize;
3926            Py_ssize_t newpos;
3927            Py_ssize_t respos;
3928            Py_UNICODE *uni2;
3929            /* startpos for collecting unencodable chars */
3930            const Py_UNICODE *collstart = p;
3931            const Py_UNICODE *collend = p;
3932            /* find all unecodable characters */
3933            while ((collend < endp) && ((*collend)>=limit))
3934                ++collend;
3935            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3936            if (known_errorHandler==-1) {
3937                if ((errors==NULL) || (!strcmp(errors, "strict")))
3938                    known_errorHandler = 1;
3939                else if (!strcmp(errors, "replace"))
3940                    known_errorHandler = 2;
3941                else if (!strcmp(errors, "ignore"))
3942                    known_errorHandler = 3;
3943                else if (!strcmp(errors, "xmlcharrefreplace"))
3944                    known_errorHandler = 4;
3945                else
3946                    known_errorHandler = 0;
3947            }
3948            switch (known_errorHandler) {
3949            case 1: /* strict */
3950                raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3951                goto onError;
3952            case 2: /* replace */
3953                while (collstart++<collend)
3954                    *str++ = '?'; /* fall through */
3955            case 3: /* ignore */
3956                p = collend;
3957                break;
3958            case 4: /* xmlcharrefreplace */
3959                respos = str - PyBytes_AS_STRING(res);
3960                /* determine replacement size (temporarily (mis)uses p) */
3961                for (p = collstart, repsize = 0; p < collend; ++p) {
3962                    if (*p<10)
3963                        repsize += 2+1+1;
3964                    else if (*p<100)
3965                        repsize += 2+2+1;
3966                    else if (*p<1000)
3967                        repsize += 2+3+1;
3968                    else if (*p<10000)
3969                        repsize += 2+4+1;
3970#ifndef Py_UNICODE_WIDE
3971                    else
3972                        repsize += 2+5+1;
3973#else
3974                    else if (*p<100000)
3975                        repsize += 2+5+1;
3976                    else if (*p<1000000)
3977                        repsize += 2+6+1;
3978                    else
3979                        repsize += 2+7+1;
3980#endif
3981                }
3982                requiredsize = respos+repsize+(endp-collend);
3983                if (requiredsize > ressize) {
3984                    if (requiredsize<2*ressize)
3985                        requiredsize = 2*ressize;
3986                    if (_PyBytes_Resize(&res, requiredsize))
3987                        goto onError;
3988                    str = PyBytes_AS_STRING(res) + respos;
3989                    ressize = requiredsize;
3990                }
3991                /* generate replacement (temporarily (mis)uses p) */
3992                for (p = collstart; p < collend; ++p) {
3993                    str += sprintf(str, "&#%d;", (int)*p);
3994                }
3995                p = collend;
3996                break;
3997            default:
3998                repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3999                                                              encoding, reason, startp, size, &exc,
4000                                                              collstart-startp, collend-startp, &newpos);
4001                if (repunicode == NULL)
4002                    goto onError;
4003                /* need more space? (at least enough for what we
4004                   have+the replacement+the rest of the string, so
4005                   we won't have to check space for encodable characters) */
4006                respos = str - PyBytes_AS_STRING(res);
4007                repsize = PyUnicode_GET_SIZE(repunicode);
4008                requiredsize = respos+repsize+(endp-collend);
4009                if (requiredsize > ressize) {
4010                    if (requiredsize<2*ressize)
4011                        requiredsize = 2*ressize;
4012                    if (_PyBytes_Resize(&res, requiredsize)) {
4013                        Py_DECREF(repunicode);
4014                        goto onError;
4015                    }
4016                    str = PyBytes_AS_STRING(res) + respos;
4017                    ressize = requiredsize;
4018                }
4019                /* check if there is anything unencodable in the replacement
4020                   and copy it to the output */
4021                for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4022                    c = *uni2;
4023                    if (c >= limit) {
4024                        raise_encode_exception(&exc, encoding, startp, size,
4025                                               unicodepos, unicodepos+1, reason);
4026                        Py_DECREF(repunicode);
4027                        goto onError;
4028                    }
4029                    *str = (char)c;
4030                }
4031                p = startp + newpos;
4032                Py_DECREF(repunicode);
4033            }
4034        }
4035    }
4036    /* Resize if we allocated to much */
4037    size = str - PyBytes_AS_STRING(res);
4038    if (size < ressize) { /* If this falls res will be NULL */
4039        assert(size >= 0);
4040        if (_PyBytes_Resize(&res, size) < 0)
4041            goto onError;
4042    }
4043
4044    Py_XDECREF(errorHandler);
4045    Py_XDECREF(exc);
4046    return res;
4047
4048  onError:
4049    Py_XDECREF(res);
4050    Py_XDECREF(errorHandler);
4051    Py_XDECREF(exc);
4052    return NULL;
4053}
4054
4055PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
4056                                 Py_ssize_t size,
4057                                 const char *errors)
4058{
4059    return unicode_encode_ucs1(p, size, errors, 256);
4060}
4061
4062PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4063{
4064    if (!PyUnicode_Check(unicode)) {
4065        PyErr_BadArgument();
4066        return NULL;
4067    }
4068    return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
4069                                  PyUnicode_GET_SIZE(unicode),
4070                                  NULL);
4071}
4072
4073/* --- 7-bit ASCII Codec -------------------------------------------------- */
4074
4075PyObject *PyUnicode_DecodeASCII(const char *s,
4076                                Py_ssize_t size,
4077                                const char *errors)
4078{
4079    const char *starts = s;
4080    PyUnicodeObject *v;
4081    Py_UNICODE *p;
4082    Py_ssize_t startinpos;
4083    Py_ssize_t endinpos;
4084    Py_ssize_t outpos;
4085    const char *e;
4086    PyObject *errorHandler = NULL;
4087    PyObject *exc = NULL;
4088
4089    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4090    if (size == 1 && *(unsigned char*)s < 128) {
4091        Py_UNICODE r = *(unsigned char*)s;
4092        return PyUnicode_FromUnicode(&r, 1);
4093    }
4094
4095    v = _PyUnicode_New(size);
4096    if (v == NULL)
4097        goto onError;
4098    if (size == 0)
4099        return (PyObject *)v;
4100    p = PyUnicode_AS_UNICODE(v);
4101    e = s + size;
4102    while (s < e) {
4103        register unsigned char c = (unsigned char)*s;
4104        if (c < 128) {
4105            *p++ = c;
4106            ++s;
4107        }
4108        else {
4109            startinpos = s-starts;
4110            endinpos = startinpos + 1;
4111            outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4112            if (unicode_decode_call_errorhandler(
4113                    errors, &errorHandler,
4114                    "ascii", "ordinal not in range(128)",
4115                    &starts, &e, &startinpos, &endinpos, &exc, &s,
4116                    &v, &outpos, &p))
4117                goto onError;
4118        }
4119    }
4120    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
4121        if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4122            goto onError;
4123    Py_XDECREF(errorHandler);
4124    Py_XDECREF(exc);
4125    return (PyObject *)v;
4126
4127  onError:
4128    Py_XDECREF(v);
4129    Py_XDECREF(errorHandler);
4130    Py_XDECREF(exc);
4131    return NULL;
4132}
4133
4134PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
4135                                Py_ssize_t size,
4136                                const char *errors)
4137{
4138    return unicode_encode_ucs1(p, size, errors, 128);
4139}
4140
4141PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4142{
4143    if (!PyUnicode_Check(unicode)) {
4144        PyErr_BadArgument();
4145        return NULL;
4146    }
4147    return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
4148                                 PyUnicode_GET_SIZE(unicode),
4149                                 NULL);
4150}
4151
4152#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
4153
4154/* --- MBCS codecs for Windows -------------------------------------------- */
4155
4156#if SIZEOF_INT < SIZEOF_SSIZE_T
4157#define NEED_RETRY
4158#endif
4159
4160/* XXX This code is limited to "true" double-byte encodings, as
4161   a) it assumes an incomplete character consists of a single byte, and
4162   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
4163   encodings, see IsDBCSLeadByteEx documentation. */
4164
/* Return 1 if the byte at s[offset] is to be treated as a DBCS lead
   byte, 0 otherwise.

   The byte must satisfy IsDBCSLeadByte().  It is then accepted if
   CharPrev() cannot step back from it (it returns the same pointer), if
   the preceding character does not itself start with a lead byte, or if
   the preceding character is two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
4175
4176/*
4177 * Decode MBCS string into unicode object. If 'final' is set, converts
4178 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4179 */
4180static int decode_mbcs(PyUnicodeObject **v,
4181                       const char *s, /* MBCS string */
4182                       int size, /* sizeof MBCS string */
4183                       int final)
4184{
4185    Py_UNICODE *p;
4186    Py_ssize_t n = 0;
4187    int usize = 0;
4188
4189    assert(size >= 0);
4190
4191    /* Skip trailing lead-byte unless 'final' is set */
4192    if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
4193        --size;
4194
4195    /* First get the size of the result */
4196    if (size > 0) {
4197        usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
4198        if (usize == 0) {
4199            PyErr_SetFromWindowsErrWithFilename(0, NULL);
4200            return -1;
4201        }
4202    }
4203
4204    if (*v == NULL) {
4205        /* Create unicode object */
4206        *v = _PyUnicode_New(usize);
4207        if (*v == NULL)
4208            return -1;
4209    }
4210    else {
4211        /* Extend unicode object */
4212        n = PyUnicode_GET_SIZE(*v);
4213        if (_PyUnicode_Resize(v, n + usize) < 0)
4214            return -1;
4215    }
4216
4217    /* Do the conversion */
4218    if (size > 0) {
4219        p = PyUnicode_AS_UNICODE(*v) + n;
4220        if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
4221            PyErr_SetFromWindowsErrWithFilename(0, NULL);
4222            return -1;
4223        }
4224    }
4225
4226    return size;
4227}
4228
4229PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
4230                                       Py_ssize_t size,
4231                                       const char *errors,
4232                                       Py_ssize_t *consumed)
4233{
4234    PyUnicodeObject *v = NULL;
4235    int done;
4236
4237    if (consumed)
4238        *consumed = 0;
4239
4240#ifdef NEED_RETRY
4241  retry:
4242    if (size > INT_MAX)
4243        done = decode_mbcs(&v, s, INT_MAX, 0);
4244    else
4245#endif
4246        done = decode_mbcs(&v, s, (int)size, !consumed);
4247
4248    if (done < 0) {
4249        Py_XDECREF(v);
4250        return NULL;
4251    }
4252
4253    if (consumed)
4254        *consumed += done;
4255
4256#ifdef NEED_RETRY
4257    if (size > INT_MAX) {
4258        s += done;
4259        size -= done;
4260        goto retry;
4261    }
4262#endif
4263
4264    return (PyObject *)v;
4265}
4266
4267PyObject *PyUnicode_DecodeMBCS(const char *s,
4268                               Py_ssize_t size,
4269                               const char *errors)
4270{
4271    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4272}
4273
4274/*
4275 * Convert unicode into string object (MBCS).
4276 * Returns 0 if succeed, -1 otherwise.
4277 */
4278static int encode_mbcs(PyObject **repr,
4279                       const Py_UNICODE *p, /* unicode */
4280                       int size) /* size of unicode */
4281{
4282    int mbcssize = 0;
4283    Py_ssize_t n = 0;
4284
4285    assert(size >= 0);
4286
4287    /* First get the size of the result */
4288    if (size > 0) {
4289        mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4290        if (mbcssize == 0) {
4291            PyErr_SetFromWindowsErrWithFilename(0, NULL);
4292            return -1;
4293        }
4294    }
4295
4296    if (*repr == NULL) {
4297        /* Create string object */
4298        *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4299        if (*repr == NULL)
4300            return -1;
4301    }
4302    else {
4303        /* Extend string object */
4304        n = PyBytes_Size(*repr);
4305        if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4306            return -1;
4307    }
4308
4309    /* Do the conversion */
4310    if (size > 0) {
4311        char *s = PyBytes_AS_STRING(*repr) + n;
4312        if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4313            PyErr_SetFromWindowsErrWithFilename(0, NULL);
4314            return -1;
4315        }
4316    }
4317
4318    return 0;
4319}
4320
4321PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
4322                               Py_ssize_t size,
4323                               const char *errors)
4324{
4325    PyObject *repr = NULL;
4326    int ret;
4327
4328#ifdef NEED_RETRY
4329  retry:
4330    if (size > INT_MAX)
4331        ret = encode_mbcs(&repr, p, INT_MAX);
4332    else
4333#endif
4334        ret = encode_mbcs(&repr, p, (int)size);
4335
4336    if (ret < 0) {
4337        Py_XDECREF(repr);
4338        return NULL;
4339    }
4340
4341#ifdef NEED_RETRY
4342    if (size > INT_MAX) {
4343        p += INT_MAX;
4344        size -= INT_MAX;
4345        goto retry;
4346    }
4347#endif
4348
4349    return repr;
4350}
4351
4352PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4353{
4354    if (!PyUnicode_Check(unicode)) {
4355        PyErr_BadArgument();
4356        return NULL;
4357    }
4358    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
4359                                PyUnicode_GET_SIZE(unicode),
4360                                NULL);
4361}
4362
4363#undef NEED_RETRY
4364
4365#endif /* MS_WINDOWS */
4366
4367/* --- Character Mapping Codec -------------------------------------------- */
4368
4369PyObject *PyUnicode_DecodeCharmap(const char *s,
4370                                  Py_ssize_t size,
4371                                  PyObject *mapping,
4372                                  const char *errors)
4373{
4374    const char *starts = s;
4375    Py_ssize_t startinpos;
4376    Py_ssize_t endinpos;
4377    Py_ssize_t outpos;
4378    const char *e;
4379    PyUnicodeObject *v;
4380    Py_UNICODE *p;
4381    Py_ssize_t extrachars = 0;
4382    PyObject *errorHandler = NULL;
4383    PyObject *exc = NULL;
4384    Py_UNICODE *mapstring = NULL;
4385    Py_ssize_t maplen = 0;
4386
4387    /* Default to Latin-1 */
4388    if (mapping == NULL)
4389        return PyUnicode_DecodeLatin1(s, size, errors);
4390
4391    v = _PyUnicode_New(size);
4392    if (v == NULL)
4393        goto onError;
4394    if (size == 0)
4395        return (PyObject *)v;
4396    p = PyUnicode_AS_UNICODE(v);
4397    e = s + size;
4398    if (PyUnicode_CheckExact(mapping)) {
4399        mapstring = PyUnicode_AS_UNICODE(mapping);
4400        maplen = PyUnicode_GET_SIZE(mapping);
4401        while (s < e) {
4402            unsigned char ch = *s;
4403            Py_UNICODE x = 0xfffe; /* illegal value */
4404
4405            if (ch < maplen)
4406                x = mapstring[ch];
4407
4408            if (x == 0xfffe) {
4409                /* undefined mapping */
4410                outpos = p-PyUnicode_AS_UNICODE(v);
4411                startinpos = s-starts;
4412                endinpos = startinpos+1;
4413                if (unicode_decode_call_errorhandler(
4414                        errors, &errorHandler,
4415                        "charmap", "character maps to <undefined>",
4416                        &starts, &e, &startinpos, &endinpos, &exc, &s,
4417                        &v, &outpos, &p)) {
4418                    goto onError;
4419                }
4420                continue;
4421            }
4422            *p++ = x;
4423            ++s;
4424        }
4425    }
4426    else {
4427        while (s < e) {
4428            unsigned char ch = *s;
4429            PyObject *w, *x;
4430
4431            /* Get mapping (char ordinal -> integer, Unicode char or None) */
4432            w = PyLong_FromLong((long)ch);
4433            if (w == NULL)
4434                goto onError;
4435            x = PyObject_GetItem(mapping, w);
4436            Py_DECREF(w);
4437            if (x == NULL) {
4438                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4439                    /* No mapping found means: mapping is undefined. */
4440                    PyErr_Clear();
4441                    x = Py_None;
4442                    Py_INCREF(x);
4443                } else
4444                    goto onError;
4445            }
4446
4447            /* Apply mapping */
4448            if (PyLong_Check(x)) {
4449                long value = PyLong_AS_LONG(x);
4450                if (value < 0 || value > 65535) {
4451                    PyErr_SetString(PyExc_TypeError,
4452                                    "character mapping must be in range(65536)");
4453                    Py_DECREF(x);
4454                    goto onError;
4455                }
4456                *p++ = (Py_UNICODE)value;
4457            }
4458            else if (x == Py_None) {
4459                /* undefined mapping */
4460                outpos = p-PyUnicode_AS_UNICODE(v);
4461                startinpos = s-starts;
4462                endinpos = startinpos+1;
4463                if (unicode_decode_call_errorhandler(
4464                        errors, &errorHandler,
4465                        "charmap", "character maps to <undefined>",
4466                        &starts, &e, &startinpos, &endinpos, &exc, &s,
4467                        &v, &outpos, &p)) {
4468                    Py_DECREF(x);
4469                    goto onError;
4470                }
4471                Py_DECREF(x);
4472                continue;
4473            }
4474            else if (PyUnicode_Check(x)) {
4475                Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
4476
4477                if (targetsize == 1)
4478                    /* 1-1 mapping */
4479                    *p++ = *PyUnicode_AS_UNICODE(x);
4480
4481                else if (targetsize > 1) {
4482                    /* 1-n mapping */
4483                    if (targetsize > extrachars) {
4484                        /* resize first */
4485                        Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4486                        Py_ssize_t needed = (targetsize - extrachars) + \
4487                            (targetsize << 2);
4488                        extrachars += needed;
4489                        /* XXX overflow detection missing */
4490                        if (_PyUnicode_Resize(&v,
4491                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
4492                            Py_DECREF(x);
4493                            goto onError;
4494                        }
4495                        p = PyUnicode_AS_UNICODE(v) + oldpos;
4496                    }
4497                    Py_UNICODE_COPY(p,
4498                                    PyUnicode_AS_UNICODE(x),
4499                                    targetsize);
4500                    p += targetsize;
4501                    extrachars -= targetsize;
4502                }
4503                /* 1-0 mapping: skip the character */
4504            }
4505            else {
4506                /* wrong return value */
4507                PyErr_SetString(PyExc_TypeError,
4508                                "character mapping must return integer, None or str");
4509                Py_DECREF(x);
4510                goto onError;
4511            }
4512            Py_DECREF(x);
4513            ++s;
4514        }
4515    }
4516    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
4517        if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4518            goto onError;
4519    Py_XDECREF(errorHandler);
4520    Py_XDECREF(exc);
4521    return (PyObject *)v;
4522
4523  onError:
4524    Py_XDECREF(errorHandler);
4525    Py_XDECREF(exc);
4526    Py_XDECREF(v);
4527    return NULL;
4528}
4529
4530/* Charmap encoding: the lookup table */
4531
4532struct encoding_map{
4533    PyObject_HEAD
4534    unsigned char level1[32];
4535    int count2, count3;
4536    unsigned char level23[1];
4537};
4538
4539static PyObject*
4540encoding_map_size(PyObject *obj, PyObject* args)
4541{
4542    struct encoding_map *map = (struct encoding_map*)obj;
4543    return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4544                           128*map->count3);
4545}
4546
4547static PyMethodDef encoding_map_methods[] = {
4548    {"size", encoding_map_size, METH_NOARGS,
4549     PyDoc_STR("Return the size (in bytes) of this object") },
4550    { 0 }
4551};
4552
4553static void
4554encoding_map_dealloc(PyObject* o)
4555{
4556    PyObject_FREE(o);
4557}
4558
4559static PyTypeObject EncodingMapType = {
4560    PyVarObject_HEAD_INIT(NULL, 0)
4561    "EncodingMap",          /*tp_name*/
4562    sizeof(struct encoding_map),   /*tp_basicsize*/
4563    0,                      /*tp_itemsize*/
4564    /* methods */
4565    encoding_map_dealloc,   /*tp_dealloc*/
4566    0,                      /*tp_print*/
4567    0,                      /*tp_getattr*/
4568    0,                      /*tp_setattr*/
4569    0,                      /*tp_reserved*/
4570    0,                      /*tp_repr*/
4571    0,                      /*tp_as_number*/
4572    0,                      /*tp_as_sequence*/
4573    0,                      /*tp_as_mapping*/
4574    0,                      /*tp_hash*/
4575    0,                      /*tp_call*/
4576    0,                      /*tp_str*/
4577    0,                      /*tp_getattro*/
4578    0,                      /*tp_setattro*/
4579    0,                      /*tp_as_buffer*/
4580    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
4581    0,                      /*tp_doc*/
4582    0,                      /*tp_traverse*/
4583    0,                      /*tp_clear*/
4584    0,                      /*tp_richcompare*/
4585    0,                      /*tp_weaklistoffset*/
4586    0,                      /*tp_iter*/
4587    0,                      /*tp_iternext*/
4588    encoding_map_methods,   /*tp_methods*/
4589    0,                      /*tp_members*/
4590    0,                      /*tp_getset*/
4591    0,                      /*tp_base*/
4592    0,                      /*tp_dict*/
4593    0,                      /*tp_descr_get*/
4594    0,                      /*tp_descr_set*/
4595    0,                      /*tp_dictoffset*/
4596    0,                      /*tp_init*/
4597    0,                      /*tp_alloc*/
4598    0,                      /*tp_new*/
4599    0,                      /*tp_free*/
4600    0,                      /*tp_is_gc*/
4601};
4602
4603PyObject*
4604PyUnicode_BuildEncodingMap(PyObject* string)
4605{
4606    Py_UNICODE *decode;
4607    PyObject *result;
4608    struct encoding_map *mresult;
4609    int i;
4610    int need_dict = 0;
4611    unsigned char level1[32];
4612    unsigned char level2[512];
4613    unsigned char *mlevel1, *mlevel2, *mlevel3;
4614    int count2 = 0, count3 = 0;
4615
4616    if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4617        PyErr_BadArgument();
4618        return NULL;
4619    }
4620    decode = PyUnicode_AS_UNICODE(string);
4621    memset(level1, 0xFF, sizeof level1);
4622    memset(level2, 0xFF, sizeof level2);
4623
4624    /* If there isn't a one-to-one mapping of NULL to \0,
4625       or if there are non-BMP characters, we need to use
4626       a mapping dictionary. */
4627    if (decode[0] != 0)
4628        need_dict = 1;
4629    for (i = 1; i < 256; i++) {
4630        int l1, l2;
4631        if (decode[i] == 0
4632#ifdef Py_UNICODE_WIDE
4633            || decode[i] > 0xFFFF
4634#endif
4635            ) {
4636            need_dict = 1;
4637            break;
4638        }
4639        if (decode[i] == 0xFFFE)
4640            /* unmapped character */
4641            continue;
4642        l1 = decode[i] >> 11;
4643        l2 = decode[i] >> 7;
4644        if (level1[l1] == 0xFF)
4645            level1[l1] = count2++;
4646        if (level2[l2] == 0xFF)
4647            level2[l2] = count3++;
4648    }
4649
4650    if (count2 >= 0xFF || count3 >= 0xFF)
4651        need_dict = 1;
4652
4653    if (need_dict) {
4654        PyObject *result = PyDict_New();
4655        PyObject *key, *value;
4656        if (!result)
4657            return NULL;
4658        for (i = 0; i < 256; i++) {
4659            key = value = NULL;
4660            key = PyLong_FromLong(decode[i]);
4661            value = PyLong_FromLong(i);
4662            if (!key || !value)
4663                goto failed1;
4664            if (PyDict_SetItem(result, key, value) == -1)
4665                goto failed1;
4666            Py_DECREF(key);
4667            Py_DECREF(value);
4668        }
4669        return result;
4670      failed1:
4671        Py_XDECREF(key);
4672        Py_XDECREF(value);
4673        Py_DECREF(result);
4674        return NULL;
4675    }
4676
4677    /* Create a three-level trie */
4678    result = PyObject_MALLOC(sizeof(struct encoding_map) +
4679                             16*count2 + 128*count3 - 1);
4680    if (!result)
4681        return PyErr_NoMemory();
4682    PyObject_Init(result, &EncodingMapType);
4683    mresult = (struct encoding_map*)result;
4684    mresult->count2 = count2;
4685    mresult->count3 = count3;
4686    mlevel1 = mresult->level1;
4687    mlevel2 = mresult->level23;
4688    mlevel3 = mresult->level23 + 16*count2;
4689    memcpy(mlevel1, level1, 32);
4690    memset(mlevel2, 0xFF, 16*count2);
4691    memset(mlevel3, 0, 128*count3);
4692    count3 = 0;
4693    for (i = 1; i < 256; i++) {
4694        int o1, o2, o3, i2, i3;
4695        if (decode[i] == 0xFFFE)
4696            /* unmapped character */
4697            continue;
4698        o1 = decode[i]>>11;
4699        o2 = (decode[i]>>7) & 0xF;
4700        i2 = 16*mlevel1[o1] + o2;
4701        if (mlevel2[i2] == 0xFF)
4702            mlevel2[i2] = count3++;
4703        o3 = decode[i] & 0x7F;
4704        i3 = 128*mlevel2[i2] + o3;
4705        mlevel3[i3] = i;
4706    }
4707    return result;
4708}
4709
4710static int
4711encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4712{
4713    struct encoding_map *map = (struct encoding_map*)mapping;
4714    int l1 = c>>11;
4715    int l2 = (c>>7) & 0xF;
4716    int l3 = c & 0x7F;
4717    int i;
4718
4719#ifdef Py_UNICODE_WIDE
4720    if (c > 0xFFFF) {
4721        return -1;
4722    }
4723#endif
4724    if (c == 0)
4725        return 0;
4726    /* level 1*/
4727    i = map->level1[l1];
4728    if (i == 0xFF) {
4729        return -1;
4730    }
4731    /* level 2*/
4732    i = map->level23[16*i+l2];
4733    if (i == 0xFF) {
4734        return -1;
4735    }
4736    /* level 3 */
4737    i = map->level23[16*map->count2 + 128*i + l3];
4738    if (i == 0) {
4739        return -1;
4740    }
4741    return i;
4742}
4743
4744/* Lookup the character ch in the mapping. If the character
4745   can't be found, Py_None is returned (or NULL, if another
4746   error occurred). */
4747static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
4748{
4749    PyObject *w = PyLong_FromLong((long)c);
4750    PyObject *x;
4751
4752    if (w == NULL)
4753        return NULL;
4754    x = PyObject_GetItem(mapping, w);
4755    Py_DECREF(w);
4756    if (x == NULL) {
4757        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4758            /* No mapping found means: mapping is undefined. */
4759            PyErr_Clear();
4760            x = Py_None;
4761            Py_INCREF(x);
4762            return x;
4763        } else
4764            return NULL;
4765    }
4766    else if (x == Py_None)
4767        return x;
4768    else if (PyLong_Check(x)) {
4769        long value = PyLong_AS_LONG(x);
4770        if (value < 0 || value > 255) {
4771            PyErr_SetString(PyExc_TypeError,
4772                            "character mapping must be in range(256)");
4773            Py_DECREF(x);
4774            return NULL;
4775        }
4776        return x;
4777    }
4778    else if (PyBytes_Check(x))
4779        return x;
4780    else {
4781        /* wrong return value */
4782        PyErr_Format(PyExc_TypeError,
4783                     "character mapping must return integer, bytes or None, not %.400s",
4784                     x->ob_type->tp_name);
4785        Py_DECREF(x);
4786        return NULL;
4787    }
4788}
4789
4790static int
4791charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4792{
4793    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
4794    /* exponentially overallocate to minimize reallocations */
4795    if (requiredsize < 2*outsize)
4796        requiredsize = 2*outsize;
4797    if (_PyBytes_Resize(outobj, requiredsize))
4798        return -1;
4799    return 0;
4800}
4801
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was encoded and written */
    enc_FAILED,         /* the mapping has no entry for the character */
    enc_EXCEPTION       /* the lookup or a reallocation failed */
} charmapencode_result;
4805/* lookup the character, put the result in the output string and adjust
4806   various state variables. Resize the output bytes object if not enough
4807   space is available. Return a new reference to the object that
4808   was put in the output buffer, or Py_None, if the mapping was undefined
4809   (in which case no character was written) or NULL, if a
4810   reallocation error occurred. The caller must decref the result */
4811static
4812charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
4813                                          PyObject **outobj, Py_ssize_t *outpos)
4814{
4815    PyObject *rep;
4816    char *outstart;
4817    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
4818
4819    if (Py_TYPE(mapping) == &EncodingMapType) {
4820        int res = encoding_map_lookup(c, mapping);
4821        Py_ssize_t requiredsize = *outpos+1;
4822        if (res == -1)
4823            return enc_FAILED;
4824        if (outsize<requiredsize)
4825            if (charmapencode_resize(outobj, outpos, requiredsize))
4826                return enc_EXCEPTION;
4827        outstart = PyBytes_AS_STRING(*outobj);
4828        outstart[(*outpos)++] = (char)res;
4829        return enc_SUCCESS;
4830    }
4831
4832    rep = charmapencode_lookup(c, mapping);
4833    if (rep==NULL)
4834        return enc_EXCEPTION;
4835    else if (rep==Py_None) {
4836        Py_DECREF(rep);
4837        return enc_FAILED;
4838    } else {
4839        if (PyLong_Check(rep)) {
4840            Py_ssize_t requiredsize = *outpos+1;
4841            if (outsize<requiredsize)
4842                if (charmapencode_resize(outobj, outpos, requiredsize)) {
4843                    Py_DECREF(rep);
4844                    return enc_EXCEPTION;
4845                }
4846            outstart = PyBytes_AS_STRING(*outobj);
4847            outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
4848        }
4849        else {
4850            const char *repchars = PyBytes_AS_STRING(rep);
4851            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
4852            Py_ssize_t requiredsize = *outpos+repsize;
4853            if (outsize<requiredsize)
4854                if (charmapencode_resize(outobj, outpos, requiredsize)) {
4855                    Py_DECREF(rep);
4856                    return enc_EXCEPTION;
4857                }
4858            outstart = PyBytes_AS_STRING(*outobj);
4859            memcpy(outstart + *outpos, repchars, repsize);
4860            *outpos += repsize;
4861        }
4862    }
4863    Py_DECREF(rep);
4864    return enc_SUCCESS;
4865}
4866
4867/* handle an error in PyUnicode_EncodeCharmap
4868   Return 0 on success, -1 on error */
4869static
4870int charmap_encoding_error(
4871    const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
4872    PyObject **exceptionObject,
4873    int *known_errorHandler, PyObject **errorHandler, const char *errors,
4874    PyObject **res, Py_ssize_t *respos)
4875{
4876    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4877    Py_ssize_t repsize;
4878    Py_ssize_t newpos;
4879    Py_UNICODE *uni2;
4880    /* startpos for collecting unencodable chars */
4881    Py_ssize_t collstartpos = *inpos;
4882    Py_ssize_t collendpos = *inpos+1;
4883    Py_ssize_t collpos;
4884    char *encoding = "charmap";
4885    char *reason = "character maps to <undefined>";
4886    charmapencode_result x;
4887
4888    /* find all unencodable characters */
4889    while (collendpos < size) {
4890        PyObject *rep;
4891        if (Py_TYPE(mapping) == &EncodingMapType) {
4892            int res = encoding_map_lookup(p[collendpos], mapping);
4893            if (res != -1)
4894                break;
4895            ++collendpos;
4896            continue;
4897        }
4898
4899        rep = charmapencode_lookup(p[collendpos], mapping);
4900        if (rep==NULL)
4901            return -1;
4902        else if (rep!=Py_None) {
4903            Py_DECREF(rep);
4904            break;
4905        }
4906        Py_DECREF(rep);
4907        ++collendpos;
4908    }
4909    /* cache callback name lookup
4910     * (if not done yet, i.e. it's the first error) */
4911    if (*known_errorHandler==-1) {
4912        if ((errors==NULL) || (!strcmp(errors, "strict")))
4913            *known_errorHandler = 1;
4914        else if (!strcmp(errors, "replace"))
4915            *known_errorHandler = 2;
4916        else if (!strcmp(errors, "ignore"))
4917            *known_errorHandler = 3;
4918        else if (!strcmp(errors, "xmlcharrefreplace"))
4919            *known_errorHandler = 4;
4920        else
4921            *known_errorHandler = 0;
4922    }
4923    switch (*known_errorHandler) {
4924    case 1: /* strict */
4925        raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4926        return -1;
4927    case 2: /* replace */
4928        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4929            x = charmapencode_output('?', mapping, res, respos);
4930            if (x==enc_EXCEPTION) {
4931                return -1;
4932            }
4933            else if (x==enc_FAILED) {
4934                raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4935                return -1;
4936            }
4937        }
4938        /* fall through */
4939    case 3: /* ignore */
4940        *inpos = collendpos;
4941        break;
4942    case 4: /* xmlcharrefreplace */
4943        /* generate replacement (temporarily (mis)uses p) */
4944        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4945            char buffer[2+29+1+1];
4946            char *cp;
4947            sprintf(buffer, "&#%d;", (int)p[collpos]);
4948            for (cp = buffer; *cp; ++cp) {
4949                x = charmapencode_output(*cp, mapping, res, respos);
4950                if (x==enc_EXCEPTION)
4951                    return -1;
4952                else if (x==enc_FAILED) {
4953                    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4954                    return -1;
4955                }
4956            }
4957        }
4958        *inpos = collendpos;
4959        break;
4960    default:
4961        repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
4962                                                      encoding, reason, p, size, exceptionObject,
4963                                                      collstartpos, collendpos, &newpos);
4964        if (repunicode == NULL)
4965            return -1;
4966        /* generate replacement  */
4967        repsize = PyUnicode_GET_SIZE(repunicode);
4968        for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4969            x = charmapencode_output(*uni2, mapping, res, respos);
4970            if (x==enc_EXCEPTION) {
4971                return -1;
4972            }
4973            else if (x==enc_FAILED) {
4974                Py_DECREF(repunicode);
4975                raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4976                return -1;
4977            }
4978        }
4979        *inpos = newpos;
4980        Py_DECREF(repunicode);
4981    }
4982    return 0;
4983}
4984
4985PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
4986                                  Py_ssize_t size,
4987                                  PyObject *mapping,
4988                                  const char *errors)
4989{
4990    /* output object */
4991    PyObject *res = NULL;
4992    /* current input position */
4993    Py_ssize_t inpos = 0;
4994    /* current output position */
4995    Py_ssize_t respos = 0;
4996    PyObject *errorHandler = NULL;
4997    PyObject *exc = NULL;
4998    /* the following variable is used for caching string comparisons
4999     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5000     * 3=ignore, 4=xmlcharrefreplace */
5001    int known_errorHandler = -1;
5002
5003    /* Default to Latin-1 */
5004    if (mapping == NULL)
5005        return PyUnicode_EncodeLatin1(p, size, errors);
5006
5007    /* allocate enough for a simple encoding without
5008       replacements, if we need more, we'll resize */
5009    res = PyBytes_FromStringAndSize(NULL, size);
5010    if (res == NULL)
5011        goto onError;
5012    if (size == 0)
5013        return res;
5014
5015    while (inpos<size) {
5016        /* try to encode it */
5017        charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5018        if (x==enc_EXCEPTION) /* error */
5019            goto onError;
5020        if (x==enc_FAILED) { /* unencodable character */
5021            if (charmap_encoding_error(p, size, &inpos, mapping,
5022                                       &exc,
5023                                       &known_errorHandler, &errorHandler, errors,
5024                                       &res, &respos)) {
5025                goto onError;
5026            }
5027        }
5028        else
5029            /* done with this character => adjust input position */
5030            ++inpos;
5031    }
5032
5033    /* Resize if we allocated to much */
5034    if (respos<PyBytes_GET_SIZE(res))
5035        if (_PyBytes_Resize(&res, respos) < 0)
5036            goto onError;
5037
5038    Py_XDECREF(exc);
5039    Py_XDECREF(errorHandler);
5040    return res;
5041
5042  onError:
5043    Py_XDECREF(res);
5044    Py_XDECREF(exc);
5045    Py_XDECREF(errorHandler);
5046    return NULL;
5047}
5048
5049PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
5050                                    PyObject *mapping)
5051{
5052    if (!PyUnicode_Check(unicode) || mapping == NULL) {
5053        PyErr_BadArgument();
5054        return NULL;
5055    }
5056    return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
5057                                   PyUnicode_GET_SIZE(unicode),
5058                                   mapping,
5059                                   NULL);
5060}
5061
5062/* create or adjust a UnicodeTranslateError */
5063static void make_translate_exception(PyObject **exceptionObject,
5064                                     const Py_UNICODE *unicode, Py_ssize_t size,
5065                                     Py_ssize_t startpos, Py_ssize_t endpos,
5066                                     const char *reason)
5067{
5068    if (*exceptionObject == NULL) {
5069        *exceptionObject = PyUnicodeTranslateError_Create(
5070            unicode, size, startpos, endpos, reason);
5071    }
5072    else {
5073        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5074            goto onError;
5075        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5076            goto onError;
5077        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5078            goto onError;
5079        return;
5080      onError:
5081        Py_DECREF(*exceptionObject);
5082        *exceptionObject = NULL;
5083    }
5084}
5085
5086/* raises a UnicodeTranslateError */
5087static void raise_translate_exception(PyObject **exceptionObject,
5088                                      const Py_UNICODE *unicode, Py_ssize_t size,
5089                                      Py_ssize_t startpos, Py_ssize_t endpos,
5090                                      const char *reason)
5091{
5092    make_translate_exception(exceptionObject,
5093                             unicode, size, startpos, endpos, reason);
5094    if (*exceptionObject != NULL)
5095        PyCodec_StrictErrors(*exceptionObject);
5096}
5097
5098/* error handling callback helper:
5099   build arguments, call the callback and check the arguments,
5100   put the result into newpos and return the replacement string, which
5101   has to be freed by the caller */
5102static PyObject *unicode_translate_call_errorhandler(const char *errors,
5103                                                     PyObject **errorHandler,
5104                                                     const char *reason,
5105                                                     const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5106                                                     Py_ssize_t startpos, Py_ssize_t endpos,
5107                                                     Py_ssize_t *newpos)
5108{
5109    static char *argparse = "O!n;translating error handler must return (str, int) tuple";
5110
5111    Py_ssize_t i_newpos;
5112    PyObject *restuple;
5113    PyObject *resunicode;
5114
5115    if (*errorHandler == NULL) {
5116        *errorHandler = PyCodec_LookupError(errors);
5117        if (*errorHandler == NULL)
5118            return NULL;
5119    }
5120
5121    make_translate_exception(exceptionObject,
5122                             unicode, size, startpos, endpos, reason);
5123    if (*exceptionObject == NULL)
5124        return NULL;
5125
5126    restuple = PyObject_CallFunctionObjArgs(
5127        *errorHandler, *exceptionObject, NULL);
5128    if (restuple == NULL)
5129        return NULL;
5130    if (!PyTuple_Check(restuple)) {
5131        PyErr_Format(PyExc_TypeError, &argparse[4]);
5132        Py_DECREF(restuple);
5133        return NULL;
5134    }
5135    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
5136                          &resunicode, &i_newpos)) {
5137        Py_DECREF(restuple);
5138        return NULL;
5139    }
5140    if (i_newpos<0)
5141        *newpos = size+i_newpos;
5142    else
5143        *newpos = i_newpos;
5144    if (*newpos<0 || *newpos>size) {
5145        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5146        Py_DECREF(restuple);
5147        return NULL;
5148    }
5149    Py_INCREF(resunicode);
5150    Py_DECREF(restuple);
5151    return resunicode;
5152}
5153
5154/* Lookup the character ch in the mapping and put the result in result,
5155   which must be decrefed by the caller.
5156   Return 0 on success, -1 on error */
5157static
5158int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5159{
5160    PyObject *w = PyLong_FromLong((long)c);
5161    PyObject *x;
5162
5163    if (w == NULL)
5164        return -1;
5165    x = PyObject_GetItem(mapping, w);
5166    Py_DECREF(w);
5167    if (x == NULL) {
5168        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5169            /* No mapping found means: use 1:1 mapping. */
5170            PyErr_Clear();
5171            *result = NULL;
5172            return 0;
5173        } else
5174            return -1;
5175    }
5176    else if (x == Py_None) {
5177        *result = x;
5178        return 0;
5179    }
5180    else if (PyLong_Check(x)) {
5181        long value = PyLong_AS_LONG(x);
5182        long max = PyUnicode_GetMax();
5183        if (value < 0 || value > max) {
5184            PyErr_Format(PyExc_TypeError,
5185                         "character mapping must be in range(0x%x)", max+1);
5186            Py_DECREF(x);
5187            return -1;
5188        }
5189        *result = x;
5190        return 0;
5191    }
5192    else if (PyUnicode_Check(x)) {
5193        *result = x;
5194        return 0;
5195    }
5196    else {
5197        /* wrong return value */
5198        PyErr_SetString(PyExc_TypeError,
5199                        "character mapping must return integer, None or str");
5200        Py_DECREF(x);
5201        return -1;
5202    }
5203}
5204/* ensure that *outobj is at least requiredsize characters long,
5205   if not reallocate and adjust various state variables.
5206   Return 0 on success, -1 on error */
5207static
5208int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
5209                               Py_ssize_t requiredsize)
5210{
5211    Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
5212    if (requiredsize > oldsize) {
5213        /* remember old output position */
5214        Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5215        /* exponentially overallocate to minimize reallocations */
5216        if (requiredsize < 2 * oldsize)
5217            requiredsize = 2 * oldsize;
5218        if (PyUnicode_Resize(outobj, requiredsize) < 0)
5219            return -1;
5220        *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
5221    }
5222    return 0;
5223}
5224/* lookup the character, put the result in the output string and adjust
5225   various state variables. Return a new reference to the object that
5226   was put in the output buffer in *result, or Py_None, if the mapping was
5227   undefined (in which case no character was written).
5228   The called must decref result.
5229   Return 0 on success, -1 on error. */
5230static
5231int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
5232                            Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5233                            PyObject **res)
5234{
5235    if (charmaptranslate_lookup(*curinp, mapping, res))
5236        return -1;
5237    if (*res==NULL) {
5238        /* not found => default to 1:1 mapping */
5239        *(*outp)++ = *curinp;
5240    }
5241    else if (*res==Py_None)
5242        ;
5243    else if (PyLong_Check(*res)) {
5244        /* no overflow check, because we know that the space is enough */
5245        *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
5246    }
5247    else if (PyUnicode_Check(*res)) {
5248        Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5249        if (repsize==1) {
5250            /* no overflow check, because we know that the space is enough */
5251            *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5252        }
5253        else if (repsize!=0) {
5254            /* more than one character */
5255            Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5256                (insize - (curinp-startinp)) +
5257                repsize - 1;
5258            if (charmaptranslate_makespace(outobj, outp, requiredsize))
5259                return -1;
5260            memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5261            *outp += repsize;
5262        }
5263    }
5264    else
5265        return -1;
5266    return 0;
5267}
5268
5269PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
5270                                     Py_ssize_t size,
5271                                     PyObject *mapping,
5272                                     const char *errors)
5273{
5274    /* output object */
5275    PyObject *res = NULL;
5276    /* pointers to the beginning and end+1 of input */
5277    const Py_UNICODE *startp = p;
5278    const Py_UNICODE *endp = p + size;
5279    /* pointer into the output */
5280    Py_UNICODE *str;
5281    /* current output position */
5282    Py_ssize_t respos = 0;
5283    char *reason = "character maps to <undefined>";
5284    PyObject *errorHandler = NULL;
5285    PyObject *exc = NULL;
5286    /* the following variable is used for caching string comparisons
5287     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5288     * 3=ignore, 4=xmlcharrefreplace */
5289    int known_errorHandler = -1;
5290
5291    if (mapping == NULL) {
5292        PyErr_BadArgument();
5293        return NULL;
5294    }
5295
5296    /* allocate enough for a simple 1:1 translation without
5297       replacements, if we need more, we'll resize */
5298    res = PyUnicode_FromUnicode(NULL, size);
5299    if (res == NULL)
5300        goto onError;
5301    if (size == 0)
5302        return res;
5303    str = PyUnicode_AS_UNICODE(res);
5304
5305    while (p<endp) {
5306        /* try to encode it */
5307        PyObject *x = NULL;
5308        if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5309            Py_XDECREF(x);
5310            goto onError;
5311        }
5312        Py_XDECREF(x);
5313        if (x!=Py_None) /* it worked => adjust input pointer */
5314            ++p;
5315        else { /* untranslatable character */
5316            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5317            Py_ssize_t repsize;
5318            Py_ssize_t newpos;
5319            Py_UNICODE *uni2;
5320            /* startpos for collecting untranslatable chars */
5321            const Py_UNICODE *collstart = p;
5322            const Py_UNICODE *collend = p+1;
5323            const Py_UNICODE *coll;
5324
5325            /* find all untranslatable characters */
5326            while (collend < endp) {
5327                if (charmaptranslate_lookup(*collend, mapping, &x))
5328                    goto onError;
5329                Py_XDECREF(x);
5330                if (x!=Py_None)
5331                    break;
5332                ++collend;
5333            }
5334            /* cache callback name lookup
5335             * (if not done yet, i.e. it's the first error) */
5336            if (known_errorHandler==-1) {
5337                if ((errors==NULL) || (!strcmp(errors, "strict")))
5338                    known_errorHandler = 1;
5339                else if (!strcmp(errors, "replace"))
5340                    known_errorHandler = 2;
5341                else if (!strcmp(errors, "ignore"))
5342                    known_errorHandler = 3;
5343                else if (!strcmp(errors, "xmlcharrefreplace"))
5344                    known_errorHandler = 4;
5345                else
5346                    known_errorHandler = 0;
5347            }
5348            switch (known_errorHandler) {
5349            case 1: /* strict */
5350                raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
5351                goto onError;
5352            case 2: /* replace */
5353                /* No need to check for space, this is a 1:1 replacement */
5354                for (coll = collstart; coll<collend; ++coll)
5355                    *str++ = '?';
5356                /* fall through */
5357            case 3: /* ignore */
5358                p = collend;
5359                break;
5360            case 4: /* xmlcharrefreplace */
5361                /* generate replacement (temporarily (mis)uses p) */
5362                for (p = collstart; p < collend; ++p) {
5363                    char buffer[2+29+1+1];
5364                    char *cp;
5365                    sprintf(buffer, "&#%d;", (int)*p);
5366                    if (charmaptranslate_makespace(&res, &str,
5367                                                   (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5368                        goto onError;
5369                    for (cp = buffer; *cp; ++cp)
5370                        *str++ = *cp;
5371                }
5372                p = collend;
5373                break;
5374            default:
5375                repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5376                                                                 reason, startp, size, &exc,
5377                                                                 collstart-startp, collend-startp, &newpos);
5378                if (repunicode == NULL)
5379                    goto onError;
5380                /* generate replacement  */
5381                repsize = PyUnicode_GET_SIZE(repunicode);
5382                if (charmaptranslate_makespace(&res, &str,
5383                                               (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5384                    Py_DECREF(repunicode);
5385                    goto onError;
5386                }
5387                for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5388                    *str++ = *uni2;
5389                p = startp + newpos;
5390                Py_DECREF(repunicode);
5391            }
5392        }
5393    }
5394    /* Resize if we allocated to much */
5395    respos = str-PyUnicode_AS_UNICODE(res);
5396    if (respos<PyUnicode_GET_SIZE(res)) {
5397        if (PyUnicode_Resize(&res, respos) < 0)
5398            goto onError;
5399    }
5400    Py_XDECREF(exc);
5401    Py_XDECREF(errorHandler);
5402    return res;
5403
5404  onError:
5405    Py_XDECREF(res);
5406    Py_XDECREF(exc);
5407    Py_XDECREF(errorHandler);
5408    return NULL;
5409}
5410
5411PyObject *PyUnicode_Translate(PyObject *str,
5412                              PyObject *mapping,
5413                              const char *errors)
5414{
5415    PyObject *result;
5416
5417    str = PyUnicode_FromObject(str);
5418    if (str == NULL)
5419        goto onError;
5420    result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5421                                        PyUnicode_GET_SIZE(str),
5422                                        mapping,
5423                                        errors);
5424    Py_DECREF(str);
5425    return result;
5426
5427  onError:
5428    Py_XDECREF(str);
5429    return NULL;
5430}
5431
5432/* --- Decimal Encoder ---------------------------------------------------- */
5433
5434int PyUnicode_EncodeDecimal(Py_UNICODE *s,
5435                            Py_ssize_t length,
5436                            char *output,
5437                            const char *errors)
5438{
5439    Py_UNICODE *p, *end;
5440    PyObject *errorHandler = NULL;
5441    PyObject *exc = NULL;
5442    const char *encoding = "decimal";
5443    const char *reason = "invalid decimal Unicode string";
5444    /* the following variable is used for caching string comparisons
5445     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5446    int known_errorHandler = -1;
5447
5448    if (output == NULL) {
5449        PyErr_BadArgument();
5450        return -1;
5451    }
5452
5453    p = s;
5454    end = s + length;
5455    while (p < end) {
5456        register Py_UNICODE ch = *p;
5457        int decimal;
5458        PyObject *repunicode;
5459        Py_ssize_t repsize;
5460        Py_ssize_t newpos;
5461        Py_UNICODE *uni2;
5462        Py_UNICODE *collstart;
5463        Py_UNICODE *collend;
5464
5465        if (Py_UNICODE_ISSPACE(ch)) {
5466            *output++ = ' ';
5467            ++p;
5468            continue;
5469        }
5470        decimal = Py_UNICODE_TODECIMAL(ch);
5471        if (decimal >= 0) {
5472            *output++ = '0' + decimal;
5473            ++p;
5474            continue;
5475        }
5476        if (0 < ch && ch < 256) {
5477            *output++ = (char)ch;
5478            ++p;
5479            continue;
5480        }
5481        /* All other characters are considered unencodable */
5482        collstart = p;
5483        collend = p+1;
5484        while (collend < end) {
5485            if ((0 < *collend && *collend < 256) ||
5486                !Py_UNICODE_ISSPACE(*collend) ||
5487                Py_UNICODE_TODECIMAL(*collend))
5488                break;
5489        }
5490        /* cache callback name lookup
5491         * (if not done yet, i.e. it's the first error) */
5492        if (known_errorHandler==-1) {
5493            if ((errors==NULL) || (!strcmp(errors, "strict")))
5494                known_errorHandler = 1;
5495            else if (!strcmp(errors, "replace"))
5496                known_errorHandler = 2;
5497            else if (!strcmp(errors, "ignore"))
5498                known_errorHandler = 3;
5499            else if (!strcmp(errors, "xmlcharrefreplace"))
5500                known_errorHandler = 4;
5501            else
5502                known_errorHandler = 0;
5503        }
5504        switch (known_errorHandler) {
5505        case 1: /* strict */
5506            raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5507            goto onError;
5508        case 2: /* replace */
5509            for (p = collstart; p < collend; ++p)
5510                *output++ = '?';
5511            /* fall through */
5512        case 3: /* ignore */
5513            p = collend;
5514            break;
5515        case 4: /* xmlcharrefreplace */
5516            /* generate replacement (temporarily (mis)uses p) */
5517            for (p = collstart; p < collend; ++p)
5518                output += sprintf(output, "&#%d;", (int)*p);
5519            p = collend;
5520            break;
5521        default:
5522            repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5523                                                          encoding, reason, s, length, &exc,
5524                                                          collstart-s, collend-s, &newpos);
5525            if (repunicode == NULL)
5526                goto onError;
5527            /* generate replacement  */
5528            repsize = PyUnicode_GET_SIZE(repunicode);
5529            for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5530                Py_UNICODE ch = *uni2;
5531                if (Py_UNICODE_ISSPACE(ch))
5532                    *output++ = ' ';
5533                else {
5534                    decimal = Py_UNICODE_TODECIMAL(ch);
5535                    if (decimal >= 0)
5536                        *output++ = '0' + decimal;
5537                    else if (0 < ch && ch < 256)
5538                        *output++ = (char)ch;
5539                    else {
5540                        Py_DECREF(repunicode);
5541                        raise_encode_exception(&exc, encoding,
5542                                               s, length, collstart-s, collend-s, reason);
5543                        goto onError;
5544                    }
5545                }
5546            }
5547            p = s + newpos;
5548            Py_DECREF(repunicode);
5549        }
5550    }
5551    /* 0-terminate the output string */
5552    *output++ = '\0';
5553    Py_XDECREF(exc);
5554    Py_XDECREF(errorHandler);
5555    return 0;
5556
5557  onError:
5558    Py_XDECREF(exc);
5559    Py_XDECREF(errorHandler);
5560    return -1;
5561}
5562
5563/* --- Helpers ------------------------------------------------------------ */
5564
5565#include "stringlib/unicodedefs.h"
5566#include "stringlib/fastsearch.h"
5567#include "stringlib/count.h"
5568/* Include _ParseTupleFinds from find.h */
5569#define FROM_UNICODE
5570#include "stringlib/find.h"
5571#include "stringlib/partition.h"
5572
5573#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
5574#include "stringlib/localeutil.h"
5575
/* Normalize the enclosing scope's slice bounds `start` and `end` against
   (obj)->length.  A negative bound has the length added to it and is
   clamped to 0 if it is still negative; `end` is capped at the length
   before that adjustment.  `start` is not capped at the length.  Expands
   to plain statements that read and assign `start` and `end`. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
5588
5589Py_ssize_t PyUnicode_Count(PyObject *str,
5590                           PyObject *substr,
5591                           Py_ssize_t start,
5592                           Py_ssize_t end)
5593{
5594    Py_ssize_t result;
5595    PyUnicodeObject* str_obj;
5596    PyUnicodeObject* sub_obj;
5597
5598    str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5599    if (!str_obj)
5600        return -1;
5601    sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5602    if (!sub_obj) {
5603        Py_DECREF(str_obj);
5604        return -1;
5605    }
5606
5607    FIX_START_END(str_obj);
5608
5609    result = stringlib_count(
5610        str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5611        );
5612
5613    Py_DECREF(sub_obj);
5614    Py_DECREF(str_obj);
5615
5616    return result;
5617}
5618
5619Py_ssize_t PyUnicode_Find(PyObject *str,
5620                          PyObject *sub,
5621                          Py_ssize_t start,
5622                          Py_ssize_t end,
5623                          int direction)
5624{
5625    Py_ssize_t result;
5626
5627    str = PyUnicode_FromObject(str);
5628    if (!str)
5629        return -2;
5630    sub = PyUnicode_FromObject(sub);
5631    if (!sub) {
5632        Py_DECREF(str);
5633        return -2;
5634    }
5635
5636    if (direction > 0)
5637        result = stringlib_find_slice(
5638            PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5639            PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5640            start, end
5641            );
5642    else
5643        result = stringlib_rfind_slice(
5644            PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5645            PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5646            start, end
5647            );
5648
5649    Py_DECREF(str);
5650    Py_DECREF(sub);
5651
5652    return result;
5653}
5654
5655static
5656int tailmatch(PyUnicodeObject *self,
5657              PyUnicodeObject *substring,
5658              Py_ssize_t start,
5659              Py_ssize_t end,
5660              int direction)
5661{
5662    if (substring->length == 0)
5663        return 1;
5664
5665    FIX_START_END(self);
5666
5667    end -= substring->length;
5668    if (end < start)
5669        return 0;
5670
5671    if (direction > 0) {
5672        if (Py_UNICODE_MATCH(self, end, substring))
5673            return 1;
5674    } else {
5675        if (Py_UNICODE_MATCH(self, start, substring))
5676            return 1;
5677    }
5678
5679    return 0;
5680}
5681
5682Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
5683                               PyObject *substr,
5684                               Py_ssize_t start,
5685                               Py_ssize_t end,
5686                               int direction)
5687{
5688    Py_ssize_t result;
5689
5690    str = PyUnicode_FromObject(str);
5691    if (str == NULL)
5692        return -1;
5693    substr = PyUnicode_FromObject(substr);
5694    if (substr == NULL) {
5695        Py_DECREF(str);
5696        return -1;
5697    }
5698
5699    result = tailmatch((PyUnicodeObject *)str,
5700                       (PyUnicodeObject *)substr,
5701                       start, end, direction);
5702    Py_DECREF(str);
5703    Py_DECREF(substr);
5704    return result;
5705}
5706
5707/* Apply fixfct filter to the Unicode object self and return a
5708   reference to the modified object */
5709
5710static
5711PyObject *fixup(PyUnicodeObject *self,
5712                int (*fixfct)(PyUnicodeObject *s))
5713{
5714
5715    PyUnicodeObject *u;
5716
5717    u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5718    if (u == NULL)
5719        return NULL;
5720
5721    Py_UNICODE_COPY(u->str, self->str, self->length);
5722
5723    if (!fixfct(u) && PyUnicode_CheckExact(self)) {
5724        /* fixfct should return TRUE if it modified the buffer. If
5725           FALSE, return a reference to the original buffer instead
5726           (to save space, not time) */
5727        Py_INCREF(self);
5728        Py_DECREF(u);
5729        return (PyObject*) self;
5730    }
5731    return (PyObject*) u;
5732}
5733
5734static
5735int fixupper(PyUnicodeObject *self)
5736{
5737    Py_ssize_t len = self->length;
5738    Py_UNICODE *s = self->str;
5739    int status = 0;
5740
5741    while (len-- > 0) {
5742        register Py_UNICODE ch;
5743
5744        ch = Py_UNICODE_TOUPPER(*s);
5745        if (ch != *s) {
5746            status = 1;
5747            *s = ch;
5748        }
5749        s++;
5750    }
5751
5752    return status;
5753}
5754
5755static
5756int fixlower(PyUnicodeObject *self)
5757{
5758    Py_ssize_t len = self->length;
5759    Py_UNICODE *s = self->str;
5760    int status = 0;
5761
5762    while (len-- > 0) {
5763        register Py_UNICODE ch;
5764
5765        ch = Py_UNICODE_TOLOWER(*s);
5766        if (ch != *s) {
5767            status = 1;
5768            *s = ch;
5769        }
5770        s++;
5771    }
5772
5773    return status;
5774}
5775
5776static
5777int fixswapcase(PyUnicodeObject *self)
5778{
5779    Py_ssize_t len = self->length;
5780    Py_UNICODE *s = self->str;
5781    int status = 0;
5782
5783    while (len-- > 0) {
5784        if (Py_UNICODE_ISUPPER(*s)) {
5785            *s = Py_UNICODE_TOLOWER(*s);
5786            status = 1;
5787        } else if (Py_UNICODE_ISLOWER(*s)) {
5788            *s = Py_UNICODE_TOUPPER(*s);
5789            status = 1;
5790        }
5791        s++;
5792    }
5793
5794    return status;
5795}
5796
5797static
5798int fixcapitalize(PyUnicodeObject *self)
5799{
5800    Py_ssize_t len = self->length;
5801    Py_UNICODE *s = self->str;
5802    int status = 0;
5803
5804    if (len == 0)
5805        return 0;
5806    if (Py_UNICODE_ISLOWER(*s)) {
5807        *s = Py_UNICODE_TOUPPER(*s);
5808        status = 1;
5809    }
5810    s++;
5811    while (--len > 0) {
5812        if (Py_UNICODE_ISUPPER(*s)) {
5813            *s = Py_UNICODE_TOLOWER(*s);
5814            status = 1;
5815        }
5816        s++;
5817    }
5818    return status;
5819}
5820
5821static
5822int fixtitle(PyUnicodeObject *self)
5823{
5824    register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5825    register Py_UNICODE *e;
5826    int previous_is_cased;
5827
5828    /* Shortcut for single character strings */
5829    if (PyUnicode_GET_SIZE(self) == 1) {
5830        Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5831        if (*p != ch) {
5832            *p = ch;
5833            return 1;
5834        }
5835        else
5836            return 0;
5837    }
5838
5839    e = p + PyUnicode_GET_SIZE(self);
5840    previous_is_cased = 0;
5841    for (; p < e; p++) {
5842        register const Py_UNICODE ch = *p;
5843
5844        if (previous_is_cased)
5845            *p = Py_UNICODE_TOLOWER(ch);
5846        else
5847            *p = Py_UNICODE_TOTITLE(ch);
5848
5849        if (Py_UNICODE_ISLOWER(ch) ||
5850            Py_UNICODE_ISUPPER(ch) ||
5851            Py_UNICODE_ISTITLE(ch))
5852            previous_is_cased = 1;
5853        else
5854            previous_is_cased = 0;
5855    }
5856    return 1;
5857}
5858
5859PyObject *
5860PyUnicode_Join(PyObject *separator, PyObject *seq)
5861{
5862    const Py_UNICODE blank = ' ';
5863    const Py_UNICODE *sep = &blank;
5864    Py_ssize_t seplen = 1;
5865    PyUnicodeObject *res = NULL; /* the result */
5866    Py_UNICODE *res_p;       /* pointer to free byte in res's string area */
5867    PyObject *fseq;          /* PySequence_Fast(seq) */
5868    Py_ssize_t seqlen;       /* len(fseq) -- number of items in sequence */
5869    PyObject **items;
5870    PyObject *item;
5871    Py_ssize_t sz, i;
5872
5873    fseq = PySequence_Fast(seq, "");
5874    if (fseq == NULL) {
5875        return NULL;
5876    }
5877
5878    /* NOTE: the following code can't call back into Python code,
5879     * so we are sure that fseq won't be mutated.
5880     */
5881
5882    seqlen = PySequence_Fast_GET_SIZE(fseq);
5883    /* If empty sequence, return u"". */
5884    if (seqlen == 0) {
5885        res = _PyUnicode_New(0);  /* empty sequence; return u"" */
5886        goto Done;
5887    }
5888    items = PySequence_Fast_ITEMS(fseq);
5889    /* If singleton sequence with an exact Unicode, return that. */
5890    if (seqlen == 1) {
5891        item = items[0];
5892        if (PyUnicode_CheckExact(item)) {
5893            Py_INCREF(item);
5894            res = (PyUnicodeObject *)item;
5895            goto Done;
5896        }
5897    }
5898    else {
5899        /* Set up sep and seplen */
5900        if (separator == NULL) {
5901            sep = &blank;
5902            seplen = 1;
5903        }
5904        else {
5905            if (!PyUnicode_Check(separator)) {
5906                PyErr_Format(PyExc_TypeError,
5907                             "separator: expected str instance,"
5908                             " %.80s found",
5909                             Py_TYPE(separator)->tp_name);
5910                goto onError;
5911            }
5912            sep = PyUnicode_AS_UNICODE(separator);
5913            seplen = PyUnicode_GET_SIZE(separator);
5914        }
5915    }
5916
5917    /* There are at least two things to join, or else we have a subclass
5918     * of str in the sequence.
5919     * Do a pre-pass to figure out the total amount of space we'll
5920     * need (sz), and see whether all argument are strings.
5921     */
5922    sz = 0;
5923    for (i = 0; i < seqlen; i++) {
5924        const Py_ssize_t old_sz = sz;
5925        item = items[i];
5926        if (!PyUnicode_Check(item)) {
5927            PyErr_Format(PyExc_TypeError,
5928                         "sequence item %zd: expected str instance,"
5929                         " %.80s found",
5930                         i, Py_TYPE(item)->tp_name);
5931            goto onError;
5932        }
5933        sz += PyUnicode_GET_SIZE(item);
5934        if (i != 0)
5935            sz += seplen;
5936        if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
5937            PyErr_SetString(PyExc_OverflowError,
5938                            "join() result is too long for a Python string");
5939            goto onError;
5940        }
5941    }
5942
5943    res = _PyUnicode_New(sz);
5944    if (res == NULL)
5945        goto onError;
5946
5947    /* Catenate everything. */
5948    res_p = PyUnicode_AS_UNICODE(res);
5949    for (i = 0; i < seqlen; ++i) {
5950        Py_ssize_t itemlen;
5951        item = items[i];
5952        itemlen = PyUnicode_GET_SIZE(item);
5953        /* Copy item, and maybe the separator. */
5954        if (i) {
5955            Py_UNICODE_COPY(res_p, sep, seplen);
5956            res_p += seplen;
5957        }
5958        Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5959        res_p += itemlen;
5960    }
5961
5962  Done:
5963    Py_DECREF(fseq);
5964    return (PyObject *)res;
5965
5966  onError:
5967    Py_DECREF(fseq);
5968    Py_XDECREF(res);
5969    return NULL;
5970}
5971
5972static
5973PyUnicodeObject *pad(PyUnicodeObject *self,
5974                     Py_ssize_t left,
5975                     Py_ssize_t right,
5976                     Py_UNICODE fill)
5977{
5978    PyUnicodeObject *u;
5979
5980    if (left < 0)
5981        left = 0;
5982    if (right < 0)
5983        right = 0;
5984
5985    if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
5986        Py_INCREF(self);
5987        return self;
5988    }
5989
5990    if (left > PY_SSIZE_T_MAX - self->length ||
5991        right > PY_SSIZE_T_MAX - (left + self->length)) {
5992        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5993        return NULL;
5994    }
5995    u = _PyUnicode_New(left + self->length + right);
5996    if (u) {
5997        if (left)
5998            Py_UNICODE_FILL(u->str, fill, left);
5999        Py_UNICODE_COPY(u->str + left, self->str, self->length);
6000        if (right)
6001            Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6002    }
6003
6004    return u;
6005}
6006
/* Append the slice data[left:right] to `list` as a new unicode object.
   Relies on the expanding function to provide a `PyObject *str` scratch
   variable, the `list` being built and an `onError` label, which is
   jumped to if either the slice creation or the append fails.  The
   temporary reference held in `str` is released in every case. */
#define SPLIT_APPEND(data, left, right)                                 \
    str = PyUnicode_FromUnicode((data) + (left), (right) - (left));     \
    if (str == NULL)                                                    \
        goto onError;                                                   \
    if (PyList_Append(list, str) != 0) {                                \
        Py_DECREF(str);                                                 \
        goto onError;                                                   \
    }                                                                   \
    Py_DECREF(str);
6017
6018static
6019PyObject *split_whitespace(PyUnicodeObject *self,
6020                           PyObject *list,
6021                           Py_ssize_t maxcount)
6022{
6023    register Py_ssize_t i;
6024    register Py_ssize_t j;
6025    Py_ssize_t len = self->length;
6026    PyObject *str;
6027    register const Py_UNICODE *buf = self->str;
6028
6029    for (i = j = 0; i < len; ) {
6030        /* find a token */
6031        while (i < len && Py_UNICODE_ISSPACE(buf[i]))
6032            i++;
6033        j = i;
6034        while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
6035            i++;
6036        if (j < i) {
6037            if (maxcount-- <= 0)
6038                break;
6039            SPLIT_APPEND(buf, j, i);
6040            while (i < len && Py_UNICODE_ISSPACE(buf[i]))
6041                i++;
6042            j = i;
6043        }
6044    }
6045    if (j < len) {
6046        SPLIT_APPEND(buf, j, len);
6047    }
6048    return list;
6049
6050  onError:
6051    Py_DECREF(list);
6052    return NULL;
6053}
6054
6055PyObject *PyUnicode_Splitlines(PyObject *string,
6056                               int keepends)
6057{
6058    register Py_ssize_t i;
6059    register Py_ssize_t j;
6060    Py_ssize_t len;
6061    PyObject *list;
6062    PyObject *str;
6063    Py_UNICODE *data;
6064
6065    string = PyUnicode_FromObject(string);
6066    if (string == NULL)
6067        return NULL;
6068    data = PyUnicode_AS_UNICODE(string);
6069    len = PyUnicode_GET_SIZE(string);
6070
6071    list = PyList_New(0);
6072    if (!list)
6073        goto onError;
6074
6075    for (i = j = 0; i < len; ) {
6076        Py_ssize_t eol;
6077
6078        /* Find a line and append it */
6079        while (i < len && !BLOOM_LINEBREAK(data[i]))
6080            i++;
6081
6082        /* Skip the line break reading CRLF as one line break */
6083        eol = i;
6084        if (i < len) {
6085            if (data[i] == '\r' && i + 1 < len &&
6086                data[i+1] == '\n')
6087                i += 2;
6088            else
6089                i++;
6090            if (keepends)
6091                eol = i;
6092        }
6093        SPLIT_APPEND(data, j, eol);
6094        j = i;
6095    }
6096    if (j < len) {
6097        SPLIT_APPEND(data, j, len);
6098    }
6099
6100    Py_DECREF(string);
6101    return list;
6102
6103  onError:
6104    Py_XDECREF(list);
6105    Py_DECREF(string);
6106    return NULL;
6107}
6108
6109static
6110PyObject *split_char(PyUnicodeObject *self,
6111                     PyObject *list,
6112                     Py_UNICODE ch,
6113                     Py_ssize_t maxcount)
6114{
6115    register Py_ssize_t i;
6116    register Py_ssize_t j;
6117    Py_ssize_t len = self->length;
6118    PyObject *str;
6119    register const Py_UNICODE *buf = self->str;
6120
6121    for (i = j = 0; i < len; ) {
6122        if (buf[i] == ch) {
6123            if (maxcount-- <= 0)
6124                break;
6125            SPLIT_APPEND(buf, j, i);
6126            i = j = i + 1;
6127        } else
6128            i++;
6129    }
6130    if (j <= len) {
6131        SPLIT_APPEND(buf, j, len);
6132    }
6133    return list;
6134
6135  onError:
6136    Py_DECREF(list);
6137    return NULL;
6138}
6139
6140static
6141PyObject *split_substring(PyUnicodeObject *self,
6142                          PyObject *list,
6143                          PyUnicodeObject *substring,
6144                          Py_ssize_t maxcount)
6145{
6146    register Py_ssize_t i;
6147    register Py_ssize_t j;
6148    Py_ssize_t len = self->length;
6149    Py_ssize_t sublen = substring->length;
6150    PyObject *str;
6151
6152    for (i = j = 0; i <= len - sublen; ) {
6153        if (Py_UNICODE_MATCH(self, i, substring)) {
6154            if (maxcount-- <= 0)
6155                break;
6156            SPLIT_APPEND(self->str, j, i);
6157            i = j = i + sublen;
6158        } else
6159            i++;
6160    }
6161    if (j <= len) {
6162        SPLIT_APPEND(self->str, j, len);
6163    }
6164    return list;
6165
6166  onError:
6167    Py_DECREF(list);
6168    return NULL;
6169}
6170
6171static
6172PyObject *rsplit_whitespace(PyUnicodeObject *self,
6173                            PyObject *list,
6174                            Py_ssize_t maxcount)
6175{
6176    register Py_ssize_t i;
6177    register Py_ssize_t j;
6178    Py_ssize_t len = self->length;
6179    PyObject *str;
6180    register const Py_UNICODE *buf = self->str;
6181
6182    for (i = j = len - 1; i >= 0; ) {
6183        /* find a token */
6184        while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
6185            i--;
6186        j = i;
6187        while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
6188            i--;
6189        if (j > i) {
6190            if (maxcount-- <= 0)
6191                break;
6192            SPLIT_APPEND(buf, i + 1, j + 1);
6193            while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
6194                i--;
6195            j = i;
6196        }
6197    }
6198    if (j >= 0) {
6199        SPLIT_APPEND(buf, 0, j + 1);
6200    }
6201    if (PyList_Reverse(list) < 0)
6202        goto onError;
6203    return list;
6204
6205  onError:
6206    Py_DECREF(list);
6207    return NULL;
6208}
6209
6210static
6211PyObject *rsplit_char(PyUnicodeObject *self,
6212                      PyObject *list,
6213                      Py_UNICODE ch,
6214                      Py_ssize_t maxcount)
6215{
6216    register Py_ssize_t i;
6217    register Py_ssize_t j;
6218    Py_ssize_t len = self->length;
6219    PyObject *str;
6220    register const Py_UNICODE *buf = self->str;
6221
6222    for (i = j = len - 1; i >= 0; ) {
6223        if (buf[i] == ch) {
6224            if (maxcount-- <= 0)
6225                break;
6226            SPLIT_APPEND(buf, i + 1, j + 1);
6227            j = i = i - 1;
6228        } else
6229            i--;
6230    }
6231    if (j >= -1) {
6232        SPLIT_APPEND(buf, 0, j + 1);
6233    }
6234    if (PyList_Reverse(list) < 0)
6235        goto onError;
6236    return list;
6237
6238  onError:
6239    Py_DECREF(list);
6240    return NULL;
6241}
6242
6243static
6244PyObject *rsplit_substring(PyUnicodeObject *self,
6245                           PyObject *list,
6246                           PyUnicodeObject *substring,
6247                           Py_ssize_t maxcount)
6248{
6249    register Py_ssize_t i;
6250    register Py_ssize_t j;
6251    Py_ssize_t len = self->length;
6252    Py_ssize_t sublen = substring->length;
6253    PyObject *str;
6254
6255    for (i = len - sublen, j = len; i >= 0; ) {
6256        if (Py_UNICODE_MATCH(self, i, substring)) {
6257            if (maxcount-- <= 0)
6258                break;
6259            SPLIT_APPEND(self->str, i + sublen, j);
6260            j = i;
6261            i -= sublen;
6262        } else
6263            i--;
6264    }
6265    if (j >= 0) {
6266        SPLIT_APPEND(self->str, 0, j);
6267    }
6268    if (PyList_Reverse(list) < 0)
6269        goto onError;
6270    return list;
6271
6272  onError:
6273    Py_DECREF(list);
6274    return NULL;
6275}
6276
6277#undef SPLIT_APPEND
6278
6279static
6280PyObject *split(PyUnicodeObject *self,
6281                PyUnicodeObject *substring,
6282                Py_ssize_t maxcount)
6283{
6284    PyObject *list;
6285
6286    if (maxcount < 0)
6287        maxcount = PY_SSIZE_T_MAX;
6288
6289    list = PyList_New(0);
6290    if (!list)
6291        return NULL;
6292
6293    if (substring == NULL)
6294        return split_whitespace(self,list,maxcount);
6295
6296    else if (substring->length == 1)
6297        return split_char(self,list,substring->str[0],maxcount);
6298
6299    else if (substring->length == 0) {
6300        Py_DECREF(list);
6301        PyErr_SetString(PyExc_ValueError, "empty separator");
6302        return NULL;
6303    }
6304    else
6305        return split_substring(self,list,substring,maxcount);
6306}
6307
6308static
6309PyObject *rsplit(PyUnicodeObject *self,
6310                 PyUnicodeObject *substring,
6311                 Py_ssize_t maxcount)
6312{
6313    PyObject *list;
6314
6315    if (maxcount < 0)
6316        maxcount = PY_SSIZE_T_MAX;
6317
6318    list = PyList_New(0);
6319    if (!list)
6320        return NULL;
6321
6322    if (substring == NULL)
6323        return rsplit_whitespace(self,list,maxcount);
6324
6325    else if (substring->length == 1)
6326        return rsplit_char(self,list,substring->str[0],maxcount);
6327
6328    else if (substring->length == 0) {
6329        Py_DECREF(list);
6330        PyErr_SetString(PyExc_ValueError, "empty separator");
6331        return NULL;
6332    }
6333    else
6334        return rsplit_substring(self,list,substring,maxcount);
6335}
6336
6337static
6338PyObject *replace(PyUnicodeObject *self,
6339                  PyUnicodeObject *str1,
6340                  PyUnicodeObject *str2,
6341                  Py_ssize_t maxcount)
6342{
6343    PyUnicodeObject *u;
6344
6345    if (maxcount < 0)
6346        maxcount = PY_SSIZE_T_MAX;
6347
6348    if (str1->length == str2->length) {
6349        /* same length */
6350        Py_ssize_t i;
6351        if (str1->length == 1) {
6352            /* replace characters */
6353            Py_UNICODE u1, u2;
6354            if (!findchar(self->str, self->length, str1->str[0]))
6355                goto nothing;
6356            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6357            if (!u)
6358                return NULL;
6359            Py_UNICODE_COPY(u->str, self->str, self->length);
6360            u1 = str1->str[0];
6361            u2 = str2->str[0];
6362            for (i = 0; i < u->length; i++)
6363                if (u->str[i] == u1) {
6364                    if (--maxcount < 0)
6365                        break;
6366                    u->str[i] = u2;
6367                }
6368        } else {
6369            i = fastsearch(
6370                self->str, self->length, str1->str, str1->length, FAST_SEARCH
6371                );
6372            if (i < 0)
6373                goto nothing;
6374            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6375            if (!u)
6376                return NULL;
6377            Py_UNICODE_COPY(u->str, self->str, self->length);
6378            while (i <= self->length - str1->length)
6379                if (Py_UNICODE_MATCH(self, i, str1)) {
6380                    if (--maxcount < 0)
6381                        break;
6382                    Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6383                    i += str1->length;
6384                } else
6385                    i++;
6386        }
6387    } else {
6388
6389        Py_ssize_t n, i, j, e;
6390        Py_ssize_t product, new_size, delta;
6391        Py_UNICODE *p;
6392
6393        /* replace strings */
6394        n = stringlib_count(self->str, self->length, str1->str, str1->length);
6395        if (n > maxcount)
6396            n = maxcount;
6397        if (n == 0)
6398            goto nothing;
6399        /* new_size = self->length + n * (str2->length - str1->length)); */
6400        delta = (str2->length - str1->length);
6401        if (delta == 0) {
6402            new_size = self->length;
6403        } else {
6404            product = n * (str2->length - str1->length);
6405            if ((product / (str2->length - str1->length)) != n) {
6406                PyErr_SetString(PyExc_OverflowError,
6407                                "replace string is too long");
6408                return NULL;
6409            }
6410            new_size = self->length + product;
6411            if (new_size < 0) {
6412                PyErr_SetString(PyExc_OverflowError,
6413                                "replace string is too long");
6414                return NULL;
6415            }
6416        }
6417        u = _PyUnicode_New(new_size);
6418        if (!u)
6419            return NULL;
6420        i = 0;
6421        p = u->str;
6422        e = self->length - str1->length;
6423        if (str1->length > 0) {
6424            while (n-- > 0) {
6425                /* look for next match */
6426                j = i;
6427                while (j <= e) {
6428                    if (Py_UNICODE_MATCH(self, j, str1))
6429                        break;
6430                    j++;
6431                }
6432                if (j > i) {
6433                    if (j > e)
6434                        break;
6435                    /* copy unchanged part [i:j] */
6436                    Py_UNICODE_COPY(p, self->str+i, j-i);
6437                    p += j - i;
6438                }
6439                /* copy substitution string */
6440                if (str2->length > 0) {
6441                    Py_UNICODE_COPY(p, str2->str, str2->length);
6442                    p += str2->length;
6443                }
6444                i = j + str1->length;
6445            }
6446            if (i < self->length)
6447                /* copy tail [i:] */
6448                Py_UNICODE_COPY(p, self->str+i, self->length-i);
6449        } else {
6450            /* interleave */
6451            while (n > 0) {
6452                Py_UNICODE_COPY(p, str2->str, str2->length);
6453                p += str2->length;
6454                if (--n <= 0)
6455                    break;
6456                *p++ = self->str[i++];
6457            }
6458            Py_UNICODE_COPY(p, self->str+i, self->length-i);
6459        }
6460    }
6461    return (PyObject *) u;
6462
6463  nothing:
6464    /* nothing to replace; return original string (when possible) */
6465    if (PyUnicode_CheckExact(self)) {
6466        Py_INCREF(self);
6467        return (PyObject *) self;
6468    }
6469    return PyUnicode_FromUnicode(self->str, self->length);
6470}
6471
6472/* --- Unicode Object Methods --------------------------------------------- */
6473
6474PyDoc_STRVAR(title__doc__,
6475             "S.title() -> str\n\
6476\n\
6477Return a titlecased version of S, i.e. words start with title case\n\
6478characters, all remaining cased characters have lower case.");
6479
6480static PyObject*
6481unicode_title(PyUnicodeObject *self)
6482{
6483    return fixup(self, fixtitle);
6484}
6485
6486PyDoc_STRVAR(capitalize__doc__,
6487             "S.capitalize() -> str\n\
6488\n\
6489Return a capitalized version of S, i.e. make the first character\n\
6490have upper case.");
6491
6492static PyObject*
6493unicode_capitalize(PyUnicodeObject *self)
6494{
6495    return fixup(self, fixcapitalize);
6496}
6497
6498#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the enclosing #if 0) implementation of
   S.capwords(): split self into a list, replace every list item with the
   result of fixup(item, fixcapitalize), then join the list again.

   Returns the joined object, or NULL if splitting, capitalizing or joining
   failed.  The temporary list is released on every path that created it. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *list;
    PyObject *item;
    Py_ssize_t i;

    /* Split into words */
    list = split(self, NULL, -1);
    if (!list)
        return NULL;

    /* Capitalize each word */
    for (i = 0; i < PyList_GET_SIZE(list); i++) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
                     fixcapitalize);
        /* item is NULL here, so onError returns NULL after dropping list */
        if (item == NULL)
            goto onError;
        /* drop the old entry before storing the replacement in its slot */
        Py_DECREF(PyList_GET_ITEM(list, i));
        PyList_SET_ITEM(list, i, item);
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, list);

  onError:
    /* shared exit: item holds either the joined result or NULL */
    Py_DECREF(list);
    return (PyObject *)item;
}
6534#endif
6535
6536/* Argument converter.  Coerces to a single unicode character */
6537
6538static int
6539convert_uc(PyObject *obj, void *addr)
6540{
6541    Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6542    PyObject *uniobj;
6543    Py_UNICODE *unistr;
6544
6545    uniobj = PyUnicode_FromObject(obj);
6546    if (uniobj == NULL) {
6547        PyErr_SetString(PyExc_TypeError,
6548                        "The fill character cannot be converted to Unicode");
6549        return 0;
6550    }
6551    if (PyUnicode_GET_SIZE(uniobj) != 1) {
6552        PyErr_SetString(PyExc_TypeError,
6553                        "The fill character must be exactly one character long");
6554        Py_DECREF(uniobj);
6555        return 0;
6556    }
6557    unistr = PyUnicode_AS_UNICODE(uniobj);
6558    *fillcharloc = unistr[0];
6559    Py_DECREF(uniobj);
6560    return 1;
6561}
6562
6563PyDoc_STRVAR(center__doc__,
6564             "S.center(width[, fillchar]) -> str\n\
6565\n\
6566Return S centered in a string of length width. Padding is\n\
6567done using the specified fill character (default is a space)");
6568
6569static PyObject *
6570unicode_center(PyUnicodeObject *self, PyObject *args)
6571{
6572    Py_ssize_t marg, left;
6573    Py_ssize_t width;
6574    Py_UNICODE fillchar = ' ';
6575
6576    if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
6577        return NULL;
6578
6579    if (self->length >= width && PyUnicode_CheckExact(self)) {
6580        Py_INCREF(self);
6581        return (PyObject*) self;
6582    }
6583
6584    marg = width - self->length;
6585    left = marg / 2 + (marg & width & 1);
6586
6587    return (PyObject*) pad(self, left, marg - left, fillchar);
6588}
6589
6590#if 0
6591
6592/* This code should go into some future Unicode collation support
6593   module. The basic comparison should compare ordinals on a naive
6594   basis (this is what Java does and thus JPython too). */
6595
6596/* speedy UTF-16 code point order comparison */
6597/* gleaned from: */
6598/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6599
/* Adjustment table for the disabled UTF-16 aware unicode_compare() below,
   indexed by (code unit >> 11).  Only the last five slots are non-zero:
   slot 27 (units 0xD800-0xDFFF) adds 0x2000 and slots 28-31 (units
   0xE000-0xFFFF) subtract 0x800. */
static short utf16Fixup[32] =
{
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
};
6607
/* Disabled (#if 0) three-way comparison of two Unicode objects that
   remaps code units through utf16Fixup before comparing them.

   Returns -1, 0 or 1.  When one string is a prefix of the other, the
   shorter one sorts first. */
static int
unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
{
    Py_ssize_t len1, len2;

    Py_UNICODE *s1 = str1->str;
    Py_UNICODE *s2 = str2->str;

    len1 = str1->length;
    len2 = str2->length;

    while (len1 > 0 && len2 > 0) {
        Py_UNICODE c1, c2;

        c1 = *s1++;
        c2 = *s2++;

        /* only units above (1<<11)*26 == 0xD000 can hit a non-zero table
           entry.  NOTE(review): the table has 32 slots, so this presumably
           assumes 16-bit code units -- confirm before enabling. */
        if (c1 > (1<<11) * 26)
            c1 += utf16Fixup[c1>>11];
        if (c2 > (1<<11) * 26)
            c2 += utf16Fixup[c2>>11];
        /* now c1 and c2 are in UTF-32-compatible order */

        if (c1 != c2)
            return (c1 < c2) ? -1 : 1;

        len1--; len2--;
    }

    /* common prefix is equal: decide on the remaining lengths */
    return (len1 < len2) ? -1 : (len1 != len2);
}
6639
6640#else
6641
6642static int
6643unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6644{
6645    register Py_ssize_t len1, len2;
6646
6647    Py_UNICODE *s1 = str1->str;
6648    Py_UNICODE *s2 = str2->str;
6649
6650    len1 = str1->length;
6651    len2 = str2->length;
6652
6653    while (len1 > 0 && len2 > 0) {
6654        Py_UNICODE c1, c2;
6655
6656        c1 = *s1++;
6657        c2 = *s2++;
6658
6659        if (c1 != c2)
6660            return (c1 < c2) ? -1 : 1;
6661
6662        len1--; len2--;
6663    }
6664
6665    return (len1 < len2) ? -1 : (len1 != len2);
6666}
6667
6668#endif
6669
6670int PyUnicode_Compare(PyObject *left,
6671                      PyObject *right)
6672{
6673    if (PyUnicode_Check(left) && PyUnicode_Check(right))
6674        return unicode_compare((PyUnicodeObject *)left,
6675                               (PyUnicodeObject *)right);
6676    PyErr_Format(PyExc_TypeError,
6677                 "Can't compare %.100s and %.100s",
6678                 left->ob_type->tp_name,
6679                 right->ob_type->tp_name);
6680    return -1;
6681}
6682
6683int
6684PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6685{
6686    int i;
6687    Py_UNICODE *id;
6688    assert(PyUnicode_Check(uni));
6689    id = PyUnicode_AS_UNICODE(uni);
6690    /* Compare Unicode string and source character set string */
6691    for (i = 0; id[i] && str[i]; i++)
6692        if (id[i] != str[i])
6693            return ((int)id[i] < (int)str[i]) ? -1 : 1;
6694    if (id[i])
6695        return 1; /* uni is longer */
6696    if (str[i])
6697        return -1; /* str is longer */
6698    return 0;
6699}
6700
6701
6702#define TEST_COND(cond)                         \
6703    ((cond) ? Py_True : Py_False)
6704
6705PyObject *PyUnicode_RichCompare(PyObject *left,
6706                                PyObject *right,
6707                                int op)
6708{
6709    int result;
6710
6711    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6712        PyObject *v;
6713        if (((PyUnicodeObject *) left)->length !=
6714            ((PyUnicodeObject *) right)->length) {
6715            if (op == Py_EQ) {
6716                Py_INCREF(Py_False);
6717                return Py_False;
6718            }
6719            if (op == Py_NE) {
6720                Py_INCREF(Py_True);
6721                return Py_True;
6722            }
6723        }
6724        if (left == right)
6725            result = 0;
6726        else
6727            result = unicode_compare((PyUnicodeObject *)left,
6728                                     (PyUnicodeObject *)right);
6729
6730        /* Convert the return value to a Boolean */
6731        switch (op) {
6732        case Py_EQ:
6733            v = TEST_COND(result == 0);
6734            break;
6735        case Py_NE:
6736            v = TEST_COND(result != 0);
6737            break;
6738        case Py_LE:
6739            v = TEST_COND(result <= 0);
6740            break;
6741        case Py_GE:
6742            v = TEST_COND(result >= 0);
6743            break;
6744        case Py_LT:
6745            v = TEST_COND(result == -1);
6746            break;
6747        case Py_GT:
6748            v = TEST_COND(result == 1);
6749            break;
6750        default:
6751            PyErr_BadArgument();
6752            return NULL;
6753        }
6754        Py_INCREF(v);
6755        return v;
6756    }
6757
6758    Py_INCREF(Py_NotImplemented);
6759    return Py_NotImplemented;
6760}
6761
6762int PyUnicode_Contains(PyObject *container,
6763                       PyObject *element)
6764{
6765    PyObject *str, *sub;
6766    int result;
6767
6768    /* Coerce the two arguments */
6769    sub = PyUnicode_FromObject(element);
6770    if (!sub) {
6771        PyErr_Format(PyExc_TypeError,
6772                     "'in <string>' requires string as left operand, not %s",
6773                     element->ob_type->tp_name);
6774        return -1;
6775    }
6776
6777    str = PyUnicode_FromObject(container);
6778    if (!str) {
6779        Py_DECREF(sub);
6780        return -1;
6781    }
6782
6783    result = stringlib_contains_obj(str, sub);
6784
6785    Py_DECREF(str);
6786    Py_DECREF(sub);
6787
6788    return result;
6789}
6790
6791/* Concat to string or Unicode object giving a new Unicode object. */
6792
6793PyObject *PyUnicode_Concat(PyObject *left,
6794                           PyObject *right)
6795{
6796    PyUnicodeObject *u = NULL, *v = NULL, *w;
6797
6798    /* Coerce the two arguments */
6799    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6800    if (u == NULL)
6801        goto onError;
6802    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6803    if (v == NULL)
6804        goto onError;
6805
6806    /* Shortcuts */
6807    if (v == unicode_empty) {
6808        Py_DECREF(v);
6809        return (PyObject *)u;
6810    }
6811    if (u == unicode_empty) {
6812        Py_DECREF(u);
6813        return (PyObject *)v;
6814    }
6815
6816    /* Concat the two Unicode strings */
6817    w = _PyUnicode_New(u->length + v->length);
6818    if (w == NULL)
6819        goto onError;
6820    Py_UNICODE_COPY(w->str, u->str, u->length);
6821    Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6822
6823    Py_DECREF(u);
6824    Py_DECREF(v);
6825    return (PyObject *)w;
6826
6827  onError:
6828    Py_XDECREF(u);
6829    Py_XDECREF(v);
6830    return NULL;
6831}
6832
6833void
6834PyUnicode_Append(PyObject **pleft, PyObject *right)
6835{
6836    PyObject *new;
6837    if (*pleft == NULL)
6838        return;
6839    if (right == NULL || !PyUnicode_Check(*pleft)) {
6840        Py_DECREF(*pleft);
6841        *pleft = NULL;
6842        return;
6843    }
6844    new = PyUnicode_Concat(*pleft, right);
6845    Py_DECREF(*pleft);
6846    *pleft = new;
6847}
6848
6849void
6850PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6851{
6852    PyUnicode_Append(pleft, right);
6853    Py_XDECREF(right);
6854}
6855
6856PyDoc_STRVAR(count__doc__,
6857             "S.count(sub[, start[, end]]) -> int\n\
6858\n\
6859Return the number of non-overlapping occurrences of substring sub in\n\
6860string S[start:end].  Optional arguments start and end are\n\
6861interpreted as in slice notation.");
6862
6863static PyObject *
6864unicode_count(PyUnicodeObject *self, PyObject *args)
6865{
6866    PyUnicodeObject *substring;
6867    Py_ssize_t start = 0;
6868    Py_ssize_t end = PY_SSIZE_T_MAX;
6869    PyObject *result;
6870
6871    if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6872                          _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6873        return NULL;
6874
6875    substring = (PyUnicodeObject *)PyUnicode_FromObject(
6876        (PyObject *)substring);
6877    if (substring == NULL)
6878        return NULL;
6879
6880    FIX_START_END(self);
6881
6882    result = PyLong_FromSsize_t(
6883        stringlib_count(self->str + start, end - start,
6884                        substring->str, substring->length)
6885        );
6886
6887    Py_DECREF(substring);
6888
6889    return result;
6890}
6891
6892PyDoc_STRVAR(encode__doc__,
6893             "S.encode([encoding[, errors]]) -> bytes\n\
6894\n\
6895Encode S using the codec registered for encoding. encoding defaults\n\
6896to the default encoding. errors may be given to set a different error\n\
6897handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6898a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6899'xmlcharrefreplace' as well as any other name registered with\n\
6900codecs.register_error that can handle UnicodeEncodeErrors.");
6901
6902static PyObject *
6903unicode_encode(PyUnicodeObject *self, PyObject *args)
6904{
6905    char *encoding = NULL;
6906    char *errors = NULL;
6907    PyObject *v;
6908
6909    if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6910        return NULL;
6911    v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
6912    if (v == NULL)
6913        goto onError;
6914    if (!PyBytes_Check(v)) {
6915        PyErr_Format(PyExc_TypeError,
6916                     "encoder did not return a bytes object "
6917                     "(type=%.400s)",
6918                     Py_TYPE(v)->tp_name);
6919        Py_DECREF(v);
6920        return NULL;
6921    }
6922    return v;
6923
6924  onError:
6925    return NULL;
6926}
6927
6928PyDoc_STRVAR(expandtabs__doc__,
6929             "S.expandtabs([tabsize]) -> str\n\
6930\n\
6931Return a copy of S where all tab characters are expanded using spaces.\n\
6932If tabsize is not given, a tab size of 8 characters is assumed.");
6933
6934static PyObject*
6935unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6936{
6937    Py_UNICODE *e;
6938    Py_UNICODE *p;
6939    Py_UNICODE *q;
6940    Py_UNICODE *qe;
6941    Py_ssize_t i, j, incr;
6942    PyUnicodeObject *u;
6943    int tabsize = 8;
6944
6945    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6946        return NULL;
6947
6948    /* First pass: determine size of output string */
6949    i = 0; /* chars up to and including most recent \n or \r */
6950    j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6951    e = self->str + self->length; /* end of input */
6952    for (p = self->str; p < e; p++)
6953        if (*p == '\t') {
6954            if (tabsize > 0) {
6955                incr = tabsize - (j % tabsize); /* cannot overflow */
6956                if (j > PY_SSIZE_T_MAX - incr)
6957                    goto overflow1;
6958                j += incr;
6959            }
6960        }
6961        else {
6962            if (j > PY_SSIZE_T_MAX - 1)
6963                goto overflow1;
6964            j++;
6965            if (*p == '\n' || *p == '\r') {
6966                if (i > PY_SSIZE_T_MAX - j)
6967                    goto overflow1;
6968                i += j;
6969                j = 0;
6970            }
6971        }
6972
6973    if (i > PY_SSIZE_T_MAX - j)
6974        goto overflow1;
6975
6976    /* Second pass: create output string and fill it */
6977    u = _PyUnicode_New(i + j);
6978    if (!u)
6979        return NULL;
6980
6981    j = 0; /* same as in first pass */
6982    q = u->str; /* next output char */
6983    qe = u->str + u->length; /* end of output */
6984
6985    for (p = self->str; p < e; p++)
6986        if (*p == '\t') {
6987            if (tabsize > 0) {
6988                i = tabsize - (j % tabsize);
6989                j += i;
6990                while (i--) {
6991                    if (q >= qe)
6992                        goto overflow2;
6993                    *q++ = ' ';
6994                }
6995            }
6996        }
6997        else {
6998            if (q >= qe)
6999                goto overflow2;
7000            *q++ = *p;
7001            j++;
7002            if (*p == '\n' || *p == '\r')
7003                j = 0;
7004        }
7005
7006    return (PyObject*) u;
7007
7008  overflow2:
7009    Py_DECREF(u);
7010  overflow1:
7011    PyErr_SetString(PyExc_OverflowError, "new string is too long");
7012    return NULL;
7013}
7014
7015PyDoc_STRVAR(find__doc__,
7016             "S.find(sub[, start[, end]]) -> int\n\
7017\n\
7018Return the lowest index in S where substring sub is found,\n\
7019such that sub is contained within s[start:end].  Optional\n\
7020arguments start and end are interpreted as in slice notation.\n\
7021\n\
7022Return -1 on failure.");
7023
7024static PyObject *
7025unicode_find(PyUnicodeObject *self, PyObject *args)
7026{
7027    PyObject *substring;
7028    Py_ssize_t start;
7029    Py_ssize_t end;
7030    Py_ssize_t result;
7031
7032    if (!_ParseTupleFinds(args, &substring, &start, &end))
7033        return NULL;
7034
7035    result = stringlib_find_slice(
7036        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7037        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7038        start, end
7039        );
7040
7041    Py_DECREF(substring);
7042
7043    return PyLong_FromSsize_t(result);
7044}
7045
7046static PyObject *
7047unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
7048{
7049    if (index < 0 || index >= self->length) {
7050        PyErr_SetString(PyExc_IndexError, "string index out of range");
7051        return NULL;
7052    }
7053
7054    return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7055}
7056
7057/* Believe it or not, this produces the same value for ASCII strings
7058   as string_hash(). */
7059static long
7060unicode_hash(PyUnicodeObject *self)
7061{
7062    Py_ssize_t len;
7063    Py_UNICODE *p;
7064    long x;
7065
7066    if (self->hash != -1)
7067        return self->hash;
7068    len = Py_SIZE(self);
7069    p = self->str;
7070    x = *p << 7;
7071    while (--len >= 0)
7072        x = (1000003*x) ^ *p++;
7073    x ^= Py_SIZE(self);
7074    if (x == -1)
7075        x = -2;
7076    self->hash = x;
7077    return x;
7078}
7079
7080PyDoc_STRVAR(index__doc__,
7081             "S.index(sub[, start[, end]]) -> int\n\
7082\n\
7083Like S.find() but raise ValueError when the substring is not found.");
7084
7085static PyObject *
7086unicode_index(PyUnicodeObject *self, PyObject *args)
7087{
7088    Py_ssize_t result;
7089    PyObject *substring;
7090    Py_ssize_t start;
7091    Py_ssize_t end;
7092
7093    if (!_ParseTupleFinds(args, &substring, &start, &end))
7094        return NULL;
7095
7096    result = stringlib_find_slice(
7097        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7098        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7099        start, end
7100        );
7101
7102    Py_DECREF(substring);
7103
7104    if (result < 0) {
7105        PyErr_SetString(PyExc_ValueError, "substring not found");
7106        return NULL;
7107    }
7108
7109    return PyLong_FromSsize_t(result);
7110}
7111
7112PyDoc_STRVAR(islower__doc__,
7113             "S.islower() -> bool\n\
7114\n\
7115Return True if all cased characters in S are lowercase and there is\n\
7116at least one cased character in S, False otherwise.");
7117
7118static PyObject*
7119unicode_islower(PyUnicodeObject *self)
7120{
7121    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7122    register const Py_UNICODE *e;
7123    int cased;
7124
7125    /* Shortcut for single character strings */
7126    if (PyUnicode_GET_SIZE(self) == 1)
7127        return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
7128
7129    /* Special case for empty strings */
7130    if (PyUnicode_GET_SIZE(self) == 0)
7131        return PyBool_FromLong(0);
7132
7133    e = p + PyUnicode_GET_SIZE(self);
7134    cased = 0;
7135    for (; p < e; p++) {
7136        register const Py_UNICODE ch = *p;
7137
7138        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7139            return PyBool_FromLong(0);
7140        else if (!cased && Py_UNICODE_ISLOWER(ch))
7141            cased = 1;
7142    }
7143    return PyBool_FromLong(cased);
7144}
7145
7146PyDoc_STRVAR(isupper__doc__,
7147             "S.isupper() -> bool\n\
7148\n\
7149Return True if all cased characters in S are uppercase and there is\n\
7150at least one cased character in S, False otherwise.");
7151
7152static PyObject*
7153unicode_isupper(PyUnicodeObject *self)
7154{
7155    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7156    register const Py_UNICODE *e;
7157    int cased;
7158
7159    /* Shortcut for single character strings */
7160    if (PyUnicode_GET_SIZE(self) == 1)
7161        return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
7162
7163    /* Special case for empty strings */
7164    if (PyUnicode_GET_SIZE(self) == 0)
7165        return PyBool_FromLong(0);
7166
7167    e = p + PyUnicode_GET_SIZE(self);
7168    cased = 0;
7169    for (; p < e; p++) {
7170        register const Py_UNICODE ch = *p;
7171
7172        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7173            return PyBool_FromLong(0);
7174        else if (!cased && Py_UNICODE_ISUPPER(ch))
7175            cased = 1;
7176    }
7177    return PyBool_FromLong(cased);
7178}
7179
7180PyDoc_STRVAR(istitle__doc__,
7181             "S.istitle() -> bool\n\
7182\n\
7183Return True if S is a titlecased string and there is at least one\n\
7184character in S, i.e. upper- and titlecase characters may only\n\
7185follow uncased characters and lowercase characters only cased ones.\n\
7186Return False otherwise.");
7187
7188static PyObject*
7189unicode_istitle(PyUnicodeObject *self)
7190{
7191    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7192    register const Py_UNICODE *e;
7193    int cased, previous_is_cased;
7194
7195    /* Shortcut for single character strings */
7196    if (PyUnicode_GET_SIZE(self) == 1)
7197        return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7198                               (Py_UNICODE_ISUPPER(*p) != 0));
7199
7200    /* Special case for empty strings */
7201    if (PyUnicode_GET_SIZE(self) == 0)
7202        return PyBool_FromLong(0);
7203
7204    e = p + PyUnicode_GET_SIZE(self);
7205    cased = 0;
7206    previous_is_cased = 0;
7207    for (; p < e; p++) {
7208        register const Py_UNICODE ch = *p;
7209
7210        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7211            if (previous_is_cased)
7212                return PyBool_FromLong(0);
7213            previous_is_cased = 1;
7214            cased = 1;
7215        }
7216        else if (Py_UNICODE_ISLOWER(ch)) {
7217            if (!previous_is_cased)
7218                return PyBool_FromLong(0);
7219            previous_is_cased = 1;
7220            cased = 1;
7221        }
7222        else
7223            previous_is_cased = 0;
7224    }
7225    return PyBool_FromLong(cased);
7226}
7227
7228PyDoc_STRVAR(isspace__doc__,
7229             "S.isspace() -> bool\n\
7230\n\
7231Return True if all characters in S are whitespace\n\
7232and there is at least one character in S, False otherwise.");
7233
7234static PyObject*
7235unicode_isspace(PyUnicodeObject *self)
7236{
7237    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7238    register const Py_UNICODE *e;
7239
7240    /* Shortcut for single character strings */
7241    if (PyUnicode_GET_SIZE(self) == 1 &&
7242        Py_UNICODE_ISSPACE(*p))
7243        return PyBool_FromLong(1);
7244
7245    /* Special case for empty strings */
7246    if (PyUnicode_GET_SIZE(self) == 0)
7247        return PyBool_FromLong(0);
7248
7249    e = p + PyUnicode_GET_SIZE(self);
7250    for (; p < e; p++) {
7251        if (!Py_UNICODE_ISSPACE(*p))
7252            return PyBool_FromLong(0);
7253    }
7254    return PyBool_FromLong(1);
7255}
7256
7257PyDoc_STRVAR(isalpha__doc__,
7258             "S.isalpha() -> bool\n\
7259\n\
7260Return True if all characters in S are alphabetic\n\
7261and there is at least one character in S, False otherwise.");
7262
7263static PyObject*
7264unicode_isalpha(PyUnicodeObject *self)
7265{
7266    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7267    register const Py_UNICODE *e;
7268
7269    /* Shortcut for single character strings */
7270    if (PyUnicode_GET_SIZE(self) == 1 &&
7271        Py_UNICODE_ISALPHA(*p))
7272        return PyBool_FromLong(1);
7273
7274    /* Special case for empty strings */
7275    if (PyUnicode_GET_SIZE(self) == 0)
7276        return PyBool_FromLong(0);
7277
7278    e = p + PyUnicode_GET_SIZE(self);
7279    for (; p < e; p++) {
7280        if (!Py_UNICODE_ISALPHA(*p))
7281            return PyBool_FromLong(0);
7282    }
7283    return PyBool_FromLong(1);
7284}
7285
7286PyDoc_STRVAR(isalnum__doc__,
7287             "S.isalnum() -> bool\n\
7288\n\
7289Return True if all characters in S are alphanumeric\n\
7290and there is at least one character in S, False otherwise.");
7291
7292static PyObject*
7293unicode_isalnum(PyUnicodeObject *self)
7294{
7295    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7296    register const Py_UNICODE *e;
7297
7298    /* Shortcut for single character strings */
7299    if (PyUnicode_GET_SIZE(self) == 1 &&
7300        Py_UNICODE_ISALNUM(*p))
7301        return PyBool_FromLong(1);
7302
7303    /* Special case for empty strings */
7304    if (PyUnicode_GET_SIZE(self) == 0)
7305        return PyBool_FromLong(0);
7306
7307    e = p + PyUnicode_GET_SIZE(self);
7308    for (; p < e; p++) {
7309        if (!Py_UNICODE_ISALNUM(*p))
7310            return PyBool_FromLong(0);
7311    }
7312    return PyBool_FromLong(1);
7313}
7314
7315PyDoc_STRVAR(isdecimal__doc__,
7316             "S.isdecimal() -> bool\n\
7317\n\
7318Return True if there are only decimal characters in S,\n\
7319False otherwise.");
7320
7321static PyObject*
7322unicode_isdecimal(PyUnicodeObject *self)
7323{
7324    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7325    register const Py_UNICODE *e;
7326
7327    /* Shortcut for single character strings */
7328    if (PyUnicode_GET_SIZE(self) == 1 &&
7329        Py_UNICODE_ISDECIMAL(*p))
7330        return PyBool_FromLong(1);
7331
7332    /* Special case for empty strings */
7333    if (PyUnicode_GET_SIZE(self) == 0)
7334        return PyBool_FromLong(0);
7335
7336    e = p + PyUnicode_GET_SIZE(self);
7337    for (; p < e; p++) {
7338        if (!Py_UNICODE_ISDECIMAL(*p))
7339            return PyBool_FromLong(0);
7340    }
7341    return PyBool_FromLong(1);
7342}
7343
7344PyDoc_STRVAR(isdigit__doc__,
7345             "S.isdigit() -> bool\n\
7346\n\
7347Return True if all characters in S are digits\n\
7348and there is at least one character in S, False otherwise.");
7349
7350static PyObject*
7351unicode_isdigit(PyUnicodeObject *self)
7352{
7353    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7354    register const Py_UNICODE *e;
7355
7356    /* Shortcut for single character strings */
7357    if (PyUnicode_GET_SIZE(self) == 1 &&
7358        Py_UNICODE_ISDIGIT(*p))
7359        return PyBool_FromLong(1);
7360
7361    /* Special case for empty strings */
7362    if (PyUnicode_GET_SIZE(self) == 0)
7363        return PyBool_FromLong(0);
7364
7365    e = p + PyUnicode_GET_SIZE(self);
7366    for (; p < e; p++) {
7367        if (!Py_UNICODE_ISDIGIT(*p))
7368            return PyBool_FromLong(0);
7369    }
7370    return PyBool_FromLong(1);
7371}
7372
7373PyDoc_STRVAR(isnumeric__doc__,
7374             "S.isnumeric() -> bool\n\
7375\n\
7376Return True if there are only numeric characters in S,\n\
7377False otherwise.");
7378
7379static PyObject*
7380unicode_isnumeric(PyUnicodeObject *self)
7381{
7382    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7383    register const Py_UNICODE *e;
7384
7385    /* Shortcut for single character strings */
7386    if (PyUnicode_GET_SIZE(self) == 1 &&
7387        Py_UNICODE_ISNUMERIC(*p))
7388        return PyBool_FromLong(1);
7389
7390    /* Special case for empty strings */
7391    if (PyUnicode_GET_SIZE(self) == 0)
7392        return PyBool_FromLong(0);
7393
7394    e = p + PyUnicode_GET_SIZE(self);
7395    for (; p < e; p++) {
7396        if (!Py_UNICODE_ISNUMERIC(*p))
7397            return PyBool_FromLong(0);
7398    }
7399    return PyBool_FromLong(1);
7400}
7401
7402int
7403PyUnicode_IsIdentifier(PyObject *self)
7404{
7405    register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7406    register const Py_UNICODE *e;
7407
7408    /* Special case for empty strings */
7409    if (PyUnicode_GET_SIZE(self) == 0)
7410        return 0;
7411
7412    /* PEP 3131 says that the first character must be in
7413       XID_Start and subsequent characters in XID_Continue,
7414       and for the ASCII range, the 2.x rules apply (i.e
7415       start with letters and underscore, continue with
7416       letters, digits, underscore). However, given the current
7417       definition of XID_Start and XID_Continue, it is sufficient
7418       to check just for these, except that _ must be allowed
7419       as starting an identifier.  */
7420    if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7421        return 0;
7422
7423    e = p + PyUnicode_GET_SIZE(self);
7424    for (p++; p < e; p++) {
7425        if (!_PyUnicode_IsXidContinue(*p))
7426            return 0;
7427    }
7428    return 1;
7429}
7430
7431PyDoc_STRVAR(isidentifier__doc__,
7432             "S.isidentifier() -> bool\n\
7433\n\
7434Return True if S is a valid identifier according\n\
7435to the language definition.");
7436
7437static PyObject*
7438unicode_isidentifier(PyObject *self)
7439{
7440    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7441}
7442
7443PyDoc_STRVAR(isprintable__doc__,
7444             "S.isprintable() -> bool\n\
7445\n\
7446Return True if all characters in S are considered\n\
7447printable in repr() or S is empty, False otherwise.");
7448
7449static PyObject*
7450unicode_isprintable(PyObject *self)
7451{
7452    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7453    register const Py_UNICODE *e;
7454
7455    /* Shortcut for single character strings */
7456    if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7457        Py_RETURN_TRUE;
7458    }
7459
7460    e = p + PyUnicode_GET_SIZE(self);
7461    for (; p < e; p++) {
7462        if (!Py_UNICODE_ISPRINTABLE(*p)) {
7463            Py_RETURN_FALSE;
7464        }
7465    }
7466    Py_RETURN_TRUE;
7467}
7468
7469PyDoc_STRVAR(join__doc__,
7470             "S.join(sequence) -> str\n\
7471\n\
7472Return a string which is the concatenation of the strings in the\n\
7473sequence.  The separator between elements is S.");
7474
7475static PyObject*
7476unicode_join(PyObject *self, PyObject *data)
7477{
7478    return PyUnicode_Join(self, data);
7479}
7480
7481static Py_ssize_t
7482unicode_length(PyUnicodeObject *self)
7483{
7484    return self->length;
7485}
7486
7487PyDoc_STRVAR(ljust__doc__,
7488             "S.ljust(width[, fillchar]) -> str\n\
7489\n\
7490Return S left-justified in a Unicode string of length width. Padding is\n\
7491done using the specified fill character (default is a space).");
7492
7493static PyObject *
7494unicode_ljust(PyUnicodeObject *self, PyObject *args)
7495{
7496    Py_ssize_t width;
7497    Py_UNICODE fillchar = ' ';
7498
7499    if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
7500        return NULL;
7501
7502    if (self->length >= width && PyUnicode_CheckExact(self)) {
7503        Py_INCREF(self);
7504        return (PyObject*) self;
7505    }
7506
7507    return (PyObject*) pad(self, 0, width - self->length, fillchar);
7508}
7509
7510PyDoc_STRVAR(lower__doc__,
7511             "S.lower() -> str\n\
7512\n\
7513Return a copy of the string S converted to lowercase.");
7514
7515static PyObject*
7516unicode_lower(PyUnicodeObject *self)
7517{
7518    return fixup(self, fixlower);
7519}
7520
/* Strip modes understood by _PyUnicode_XStrip(), do_strip() and
   do_argstrip().  The values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above: PyArg_ParseTuple format strings, one per mode */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: skips the three-character "|O:" prefix of
   the matching format string. */
#define STRIPNAME(i) (stripformat[i]+3)
7529
7530/* externally visible for str.strip(unicode) */
7531PyObject *
7532_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7533{
7534    Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7535    Py_ssize_t len = PyUnicode_GET_SIZE(self);
7536    Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7537    Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7538    Py_ssize_t i, j;
7539
7540    BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7541
7542    i = 0;
7543    if (striptype != RIGHTSTRIP) {
7544        while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7545            i++;
7546        }
7547    }
7548
7549    j = len;
7550    if (striptype != LEFTSTRIP) {
7551        do {
7552            j--;
7553        } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7554        j++;
7555    }
7556
7557    if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7558        Py_INCREF(self);
7559        return (PyObject*)self;
7560    }
7561    else
7562        return PyUnicode_FromUnicode(s+i, j-i);
7563}
7564
7565
7566static PyObject *
7567do_strip(PyUnicodeObject *self, int striptype)
7568{
7569    Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7570    Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
7571
7572    i = 0;
7573    if (striptype != RIGHTSTRIP) {
7574        while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7575            i++;
7576        }
7577    }
7578
7579    j = len;
7580    if (striptype != LEFTSTRIP) {
7581        do {
7582            j--;
7583        } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7584        j++;
7585    }
7586
7587    if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7588        Py_INCREF(self);
7589        return (PyObject*)self;
7590    }
7591    else
7592        return PyUnicode_FromUnicode(s+i, j-i);
7593}
7594
7595
7596static PyObject *
7597do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7598{
7599    PyObject *sep = NULL;
7600
7601    if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7602        return NULL;
7603
7604    if (sep != NULL && sep != Py_None) {
7605        if (PyUnicode_Check(sep))
7606            return _PyUnicode_XStrip(self, striptype, sep);
7607        else {
7608            PyErr_Format(PyExc_TypeError,
7609                         "%s arg must be None or str",
7610                         STRIPNAME(striptype));
7611            return NULL;
7612        }
7613    }
7614
7615    return do_strip(self, striptype);
7616}
7617
7618
7619PyDoc_STRVAR(strip__doc__,
7620             "S.strip([chars]) -> str\n\
7621\n\
7622Return a copy of the string S with leading and trailing\n\
7623whitespace removed.\n\
7624If chars is given and not None, remove characters in chars instead.");
7625
7626static PyObject *
7627unicode_strip(PyUnicodeObject *self, PyObject *args)
7628{
7629    if (PyTuple_GET_SIZE(args) == 0)
7630        return do_strip(self, BOTHSTRIP); /* Common case */
7631    else
7632        return do_argstrip(self, BOTHSTRIP, args);
7633}
7634
7635
7636PyDoc_STRVAR(lstrip__doc__,
7637             "S.lstrip([chars]) -> str\n\
7638\n\
7639Return a copy of the string S with leading whitespace removed.\n\
7640If chars is given and not None, remove characters in chars instead.");
7641
7642static PyObject *
7643unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7644{
7645    if (PyTuple_GET_SIZE(args) == 0)
7646        return do_strip(self, LEFTSTRIP); /* Common case */
7647    else
7648        return do_argstrip(self, LEFTSTRIP, args);
7649}
7650
7651
7652PyDoc_STRVAR(rstrip__doc__,
7653             "S.rstrip([chars]) -> str\n\
7654\n\
7655Return a copy of the string S with trailing whitespace removed.\n\
7656If chars is given and not None, remove characters in chars instead.");
7657
7658static PyObject *
7659unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7660{
7661    if (PyTuple_GET_SIZE(args) == 0)
7662        return do_strip(self, RIGHTSTRIP); /* Common case */
7663    else
7664        return do_argstrip(self, RIGHTSTRIP, args);
7665}
7666
7667
7668static PyObject*
7669unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
7670{
7671    PyUnicodeObject *u;
7672    Py_UNICODE *p;
7673    Py_ssize_t nchars;
7674    size_t nbytes;
7675
7676    if (len < 0)
7677        len = 0;
7678
7679    if (len == 1 && PyUnicode_CheckExact(str)) {
7680        /* no repeat, return original string */
7681        Py_INCREF(str);
7682        return (PyObject*) str;
7683    }
7684
7685    /* ensure # of chars needed doesn't overflow int and # of bytes
7686     * needed doesn't overflow size_t
7687     */
7688    nchars = len * str->length;
7689    if (len && nchars / len != str->length) {
7690        PyErr_SetString(PyExc_OverflowError,
7691                        "repeated string is too long");
7692        return NULL;
7693    }
7694    nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7695    if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7696        PyErr_SetString(PyExc_OverflowError,
7697                        "repeated string is too long");
7698        return NULL;
7699    }
7700    u = _PyUnicode_New(nchars);
7701    if (!u)
7702        return NULL;
7703
7704    p = u->str;
7705
7706    if (str->length == 1 && len > 0) {
7707        Py_UNICODE_FILL(p, str->str[0], len);
7708    } else {
7709        Py_ssize_t done = 0; /* number of characters copied this far */
7710        if (done < nchars) {
7711            Py_UNICODE_COPY(p, str->str, str->length);
7712            done = str->length;
7713        }
7714        while (done < nchars) {
7715            Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
7716            Py_UNICODE_COPY(p+done, p, n);
7717            done += n;
7718        }
7719    }
7720
7721    return (PyObject*) u;
7722}
7723
7724PyObject *PyUnicode_Replace(PyObject *obj,
7725                            PyObject *subobj,
7726                            PyObject *replobj,
7727                            Py_ssize_t maxcount)
7728{
7729    PyObject *self;
7730    PyObject *str1;
7731    PyObject *str2;
7732    PyObject *result;
7733
7734    self = PyUnicode_FromObject(obj);
7735    if (self == NULL)
7736        return NULL;
7737    str1 = PyUnicode_FromObject(subobj);
7738    if (str1 == NULL) {
7739        Py_DECREF(self);
7740        return NULL;
7741    }
7742    str2 = PyUnicode_FromObject(replobj);
7743    if (str2 == NULL) {
7744        Py_DECREF(self);
7745        Py_DECREF(str1);
7746        return NULL;
7747    }
7748    result = replace((PyUnicodeObject *)self,
7749                     (PyUnicodeObject *)str1,
7750                     (PyUnicodeObject *)str2,
7751                     maxcount);
7752    Py_DECREF(self);
7753    Py_DECREF(str1);
7754    Py_DECREF(str2);
7755    return result;
7756}
7757
7758PyDoc_STRVAR(replace__doc__,
7759             "S.replace (old, new[, count]) -> str\n\
7760\n\
7761Return a copy of S with all occurrences of substring\n\
7762old replaced by new.  If the optional argument count is\n\
7763given, only the first count occurrences are replaced.");
7764
7765static PyObject*
7766unicode_replace(PyUnicodeObject *self, PyObject *args)
7767{
7768    PyUnicodeObject *str1;
7769    PyUnicodeObject *str2;
7770    Py_ssize_t maxcount = -1;
7771    PyObject *result;
7772
7773    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
7774        return NULL;
7775    str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7776    if (str1 == NULL)
7777        return NULL;
7778    str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
7779    if (str2 == NULL) {
7780        Py_DECREF(str1);
7781        return NULL;
7782    }
7783
7784    result = replace(self, str1, str2, maxcount);
7785
7786    Py_DECREF(str1);
7787    Py_DECREF(str2);
7788    return result;
7789}
7790
7791static
7792PyObject *unicode_repr(PyObject *unicode)
7793{
7794    PyObject *repr;
7795    Py_UNICODE *p;
7796    Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7797    Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7798
7799    /* XXX(nnorwitz): rather than over-allocating, it would be
7800       better to choose a different scheme.  Perhaps scan the
7801       first N-chars of the string and allocate based on that size.
7802    */
7803    /* Initial allocation is based on the longest-possible unichr
7804       escape.
7805
7806       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7807       unichr, so in this case it's the longest unichr escape. In
7808       narrow (UTF-16) builds this is five chars per source unichr
7809       since there are two unichrs in the surrogate pair, so in narrow
7810       (UTF-16) builds it's not the longest unichr escape.
7811
7812       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7813       so in the narrow (UTF-16) build case it's the longest unichr
7814       escape.
7815    */
7816
7817    repr = PyUnicode_FromUnicode(NULL,
7818                                 2 /* quotes */
7819#ifdef Py_UNICODE_WIDE
7820                                 + 10*size
7821#else
7822                                 + 6*size
7823#endif
7824                                 + 1);
7825    if (repr == NULL)
7826        return NULL;
7827
7828    p = PyUnicode_AS_UNICODE(repr);
7829
7830    /* Add quote */
7831    *p++ = (findchar(s, size, '\'') &&
7832            !findchar(s, size, '"')) ? '"' : '\'';
7833    while (size-- > 0) {
7834        Py_UNICODE ch = *s++;
7835
7836        /* Escape quotes and backslashes */
7837        if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
7838            *p++ = '\\';
7839            *p++ = ch;
7840            continue;
7841        }
7842
7843        /* Map special whitespace to '\t', \n', '\r' */
7844        if (ch == '\t') {
7845            *p++ = '\\';
7846            *p++ = 't';
7847        }
7848        else if (ch == '\n') {
7849            *p++ = '\\';
7850            *p++ = 'n';
7851        }
7852        else if (ch == '\r') {
7853            *p++ = '\\';
7854            *p++ = 'r';
7855        }
7856
7857        /* Map non-printable US ASCII to '\xhh' */
7858        else if (ch < ' ' || ch == 0x7F) {
7859            *p++ = '\\';
7860            *p++ = 'x';
7861            *p++ = hexdigits[(ch >> 4) & 0x000F];
7862            *p++ = hexdigits[ch & 0x000F];
7863        }
7864
7865        /* Copy ASCII characters as-is */
7866        else if (ch < 0x7F) {
7867            *p++ = ch;
7868        }
7869
7870        /* Non-ASCII characters */
7871        else {
7872            Py_UCS4 ucs = ch;
7873
7874#ifndef Py_UNICODE_WIDE
7875            Py_UNICODE ch2 = 0;
7876            /* Get code point from surrogate pair */
7877            if (size > 0) {
7878                ch2 = *s;
7879                if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
7880                    && ch2 <= 0xDFFF) {
7881                    ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
7882                        + 0x00010000;
7883                    s++;
7884                    size--;
7885                }
7886            }
7887#endif
7888            /* Map Unicode whitespace and control characters
7889               (categories Z* and C* except ASCII space)
7890            */
7891            if (!Py_UNICODE_ISPRINTABLE(ucs)) {
7892                /* Map 8-bit characters to '\xhh' */
7893                if (ucs <= 0xff) {
7894                    *p++ = '\\';
7895                    *p++ = 'x';
7896                    *p++ = hexdigits[(ch >> 4) & 0x000F];
7897                    *p++ = hexdigits[ch & 0x000F];
7898                }
7899                /* Map 21-bit characters to '\U00xxxxxx' */
7900                else if (ucs >= 0x10000) {
7901                    *p++ = '\\';
7902                    *p++ = 'U';
7903                    *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7904                    *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7905                    *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7906                    *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7907                    *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7908                    *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7909                    *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7910                    *p++ = hexdigits[ucs & 0x0000000F];
7911                }
7912                /* Map 16-bit characters to '\uxxxx' */
7913                else {
7914                    *p++ = '\\';
7915                    *p++ = 'u';
7916                    *p++ = hexdigits[(ucs >> 12) & 0x000F];
7917                    *p++ = hexdigits[(ucs >> 8) & 0x000F];
7918                    *p++ = hexdigits[(ucs >> 4) & 0x000F];
7919                    *p++ = hexdigits[ucs & 0x000F];
7920                }
7921            }
7922            /* Copy characters as-is */
7923            else {
7924                *p++ = ch;
7925#ifndef Py_UNICODE_WIDE
7926                if (ucs >= 0x10000)
7927                    *p++ = ch2;
7928#endif
7929            }
7930        }
7931    }
7932    /* Add quote */
7933    *p++ = PyUnicode_AS_UNICODE(repr)[0];
7934
7935    *p = '\0';
7936    PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
7937    return repr;
7938}
7939
7940PyDoc_STRVAR(rfind__doc__,
7941             "S.rfind(sub[, start[, end]]) -> int\n\
7942\n\
7943Return the highest index in S where substring sub is found,\n\
7944such that sub is contained within s[start:end].  Optional\n\
7945arguments start and end are interpreted as in slice notation.\n\
7946\n\
7947Return -1 on failure.");
7948
7949static PyObject *
7950unicode_rfind(PyUnicodeObject *self, PyObject *args)
7951{
7952    PyObject *substring;
7953    Py_ssize_t start;
7954    Py_ssize_t end;
7955    Py_ssize_t result;
7956
7957    if (!_ParseTupleFinds(args, &substring, &start, &end))
7958        return NULL;
7959
7960    result = stringlib_rfind_slice(
7961        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7962        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7963        start, end
7964        );
7965
7966    Py_DECREF(substring);
7967
7968    return PyLong_FromSsize_t(result);
7969}
7970
7971PyDoc_STRVAR(rindex__doc__,
7972             "S.rindex(sub[, start[, end]]) -> int\n\
7973\n\
7974Like S.rfind() but raise ValueError when the substring is not found.");
7975
7976static PyObject *
7977unicode_rindex(PyUnicodeObject *self, PyObject *args)
7978{
7979    PyObject *substring;
7980    Py_ssize_t start;
7981    Py_ssize_t end;
7982    Py_ssize_t result;
7983
7984    if (!_ParseTupleFinds(args, &substring, &start, &end))
7985        return NULL;
7986
7987    result = stringlib_rfind_slice(
7988        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7989        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7990        start, end
7991        );
7992
7993    Py_DECREF(substring);
7994
7995    if (result < 0) {
7996        PyErr_SetString(PyExc_ValueError, "substring not found");
7997        return NULL;
7998    }
7999    return PyLong_FromSsize_t(result);
8000}
8001
8002PyDoc_STRVAR(rjust__doc__,
8003             "S.rjust(width[, fillchar]) -> str\n\
8004\n\
8005Return S right-justified in a string of length width. Padding is\n\
8006done using the specified fill character (default is a space).");
8007
8008static PyObject *
8009unicode_rjust(PyUnicodeObject *self, PyObject *args)
8010{
8011    Py_ssize_t width;
8012    Py_UNICODE fillchar = ' ';
8013
8014    if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
8015        return NULL;
8016
8017    if (self->length >= width && PyUnicode_CheckExact(self)) {
8018        Py_INCREF(self);
8019        return (PyObject*) self;
8020    }
8021
8022    return (PyObject*) pad(self, width - self->length, 0, fillchar);
8023}
8024
8025PyObject *PyUnicode_Split(PyObject *s,
8026                          PyObject *sep,
8027                          Py_ssize_t maxsplit)
8028{
8029    PyObject *result;
8030
8031    s = PyUnicode_FromObject(s);
8032    if (s == NULL)
8033        return NULL;
8034    if (sep != NULL) {
8035        sep = PyUnicode_FromObject(sep);
8036        if (sep == NULL) {
8037            Py_DECREF(s);
8038            return NULL;
8039        }
8040    }
8041
8042    result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8043
8044    Py_DECREF(s);
8045    Py_XDECREF(sep);
8046    return result;
8047}
8048
8049PyDoc_STRVAR(split__doc__,
8050             "S.split([sep[, maxsplit]]) -> list of strings\n\
8051\n\
8052Return a list of the words in S, using sep as the\n\
8053delimiter string.  If maxsplit is given, at most maxsplit\n\
8054splits are done. If sep is not specified or is None, any\n\
8055whitespace string is a separator and empty strings are\n\
8056removed from the result.");
8057
8058static PyObject*
8059unicode_split(PyUnicodeObject *self, PyObject *args)
8060{
8061    PyObject *substring = Py_None;
8062    Py_ssize_t maxcount = -1;
8063
8064    if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
8065        return NULL;
8066
8067    if (substring == Py_None)
8068        return split(self, NULL, maxcount);
8069    else if (PyUnicode_Check(substring))
8070        return split(self, (PyUnicodeObject *)substring, maxcount);
8071    else
8072        return PyUnicode_Split((PyObject *)self, substring, maxcount);
8073}
8074
8075PyObject *
8076PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8077{
8078    PyObject* str_obj;
8079    PyObject* sep_obj;
8080    PyObject* out;
8081
8082    str_obj = PyUnicode_FromObject(str_in);
8083    if (!str_obj)
8084        return NULL;
8085    sep_obj = PyUnicode_FromObject(sep_in);
8086    if (!sep_obj) {
8087        Py_DECREF(str_obj);
8088        return NULL;
8089    }
8090
8091    out = stringlib_partition(
8092        str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8093        sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8094        );
8095
8096    Py_DECREF(sep_obj);
8097    Py_DECREF(str_obj);
8098
8099    return out;
8100}
8101
8102
8103PyObject *
8104PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8105{
8106    PyObject* str_obj;
8107    PyObject* sep_obj;
8108    PyObject* out;
8109
8110    str_obj = PyUnicode_FromObject(str_in);
8111    if (!str_obj)
8112        return NULL;
8113    sep_obj = PyUnicode_FromObject(sep_in);
8114    if (!sep_obj) {
8115        Py_DECREF(str_obj);
8116        return NULL;
8117    }
8118
8119    out = stringlib_rpartition(
8120        str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8121        sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8122        );
8123
8124    Py_DECREF(sep_obj);
8125    Py_DECREF(str_obj);
8126
8127    return out;
8128}
8129
8130PyDoc_STRVAR(partition__doc__,
8131             "S.partition(sep) -> (head, sep, tail)\n\
8132\n\
8133Search for the separator sep in S, and return the part before it,\n\
8134the separator itself, and the part after it.  If the separator is not\n\
8135found, return S and two empty strings.");
8136
8137static PyObject*
8138unicode_partition(PyUnicodeObject *self, PyObject *separator)
8139{
8140    return PyUnicode_Partition((PyObject *)self, separator);
8141}
8142
8143PyDoc_STRVAR(rpartition__doc__,
8144             "S.rpartition(sep) -> (tail, sep, head)\n\
8145\n\
8146Search for the separator sep in S, starting at the end of S, and return\n\
8147the part before it, the separator itself, and the part after it.  If the\n\
8148separator is not found, return two empty strings and S.");
8149
8150static PyObject*
8151unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8152{
8153    return PyUnicode_RPartition((PyObject *)self, separator);
8154}
8155
8156PyObject *PyUnicode_RSplit(PyObject *s,
8157                           PyObject *sep,
8158                           Py_ssize_t maxsplit)
8159{
8160    PyObject *result;
8161
8162    s = PyUnicode_FromObject(s);
8163    if (s == NULL)
8164        return NULL;
8165    if (sep != NULL) {
8166        sep = PyUnicode_FromObject(sep);
8167        if (sep == NULL) {
8168            Py_DECREF(s);
8169            return NULL;
8170        }
8171    }
8172
8173    result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8174
8175    Py_DECREF(s);
8176    Py_XDECREF(sep);
8177    return result;
8178}
8179
8180PyDoc_STRVAR(rsplit__doc__,
8181             "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
8182\n\
8183Return a list of the words in S, using sep as the\n\
8184delimiter string, starting at the end of the string and\n\
8185working to the front.  If maxsplit is given, at most maxsplit\n\
8186splits are done. If sep is not specified, any whitespace string\n\
8187is a separator.");
8188
8189static PyObject*
8190unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8191{
8192    PyObject *substring = Py_None;
8193    Py_ssize_t maxcount = -1;
8194
8195    if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
8196        return NULL;
8197
8198    if (substring == Py_None)
8199        return rsplit(self, NULL, maxcount);
8200    else if (PyUnicode_Check(substring))
8201        return rsplit(self, (PyUnicodeObject *)substring, maxcount);
8202    else
8203        return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
8204}
8205
8206PyDoc_STRVAR(splitlines__doc__,
8207             "S.splitlines([keepends]) -> list of strings\n\
8208\n\
8209Return a list of the lines in S, breaking at line boundaries.\n\
8210Line breaks are not included in the resulting list unless keepends\n\
8211is given and true.");
8212
8213static PyObject*
8214unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8215{
8216    int keepends = 0;
8217
8218    if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
8219        return NULL;
8220
8221    return PyUnicode_Splitlines((PyObject *)self, keepends);
8222}
8223
8224static
8225PyObject *unicode_str(PyObject *self)
8226{
8227    if (PyUnicode_CheckExact(self)) {
8228        Py_INCREF(self);
8229        return self;
8230    } else
8231        /* Subtype -- return genuine unicode string with the same value. */
8232        return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8233                                     PyUnicode_GET_SIZE(self));
8234}
8235
8236PyDoc_STRVAR(swapcase__doc__,
8237             "S.swapcase() -> str\n\
8238\n\
8239Return a copy of S with uppercase characters converted to lowercase\n\
8240and vice versa.");
8241
8242static PyObject*
8243unicode_swapcase(PyUnicodeObject *self)
8244{
8245    return fixup(self, fixswapcase);
8246}
8247
8248PyDoc_STRVAR(maketrans__doc__,
8249             "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
8250\n\
8251Return a translation table usable for str.translate().\n\
8252If there is only one argument, it must be a dictionary mapping Unicode\n\
8253ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
8254Character keys will be then converted to ordinals.\n\
8255If there are two arguments, they must be strings of equal length, and\n\
8256in the resulting dictionary, each character in x will be mapped to the\n\
8257character at the same position in y. If there is a third argument, it\n\
8258must be a string, whose characters will be mapped to None in the result.");
8259
8260static PyObject*
8261unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8262{
8263    PyObject *x, *y = NULL, *z = NULL;
8264    PyObject *new = NULL, *key, *value;
8265    Py_ssize_t i = 0;
8266    int res;
8267
8268    if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8269        return NULL;
8270    new = PyDict_New();
8271    if (!new)
8272        return NULL;
8273    if (y != NULL) {
8274        /* x must be a string too, of equal length */
8275        Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8276        if (!PyUnicode_Check(x)) {
8277            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8278                            "be a string if there is a second argument");
8279            goto err;
8280        }
8281        if (PyUnicode_GET_SIZE(x) != ylen) {
8282            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8283                            "arguments must have equal length");
8284            goto err;
8285        }
8286        /* create entries for translating chars in x to those in y */
8287        for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
8288            key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8289            value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
8290            if (!key || !value)
8291                goto err;
8292            res = PyDict_SetItem(new, key, value);
8293            Py_DECREF(key);
8294            Py_DECREF(value);
8295            if (res < 0)
8296                goto err;
8297        }
8298        /* create entries for deleting chars in z */
8299        if (z != NULL) {
8300            for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
8301                key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
8302                if (!key)
8303                    goto err;
8304                res = PyDict_SetItem(new, key, Py_None);
8305                Py_DECREF(key);
8306                if (res < 0)
8307                    goto err;
8308            }
8309        }
8310    } else {
8311        /* x must be a dict */
8312        if (!PyDict_Check(x)) {
8313            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8314                            "to maketrans it must be a dict");
8315            goto err;
8316        }
8317        /* copy entries into the new dict, converting string keys to int keys */
8318        while (PyDict_Next(x, &i, &key, &value)) {
8319            if (PyUnicode_Check(key)) {
8320                /* convert string keys to integer keys */
8321                PyObject *newkey;
8322                if (PyUnicode_GET_SIZE(key) != 1) {
8323                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
8324                                    "table must be of length 1");
8325                    goto err;
8326                }
8327                newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
8328                if (!newkey)
8329                    goto err;
8330                res = PyDict_SetItem(new, newkey, value);
8331                Py_DECREF(newkey);
8332                if (res < 0)
8333                    goto err;
8334            } else if (PyLong_Check(key)) {
8335                /* just keep integer keys */
8336                if (PyDict_SetItem(new, key, value) < 0)
8337                    goto err;
8338            } else {
8339                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8340                                "be strings or integers");
8341                goto err;
8342            }
8343        }
8344    }
8345    return new;
8346  err:
8347    Py_DECREF(new);
8348    return NULL;
8349}
8350
8351PyDoc_STRVAR(translate__doc__,
8352             "S.translate(table) -> str\n\
8353\n\
8354Return a copy of the string S, where all characters have been mapped\n\
8355through the given translation table, which must be a mapping of\n\
8356Unicode ordinals to Unicode ordinals, strings, or None.\n\
8357Unmapped characters are left untouched. Characters mapped to None\n\
8358are deleted.");
8359
8360static PyObject*
8361unicode_translate(PyUnicodeObject *self, PyObject *table)
8362{
8363    return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
8364}
8365
8366PyDoc_STRVAR(upper__doc__,
8367             "S.upper() -> str\n\
8368\n\
8369Return a copy of S converted to uppercase.");
8370
8371static PyObject*
8372unicode_upper(PyUnicodeObject *self)
8373{
8374    return fixup(self, fixupper);
8375}
8376
8377PyDoc_STRVAR(zfill__doc__,
8378             "S.zfill(width) -> str\n\
8379\n\
8380Pad a numeric string S with zeros on the left, to fill a field\n\
8381of the specified width. The string S is never truncated.");
8382
8383static PyObject *
8384unicode_zfill(PyUnicodeObject *self, PyObject *args)
8385{
8386    Py_ssize_t fill;
8387    PyUnicodeObject *u;
8388
8389    Py_ssize_t width;
8390    if (!PyArg_ParseTuple(args, "n:zfill", &width))
8391        return NULL;
8392
8393    if (self->length >= width) {
8394        if (PyUnicode_CheckExact(self)) {
8395            Py_INCREF(self);
8396            return (PyObject*) self;
8397        }
8398        else
8399            return PyUnicode_FromUnicode(
8400                PyUnicode_AS_UNICODE(self),
8401                PyUnicode_GET_SIZE(self)
8402                );
8403    }
8404
8405    fill = width - self->length;
8406
8407    u = pad(self, fill, 0, '0');
8408
8409    if (u == NULL)
8410        return NULL;
8411
8412    if (u->str[fill] == '+' || u->str[fill] == '-') {
8413        /* move sign to beginning of string */
8414        u->str[0] = u->str[fill];
8415        u->str[fill] = '0';
8416    }
8417
8418    return (PyObject*) u;
8419}
8420
#if 0
/* Compiled out by the surrounding #if 0: would return the value of
   numfree as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyLong_FromLong(numfree);
}
#endif
8428
8429PyDoc_STRVAR(startswith__doc__,
8430             "S.startswith(prefix[, start[, end]]) -> bool\n\
8431\n\
8432Return True if S starts with the specified prefix, False otherwise.\n\
8433With optional start, test S beginning at that position.\n\
8434With optional end, stop comparing S at that position.\n\
8435prefix can also be a tuple of strings to try.");
8436
8437static PyObject *
8438unicode_startswith(PyUnicodeObject *self,
8439                   PyObject *args)
8440{
8441    PyObject *subobj;
8442    PyUnicodeObject *substring;
8443    Py_ssize_t start = 0;
8444    Py_ssize_t end = PY_SSIZE_T_MAX;
8445    int result;
8446
8447    if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
8448                          _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8449        return NULL;
8450    if (PyTuple_Check(subobj)) {
8451        Py_ssize_t i;
8452        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8453            substring = (PyUnicodeObject *)PyUnicode_FromObject(
8454                PyTuple_GET_ITEM(subobj, i));
8455            if (substring == NULL)
8456                return NULL;
8457            result = tailmatch(self, substring, start, end, -1);
8458            Py_DECREF(substring);
8459            if (result) {
8460                Py_RETURN_TRUE;
8461            }
8462        }
8463        /* nothing matched */
8464        Py_RETURN_FALSE;
8465    }
8466    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
8467    if (substring == NULL)
8468        return NULL;
8469    result = tailmatch(self, substring, start, end, -1);
8470    Py_DECREF(substring);
8471    return PyBool_FromLong(result);
8472}
8473
8474
8475PyDoc_STRVAR(endswith__doc__,
8476             "S.endswith(suffix[, start[, end]]) -> bool\n\
8477\n\
8478Return True if S ends with the specified suffix, False otherwise.\n\
8479With optional start, test S beginning at that position.\n\
8480With optional end, stop comparing S at that position.\n\
8481suffix can also be a tuple of strings to try.");
8482
8483static PyObject *
8484unicode_endswith(PyUnicodeObject *self,
8485                 PyObject *args)
8486{
8487    PyObject *subobj;
8488    PyUnicodeObject *substring;
8489    Py_ssize_t start = 0;
8490    Py_ssize_t end = PY_SSIZE_T_MAX;
8491    int result;
8492
8493    if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
8494                          _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8495        return NULL;
8496    if (PyTuple_Check(subobj)) {
8497        Py_ssize_t i;
8498        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8499            substring = (PyUnicodeObject *)PyUnicode_FromObject(
8500                PyTuple_GET_ITEM(subobj, i));
8501            if (substring == NULL)
8502                return NULL;
8503            result = tailmatch(self, substring, start, end, +1);
8504            Py_DECREF(substring);
8505            if (result) {
8506                Py_RETURN_TRUE;
8507            }
8508        }
8509        Py_RETURN_FALSE;
8510    }
8511    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
8512    if (substring == NULL)
8513        return NULL;
8514
8515    result = tailmatch(self, substring, start, end, +1);
8516    Py_DECREF(substring);
8517    return PyBool_FromLong(result);
8518}
8519
8520#include "stringlib/string_format.h"
8521
8522PyDoc_STRVAR(format__doc__,
8523             "S.format(*args, **kwargs) -> str\n\
8524\n\
8525");
8526
8527static PyObject *
8528unicode__format__(PyObject* self, PyObject* args)
8529{
8530    PyObject *format_spec;
8531
8532    if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8533        return NULL;
8534
8535    return _PyUnicode_FormatAdvanced(self,
8536                                     PyUnicode_AS_UNICODE(format_spec),
8537                                     PyUnicode_GET_SIZE(format_spec));
8538}
8539
8540PyDoc_STRVAR(p_format__doc__,
8541             "S.__format__(format_spec) -> str\n\
8542\n\
8543");
8544
8545static PyObject *
8546unicode__sizeof__(PyUnicodeObject *v)
8547{
8548    return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8549                              sizeof(Py_UNICODE) * (v->length + 1));
8550}
8551
8552PyDoc_STRVAR(sizeof__doc__,
8553             "S.__sizeof__() -> size of S in memory, in bytes");
8554
8555static PyObject *
8556unicode_getnewargs(PyUnicodeObject *v)
8557{
8558    return Py_BuildValue("(u#)", v->str, v->length);
8559}
8560
8561
8562static PyMethodDef unicode_methods[] = {
8563
8564    /* Order is according to common usage: often used methods should
8565       appear first, since lookup is done sequentially. */
8566
8567    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
8568    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8569    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
8570    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
8571    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8572    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8573    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8574    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8575    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8576    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8577    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
8578    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
8579    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8580    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8581    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
8582    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
8583    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8584    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8585    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
8586    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
8587    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
8588    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
8589    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
8590    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8591    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8592    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8593    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8594    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8595    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8596    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8597    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8598    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8599    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8600    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8601    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8602    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8603    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
8604    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
8605    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
8606    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
8607    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
8608    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
8609    {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8610    {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
8611    {"maketrans", (PyCFunction) unicode_maketrans,
8612     METH_VARARGS | METH_STATIC, maketrans__doc__},
8613    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
8614#if 0
8615    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
8616#endif
8617
8618#if 0
8619    /* This one is just used for debugging the implementation. */
8620    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
8621#endif
8622
8623    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
8624    {NULL, NULL}
8625};
8626
8627static PyObject *
8628unicode_mod(PyObject *v, PyObject *w)
8629{
8630    if (!PyUnicode_Check(v)) {
8631        Py_INCREF(Py_NotImplemented);
8632        return Py_NotImplemented;
8633    }
8634    return PyUnicode_Format(v, w);
8635}
8636
8637static PyNumberMethods unicode_as_number = {
8638    0,              /*nb_add*/
8639    0,              /*nb_subtract*/
8640    0,              /*nb_multiply*/
8641    unicode_mod,            /*nb_remainder*/
8642};
8643
8644static PySequenceMethods unicode_as_sequence = {
8645    (lenfunc) unicode_length,       /* sq_length */
8646    PyUnicode_Concat,           /* sq_concat */
8647    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
8648    (ssizeargfunc) unicode_getitem,     /* sq_item */
8649    0,                  /* sq_slice */
8650    0,                  /* sq_ass_item */
8651    0,                  /* sq_ass_slice */
8652    PyUnicode_Contains,         /* sq_contains */
8653};
8654
8655static PyObject*
8656unicode_subscript(PyUnicodeObject* self, PyObject* item)
8657{
8658    if (PyIndex_Check(item)) {
8659        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
8660        if (i == -1 && PyErr_Occurred())
8661            return NULL;
8662        if (i < 0)
8663            i += PyUnicode_GET_SIZE(self);
8664        return unicode_getitem(self, i);
8665    } else if (PySlice_Check(item)) {
8666        Py_ssize_t start, stop, step, slicelength, cur, i;
8667        Py_UNICODE* source_buf;
8668        Py_UNICODE* result_buf;
8669        PyObject* result;
8670
8671        if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
8672                                 &start, &stop, &step, &slicelength) < 0) {
8673            return NULL;
8674        }
8675
8676        if (slicelength <= 0) {
8677            return PyUnicode_FromUnicode(NULL, 0);
8678        } else if (start == 0 && step == 1 && slicelength == self->length &&
8679                   PyUnicode_CheckExact(self)) {
8680            Py_INCREF(self);
8681            return (PyObject *)self;
8682        } else if (step == 1) {
8683            return PyUnicode_FromUnicode(self->str + start, slicelength);
8684        } else {
8685            source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
8686            result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8687                                                       sizeof(Py_UNICODE));
8688
8689            if (result_buf == NULL)
8690                return PyErr_NoMemory();
8691
8692            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8693                result_buf[i] = source_buf[cur];
8694            }
8695
8696            result = PyUnicode_FromUnicode(result_buf, slicelength);
8697            PyObject_FREE(result_buf);
8698            return result;
8699        }
8700    } else {
8701        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8702        return NULL;
8703    }
8704}
8705
8706static PyMappingMethods unicode_as_mapping = {
8707    (lenfunc)unicode_length,        /* mp_length */
8708    (binaryfunc)unicode_subscript,  /* mp_subscript */
8709    (objobjargproc)0,           /* mp_ass_subscript */
8710};
8711
8712
8713/* Helpers for PyUnicode_Format() */
8714
8715static PyObject *
8716getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
8717{
8718    Py_ssize_t argidx = *p_argidx;
8719    if (argidx < arglen) {
8720        (*p_argidx)++;
8721        if (arglen < 0)
8722            return args;
8723        else
8724            return PyTuple_GetItem(args, argidx);
8725    }
8726    PyErr_SetString(PyExc_TypeError,
8727                    "not enough arguments for format string");
8728    return NULL;
8729}
8730
8731static Py_ssize_t
8732strtounicode(Py_UNICODE *buffer, const char *charbuffer)
8733{
8734    register Py_ssize_t i;
8735    Py_ssize_t len = strlen(charbuffer);
8736    for (i = len - 1; i >= 0; i--)
8737        buffer[i] = (Py_UNICODE) charbuffer[i];
8738
8739    return len;
8740}
8741
8742static int
8743doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8744{
8745    Py_ssize_t result;
8746
8747    PyOS_ascii_formatd((char *)buffer, len, format, x);
8748    result = strtounicode(buffer, (char *)buffer);
8749    return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8750}
8751
#if 0
/* Disabled counterpart of doubletounicode() for C longs: renders `x`
   with PyOS_snprintf() into the storage of `buffer`, widens the bytes in
   place with strtounicode(), and returns the character count. */
static int
longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
{
    char *bytes = (char *)buffer;
    Py_ssize_t nchars;

    PyOS_snprintf(bytes, len, format, x);
    nchars = strtounicode(buffer, bytes);
    return Py_SAFE_DOWNCAST(nchars, Py_ssize_t, int);
}
#endif
8763
8764/* XXX To save some code duplication, formatfloat/long/int could have been
8765   shared with stringobject.c, converting from 8-bit to Unicode after the
8766   formatting is done. */
8767
8768static int
8769formatfloat(Py_UNICODE *buf,
8770            size_t buflen,
8771            int flags,
8772            int prec,
8773            int type,
8774            PyObject *v)
8775{
8776    /* fmt = '%#.' + `prec` + `type`
8777       worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
8778    char fmt[20];
8779    double x;
8780
8781    x = PyFloat_AsDouble(v);
8782    if (x == -1.0 && PyErr_Occurred())
8783        return -1;
8784    if (prec < 0)
8785        prec = 6;
8786    if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8787        type = 'g';
8788    /* Worst case length calc to ensure no buffer overrun:
8789
8790       'g' formats:
8791       fmt = %#.<prec>g
8792       buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8793       for any double rep.)
8794       len = 1 + prec + 1 + 2 + 5 = 9 + prec
8795
8796       'f' formats:
8797       buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8798       len = 1 + 50 + 1 + prec = 52 + prec
8799
8800       If prec=0 the effective precision is 1 (the leading digit is
8801       always given), therefore increase the length by one.
8802
8803    */
8804    if (((type == 'g' || type == 'G') &&
8805         buflen <= (size_t)10 + (size_t)prec) ||
8806        (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
8807        PyErr_SetString(PyExc_OverflowError,
8808                        "formatted float is too long (precision too large?)");
8809        return -1;
8810    }
8811    PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8812                  (flags&F_ALT) ? "#" : "",
8813                  prec, type);
8814    return doubletounicode(buf, buflen, fmt, x);
8815}
8816
8817static PyObject*
8818formatlong(PyObject *val, int flags, int prec, int type)
8819{
8820    char *buf;
8821    int len;
8822    PyObject *str; /* temporary string object. */
8823    PyObject *result;
8824
8825    str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
8826    if (!str)
8827        return NULL;
8828    result = PyUnicode_FromStringAndSize(buf, len);
8829    Py_DECREF(str);
8830    return result;
8831}
8832
#if 0
/* Disabled (compiled out) formatter for integers that fit in a C long.

   Converts `v` with PyLong_AsLong(), builds a printf-style format in
   `fmt` from `flags`, `prec` and `type`, and renders the value into
   `buf` through longtounicode().  Returns the number of characters
   written, or -1 with an exception set (conversion failure, or
   OverflowError when `buflen` is too small for the requested
   precision). */
static int
formatint(Py_UNICODE *buf,
          size_t buflen,
          int flags,
          int prec,
          int type,
          PyObject *v)
{
    /* fmt = '%#.' + `prec` + 'l' + `type`
     * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
     *                     + 1 + 1
     *                   = 24
     */
    char fmt[64]; /* plenty big enough! */
    char *sign;
    long x;

    x = PyLong_AsLong(v);
    if (x == -1 && PyErr_Occurred())
        return -1;
    /* A negative value under %u is formatted as signed decimal. */
    if (x < 0 && type == 'u') {
        type = 'd';
    }
    /* For hex/octal the minus sign is emitted by hand and the magnitude
       (-x) is what gets formatted below. */
    if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
        sign = "-";
    else
        sign = "";
    if (prec < 0)
        prec = 1;

    /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
     * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
     */
    if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
        PyErr_SetString(PyExc_OverflowError,
                        "formatted integer is too long (precision too large?)");
        return -1;
    }

    if ((flags & F_ALT) &&
        (type == 'x' || type == 'X' || type == 'o')) {
        /* When converting under %#o, %#x or %#X, there are a number
         * of issues that cause pain:
         * - for %#o, we want a different base marker than C
         * - when 0 is being converted, the C standard leaves off
         *   the '0x' or '0X', which is inconsistent with other
         *   %#x/%#X conversions and inconsistent with Python's
         *   hex() function
         * - there are platforms that violate the standard and
         *   convert 0 with the '0x' or '0X'
         *   (Metrowerks, Compaq Tru64)
         * - there are platforms that give '0x' when converting
         *   under %#X, but convert 0 in accordance with the
         *   standard (OS/2 EMX)
         *
         * We can achieve the desired consistency by inserting our
         * own '0x' or '0X' prefix, and substituting %x/%X in place
         * of %#x/%#X.
         *
         * Note that this is the same approach as used in
         * formatint() in stringobject.c
         */
        PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
                      sign, type, prec, type);
    }
    else {
        PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
                      sign, (flags&F_ALT) ? "#" : "",
                      prec, type);
    }
    /* The sign, if any, is already part of fmt; pass the magnitude. */
    if (sign[0])
        return longtounicode(buf, buflen, fmt, -x);
    else
        return longtounicode(buf, buflen, fmt, x);
}
#endif
8910
8911static int
8912formatchar(Py_UNICODE *buf,
8913           size_t buflen,
8914           PyObject *v)
8915{
8916    /* presume that the buffer is at least 3 characters long */
8917    if (PyUnicode_Check(v)) {
8918        if (PyUnicode_GET_SIZE(v) == 1) {
8919            buf[0] = PyUnicode_AS_UNICODE(v)[0];
8920            buf[1] = '\0';
8921            return 1;
8922        }
8923#ifndef Py_UNICODE_WIDE
8924        if (PyUnicode_GET_SIZE(v) == 2) {
8925            /* Decode a valid surrogate pair */
8926            int c0 = PyUnicode_AS_UNICODE(v)[0];
8927            int c1 = PyUnicode_AS_UNICODE(v)[1];
8928            if (0xD800 <= c0 && c0 <= 0xDBFF &&
8929                0xDC00 <= c1 && c1 <= 0xDFFF) {
8930                buf[0] = c0;
8931                buf[1] = c1;
8932                buf[2] = '\0';
8933                return 2;
8934            }
8935        }
8936#endif
8937        goto onError;
8938    }
8939    else {
8940        /* Integer input truncated to a character */
8941        long x;
8942        x = PyLong_AsLong(v);
8943        if (x == -1 && PyErr_Occurred())
8944            goto onError;
8945
8946        if (x < 0 || x > 0x10ffff) {
8947            PyErr_SetString(PyExc_OverflowError,
8948                            "%c arg not in range(0x110000)");
8949            return -1;
8950        }
8951
8952#ifndef Py_UNICODE_WIDE
8953        if (x > 0xffff) {
8954            x -= 0x10000;
8955            buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
8956            buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
8957            return 2;
8958        }
8959#endif
8960        buf[0] = (Py_UNICODE) x;
8961        buf[1] = '\0';
8962        return 1;
8963    }
8964
8965  onError:
8966    PyErr_SetString(PyExc_TypeError,
8967                    "%c requires int or char");
8968    return -1;
8969}
8970
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single operand wherever it is used (e.g. after `sizeof`).
*/
#define FORMATBUFLEN ((size_t)120)
8980
8981PyObject *PyUnicode_Format(PyObject *format,
8982                           PyObject *args)
8983{
8984    Py_UNICODE *fmt, *res;
8985    Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
8986    int args_owned = 0;
8987    PyUnicodeObject *result = NULL;
8988    PyObject *dict = NULL;
8989    PyObject *uformat;
8990
8991    if (format == NULL || args == NULL) {
8992        PyErr_BadInternalCall();
8993        return NULL;
8994    }
8995    uformat = PyUnicode_FromObject(format);
8996    if (uformat == NULL)
8997        return NULL;
8998    fmt = PyUnicode_AS_UNICODE(uformat);
8999    fmtcnt = PyUnicode_GET_SIZE(uformat);
9000
9001    reslen = rescnt = fmtcnt + 100;
9002    result = _PyUnicode_New(reslen);
9003    if (result == NULL)
9004        goto onError;
9005    res = PyUnicode_AS_UNICODE(result);
9006
9007    if (PyTuple_Check(args)) {
9008        arglen = PyTuple_Size(args);
9009        argidx = 0;
9010    }
9011    else {
9012        arglen = -1;
9013        argidx = -2;
9014    }
9015    if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
9016        !PyUnicode_Check(args))
9017        dict = args;
9018
9019    while (--fmtcnt >= 0) {
9020        if (*fmt != '%') {
9021            if (--rescnt < 0) {
9022                rescnt = fmtcnt + 100;
9023                reslen += rescnt;
9024                if (_PyUnicode_Resize(&result, reslen) < 0)
9025                    goto onError;
9026                res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9027                --rescnt;
9028            }
9029            *res++ = *fmt++;
9030        }
9031        else {
9032            /* Got a format specifier */
9033            int flags = 0;
9034            Py_ssize_t width = -1;
9035            int prec = -1;
9036            Py_UNICODE c = '\0';
9037            Py_UNICODE fill;
9038            int isnumok;
9039            PyObject *v = NULL;
9040            PyObject *temp = NULL;
9041            Py_UNICODE *pbuf;
9042            Py_UNICODE sign;
9043            Py_ssize_t len;
9044            Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
9045
9046            fmt++;
9047            if (*fmt == '(') {
9048                Py_UNICODE *keystart;
9049                Py_ssize_t keylen;
9050                PyObject *key;
9051                int pcount = 1;
9052
9053                if (dict == NULL) {
9054                    PyErr_SetString(PyExc_TypeError,
9055                                    "format requires a mapping");
9056                    goto onError;
9057                }
9058                ++fmt;
9059                --fmtcnt;
9060                keystart = fmt;
9061                /* Skip over balanced parentheses */
9062                while (pcount > 0 && --fmtcnt >= 0) {
9063                    if (*fmt == ')')
9064                        --pcount;
9065                    else if (*fmt == '(')
9066                        ++pcount;
9067                    fmt++;
9068                }
9069                keylen = fmt - keystart - 1;
9070                if (fmtcnt < 0 || pcount > 0) {
9071                    PyErr_SetString(PyExc_ValueError,
9072                                    "incomplete format key");
9073                    goto onError;
9074                }
9075#if 0
9076                /* keys are converted to strings using UTF-8 and
9077                   then looked up since Python uses strings to hold
9078                   variables names etc. in its namespaces and we
9079                   wouldn't want to break common idioms. */
9080                key = PyUnicode_EncodeUTF8(keystart,
9081                                           keylen,
9082                                           NULL);
9083#else
9084                key = PyUnicode_FromUnicode(keystart, keylen);
9085#endif
9086                if (key == NULL)
9087                    goto onError;
9088                if (args_owned) {
9089                    Py_DECREF(args);
9090                    args_owned = 0;
9091                }
9092                args = PyObject_GetItem(dict, key);
9093                Py_DECREF(key);
9094                if (args == NULL) {
9095                    goto onError;
9096                }
9097                args_owned = 1;
9098                arglen = -1;
9099                argidx = -2;
9100            }
9101            while (--fmtcnt >= 0) {
9102                switch (c = *fmt++) {
9103                case '-': flags |= F_LJUST; continue;
9104                case '+': flags |= F_SIGN; continue;
9105                case ' ': flags |= F_BLANK; continue;
9106                case '#': flags |= F_ALT; continue;
9107                case '0': flags |= F_ZERO; continue;
9108                }
9109                break;
9110            }
9111            if (c == '*') {
9112                v = getnextarg(args, arglen, &argidx);
9113                if (v == NULL)
9114                    goto onError;
9115                if (!PyLong_Check(v)) {
9116                    PyErr_SetString(PyExc_TypeError,
9117                                    "* wants int");
9118                    goto onError;
9119                }
9120                width = PyLong_AsLong(v);
9121                if (width == -1 && PyErr_Occurred())
9122                    goto onError;
9123                if (width < 0) {
9124                    flags |= F_LJUST;
9125                    width = -width;
9126                }
9127                if (--fmtcnt >= 0)
9128                    c = *fmt++;
9129            }
9130            else if (c >= '0' && c <= '9') {
9131                width = c - '0';
9132                while (--fmtcnt >= 0) {
9133                    c = *fmt++;
9134                    if (c < '0' || c > '9')
9135                        break;
9136                    if ((width*10) / 10 != width) {
9137                        PyErr_SetString(PyExc_ValueError,
9138                                        "width too big");
9139                        goto onError;
9140                    }
9141                    width = width*10 + (c - '0');
9142                }
9143            }
9144            if (c == '.') {
9145                prec = 0;
9146                if (--fmtcnt >= 0)
9147                    c = *fmt++;
9148                if (c == '*') {
9149                    v = getnextarg(args, arglen, &argidx);
9150                    if (v == NULL)
9151                        goto onError;
9152                    if (!PyLong_Check(v)) {
9153                        PyErr_SetString(PyExc_TypeError,
9154                                        "* wants int");
9155                        goto onError;
9156                    }
9157                    prec = PyLong_AsLong(v);
9158                    if (prec == -1 && PyErr_Occurred())
9159                        goto onError;
9160                    if (prec < 0)
9161                        prec = 0;
9162                    if (--fmtcnt >= 0)
9163                        c = *fmt++;
9164                }
9165                else if (c >= '0' && c <= '9') {
9166                    prec = c - '0';
9167                    while (--fmtcnt >= 0) {
9168                        c = Py_CHARMASK(*fmt++);
9169                        if (c < '0' || c > '9')
9170                            break;
9171                        if ((prec*10) / 10 != prec) {
9172                            PyErr_SetString(PyExc_ValueError,
9173                                            "prec too big");
9174                            goto onError;
9175                        }
9176                        prec = prec*10 + (c - '0');
9177                    }
9178                }
9179            } /* prec */
9180            if (fmtcnt >= 0) {
9181                if (c == 'h' || c == 'l' || c == 'L') {
9182                    if (--fmtcnt >= 0)
9183                        c = *fmt++;
9184                }
9185            }
9186            if (fmtcnt < 0) {
9187                PyErr_SetString(PyExc_ValueError,
9188                                "incomplete format");
9189                goto onError;
9190            }
9191            if (c != '%') {
9192                v = getnextarg(args, arglen, &argidx);
9193                if (v == NULL)
9194                    goto onError;
9195            }
9196            sign = 0;
9197            fill = ' ';
9198            switch (c) {
9199
9200            case '%':
9201                pbuf = formatbuf;
9202                /* presume that buffer length is at least 1 */
9203                pbuf[0] = '%';
9204                len = 1;
9205                break;
9206
9207            case 's':
9208            case 'r':
9209            case 'a':
9210                if (PyUnicode_Check(v) && c == 's') {
9211                    temp = v;
9212                    Py_INCREF(temp);
9213                }
9214                else {
9215                    if (c == 's')
9216                        temp = PyObject_Str(v);
9217                    else if (c == 'r')
9218                        temp = PyObject_Repr(v);
9219                    else
9220                        temp = PyObject_ASCII(v);
9221                    if (temp == NULL)
9222                        goto onError;
9223                    if (PyUnicode_Check(temp))
9224                        /* nothing to do */;
9225                    else {
9226                        Py_DECREF(temp);
9227                        PyErr_SetString(PyExc_TypeError,
9228                                        "%s argument has non-string str()");
9229                        goto onError;
9230                    }
9231                }
9232                pbuf = PyUnicode_AS_UNICODE(temp);
9233                len = PyUnicode_GET_SIZE(temp);
9234                if (prec >= 0 && len > prec)
9235                    len = prec;
9236                break;
9237
9238            case 'i':
9239            case 'd':
9240            case 'u':
9241            case 'o':
9242            case 'x':
9243            case 'X':
9244                if (c == 'i')
9245                    c = 'd';
9246                isnumok = 0;
9247                if (PyNumber_Check(v)) {
9248                    PyObject *iobj=NULL;
9249
9250                    if (PyLong_Check(v)) {
9251                        iobj = v;
9252                        Py_INCREF(iobj);
9253                    }
9254                    else {
9255                        iobj = PyNumber_Long(v);
9256                    }
9257                    if (iobj!=NULL) {
9258                        if (PyLong_Check(iobj)) {
9259                            isnumok = 1;
9260                            temp = formatlong(iobj, flags, prec, c);
9261                            Py_DECREF(iobj);
9262                            if (!temp)
9263                                goto onError;
9264                            pbuf = PyUnicode_AS_UNICODE(temp);
9265                            len = PyUnicode_GET_SIZE(temp);
9266                            sign = 1;
9267                        }
9268                        else {
9269                            Py_DECREF(iobj);
9270                        }
9271                    }
9272                }
9273                if (!isnumok) {
9274                    PyErr_Format(PyExc_TypeError,
9275                                 "%%%c format: a number is required, "
9276                                 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9277                    goto onError;
9278                }
9279                if (flags & F_ZERO)
9280                    fill = '0';
9281                break;
9282
9283            case 'e':
9284            case 'E':
9285            case 'f':
9286            case 'F':
9287            case 'g':
9288            case 'G':
9289                if (c == 'F')
9290                    c = 'f';
9291                pbuf = formatbuf;
9292                len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
9293                                  flags, prec, c, v);
9294                if (len < 0)
9295                    goto onError;
9296                sign = 1;
9297                if (flags & F_ZERO)
9298                    fill = '0';
9299                break;
9300
9301            case 'c':
9302                pbuf = formatbuf;
9303                len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9304                if (len < 0)
9305                    goto onError;
9306                break;
9307
9308            default:
9309                PyErr_Format(PyExc_ValueError,
9310                             "unsupported format character '%c' (0x%x) "
9311                             "at index %zd",
9312                             (31<=c && c<=126) ? (char)c : '?',
9313                             (int)c,
9314                             (Py_ssize_t)(fmt - 1 -
9315                                          PyUnicode_AS_UNICODE(uformat)));
9316                goto onError;
9317            }
9318            if (sign) {
9319                if (*pbuf == '-' || *pbuf == '+') {
9320                    sign = *pbuf++;
9321                    len--;
9322                }
9323                else if (flags & F_SIGN)
9324                    sign = '+';
9325                else if (flags & F_BLANK)
9326                    sign = ' ';
9327                else
9328                    sign = 0;
9329            }
9330            if (width < len)
9331                width = len;
9332            if (rescnt - (sign != 0) < width) {
9333                reslen -= rescnt;
9334                rescnt = width + fmtcnt + 100;
9335                reslen += rescnt;
9336                if (reslen < 0) {
9337                    Py_XDECREF(temp);
9338                    PyErr_NoMemory();
9339                    goto onError;
9340                }
9341                if (_PyUnicode_Resize(&result, reslen) < 0) {
9342                    Py_XDECREF(temp);
9343                    goto onError;
9344                }
9345                res = PyUnicode_AS_UNICODE(result)
9346                    + reslen - rescnt;
9347            }
9348            if (sign) {
9349                if (fill != ' ')
9350                    *res++ = sign;
9351                rescnt--;
9352                if (width > len)
9353                    width--;
9354            }
9355            if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9356                assert(pbuf[0] == '0');
9357                assert(pbuf[1] == c);
9358                if (fill != ' ') {
9359                    *res++ = *pbuf++;
9360                    *res++ = *pbuf++;
9361                }
9362                rescnt -= 2;
9363                width -= 2;
9364                if (width < 0)
9365                    width = 0;
9366                len -= 2;
9367            }
9368            if (width > len && !(flags & F_LJUST)) {
9369                do {
9370                    --rescnt;
9371                    *res++ = fill;
9372                } while (--width > len);
9373            }
9374            if (fill == ' ') {
9375                if (sign)
9376                    *res++ = sign;
9377                if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9378                    assert(pbuf[0] == '0');
9379                    assert(pbuf[1] == c);
9380                    *res++ = *pbuf++;
9381                    *res++ = *pbuf++;
9382                }
9383            }
9384            Py_UNICODE_COPY(res, pbuf, len);
9385            res += len;
9386            rescnt -= len;
9387            while (--width >= len) {
9388                --rescnt;
9389                *res++ = ' ';
9390            }
9391            if (dict && (argidx < arglen) && c != '%') {
9392                PyErr_SetString(PyExc_TypeError,
9393                                "not all arguments converted during string formatting");
9394                Py_XDECREF(temp);
9395                goto onError;
9396            }
9397            Py_XDECREF(temp);
9398        } /* '%' */
9399    } /* until end */
9400    if (argidx < arglen && !dict) {
9401        PyErr_SetString(PyExc_TypeError,
9402                        "not all arguments converted during string formatting");
9403        goto onError;
9404    }
9405
9406    if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
9407        goto onError;
9408    if (args_owned) {
9409        Py_DECREF(args);
9410    }
9411    Py_DECREF(uformat);
9412    return (PyObject *)result;
9413
9414  onError:
9415    Py_XDECREF(result);
9416    Py_DECREF(uformat);
9417    if (args_owned) {
9418        Py_DECREF(args);
9419    }
9420    return NULL;
9421}
9422
9423static PyObject *
9424unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9425
9426static PyObject *
9427unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9428{
9429    PyObject *x = NULL;
9430    static char *kwlist[] = {"object", "encoding", "errors", 0};
9431    char *encoding = NULL;
9432    char *errors = NULL;
9433
9434    if (type != &PyUnicode_Type)
9435        return unicode_subtype_new(type, args, kwds);
9436    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
9437                                     kwlist, &x, &encoding, &errors))
9438        return NULL;
9439    if (x == NULL)
9440        return (PyObject *)_PyUnicode_New(0);
9441    if (encoding == NULL && errors == NULL)
9442        return PyObject_Str(x);
9443    else
9444        return PyUnicode_FromEncodedObject(x, encoding, errors);
9445}
9446
9447static PyObject *
9448unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9449{
9450    PyUnicodeObject *tmp, *pnew;
9451    Py_ssize_t n;
9452
9453    assert(PyType_IsSubtype(type, &PyUnicode_Type));
9454    tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9455    if (tmp == NULL)
9456        return NULL;
9457    assert(PyUnicode_Check(tmp));
9458    pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9459    if (pnew == NULL) {
9460        Py_DECREF(tmp);
9461        return NULL;
9462    }
9463    pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9464    if (pnew->str == NULL) {
9465        _Py_ForgetReference((PyObject *)pnew);
9466        PyObject_Del(pnew);
9467        Py_DECREF(tmp);
9468        return PyErr_NoMemory();
9469    }
9470    Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9471    pnew->length = n;
9472    pnew->hash = tmp->hash;
9473    Py_DECREF(tmp);
9474    return (PyObject *)pnew;
9475}
9476
9477PyDoc_STRVAR(unicode_doc,
9478             "str(string[, encoding[, errors]]) -> str\n\
9479\n\
9480Create a new string object from the given encoded string.\n\
9481encoding defaults to the current default string encoding.\n\
9482errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
9483
9484static PyObject *unicode_iter(PyObject *seq);
9485
9486PyTypeObject PyUnicode_Type = {
9487    PyVarObject_HEAD_INIT(&PyType_Type, 0)
9488    "str",              /* tp_name */
9489    sizeof(PyUnicodeObject),        /* tp_size */
9490    0,                  /* tp_itemsize */
9491    /* Slots */
9492    (destructor)unicode_dealloc,    /* tp_dealloc */
9493    0,                  /* tp_print */
9494    0,                  /* tp_getattr */
9495    0,                  /* tp_setattr */
9496    0,                  /* tp_reserved */
9497    unicode_repr,           /* tp_repr */
9498    &unicode_as_number,         /* tp_as_number */
9499    &unicode_as_sequence,       /* tp_as_sequence */
9500    &unicode_as_mapping,        /* tp_as_mapping */
9501    (hashfunc) unicode_hash,        /* tp_hash*/
9502    0,                  /* tp_call*/
9503    (reprfunc) unicode_str,     /* tp_str */
9504    PyObject_GenericGetAttr,        /* tp_getattro */
9505    0,                  /* tp_setattro */
9506    0,                  /* tp_as_buffer */
9507    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
9508    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
9509    unicode_doc,            /* tp_doc */
9510    0,                  /* tp_traverse */
9511    0,                  /* tp_clear */
9512    PyUnicode_RichCompare,      /* tp_richcompare */
9513    0,                  /* tp_weaklistoffset */
9514    unicode_iter,           /* tp_iter */
9515    0,                  /* tp_iternext */
9516    unicode_methods,            /* tp_methods */
9517    0,                  /* tp_members */
9518    0,                  /* tp_getset */
9519    &PyBaseObject_Type,         /* tp_base */
9520    0,                  /* tp_dict */
9521    0,                  /* tp_descr_get */
9522    0,                  /* tp_descr_set */
9523    0,                  /* tp_dictoffset */
9524    0,                  /* tp_init */
9525    0,                  /* tp_alloc */
9526    unicode_new,            /* tp_new */
9527    PyObject_Del,           /* tp_free */
9528};
9529
9530/* Initialize the Unicode implementation */
9531
9532void _PyUnicode_Init(void)
9533{
9534    int i;
9535
9536    /* XXX - move this array to unicodectype.c ? */
9537    Py_UNICODE linebreak[] = {
9538        0x000A, /* LINE FEED */
9539        0x000D, /* CARRIAGE RETURN */
9540        0x001C, /* FILE SEPARATOR */
9541        0x001D, /* GROUP SEPARATOR */
9542        0x001E, /* RECORD SEPARATOR */
9543        0x0085, /* NEXT LINE */
9544        0x2028, /* LINE SEPARATOR */
9545        0x2029, /* PARAGRAPH SEPARATOR */
9546    };
9547
9548    /* Init the implementation */
9549    free_list = NULL;
9550    numfree = 0;
9551    unicode_empty = _PyUnicode_New(0);
9552    if (!unicode_empty)
9553        return;
9554
9555    for (i = 0; i < 256; i++)
9556        unicode_latin1[i] = NULL;
9557    if (PyType_Ready(&PyUnicode_Type) < 0)
9558        Py_FatalError("Can't initialize 'unicode'");
9559
9560    /* initialize the linebreak bloom filter */
9561    bloom_linebreak = make_bloom_mask(
9562        linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9563        );
9564
9565    PyType_Ready(&EncodingMapType);
9566}
9567
9568/* Finalize the Unicode implementation */
9569
9570int
9571PyUnicode_ClearFreeList(void)
9572{
9573    int freelist_size = numfree;
9574    PyUnicodeObject *u;
9575
9576    for (u = free_list; u != NULL;) {
9577        PyUnicodeObject *v = u;
9578        u = *(PyUnicodeObject **)u;
9579        if (v->str)
9580            PyObject_DEL(v->str);
9581        Py_XDECREF(v->defenc);
9582        PyObject_Del(v);
9583        numfree--;
9584    }
9585    free_list = NULL;
9586    assert(numfree == 0);
9587    return freelist_size;
9588}
9589
9590void
9591_PyUnicode_Fini(void)
9592{
9593    int i;
9594
9595    Py_XDECREF(unicode_empty);
9596    unicode_empty = NULL;
9597
9598    for (i = 0; i < 256; i++) {
9599        if (unicode_latin1[i]) {
9600            Py_DECREF(unicode_latin1[i]);
9601            unicode_latin1[i] = NULL;
9602        }
9603    }
9604    (void)PyUnicode_ClearFreeList();
9605}
9606
9607void
9608PyUnicode_InternInPlace(PyObject **p)
9609{
9610    register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9611    PyObject *t;
9612    if (s == NULL || !PyUnicode_Check(s))
9613        Py_FatalError(
9614            "PyUnicode_InternInPlace: unicode strings only please!");
9615    /* If it's a subclass, we don't really know what putting
9616       it in the interned dict might do. */
9617    if (!PyUnicode_CheckExact(s))
9618        return;
9619    if (PyUnicode_CHECK_INTERNED(s))
9620        return;
9621    if (interned == NULL) {
9622        interned = PyDict_New();
9623        if (interned == NULL) {
9624            PyErr_Clear(); /* Don't leave an exception */
9625            return;
9626        }
9627    }
9628    /* It might be that the GetItem call fails even
9629       though the key is present in the dictionary,
9630       namely when this happens during a stack overflow. */
9631    Py_ALLOW_RECURSION
9632        t = PyDict_GetItem(interned, (PyObject *)s);
9633    Py_END_ALLOW_RECURSION
9634
9635        if (t) {
9636            Py_INCREF(t);
9637            Py_DECREF(*p);
9638            *p = t;
9639            return;
9640        }
9641
9642    PyThreadState_GET()->recursion_critical = 1;
9643    if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9644        PyErr_Clear();
9645        PyThreadState_GET()->recursion_critical = 0;
9646        return;
9647    }
9648    PyThreadState_GET()->recursion_critical = 0;
9649    /* The two references in interned are not counted by refcnt.
9650       The deallocator will take care of this */
9651    Py_REFCNT(s) -= 2;
9652    PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
9653}
9654
9655void
9656PyUnicode_InternImmortal(PyObject **p)
9657{
9658    PyUnicode_InternInPlace(p);
9659    if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9660        PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9661        Py_INCREF(*p);
9662    }
9663}
9664
9665PyObject *
9666PyUnicode_InternFromString(const char *cp)
9667{
9668    PyObject *s = PyUnicode_FromString(cp);
9669    if (s == NULL)
9670        return NULL;
9671    PyUnicode_InternInPlace(&s);
9672    return s;
9673}
9674
9675void _Py_ReleaseInternedUnicodeStrings(void)
9676{
9677    PyObject *keys;
9678    PyUnicodeObject *s;
9679    Py_ssize_t i, n;
9680    Py_ssize_t immortal_size = 0, mortal_size = 0;
9681
9682    if (interned == NULL || !PyDict_Check(interned))
9683        return;
9684    keys = PyDict_Keys(interned);
9685    if (keys == NULL || !PyList_Check(keys)) {
9686        PyErr_Clear();
9687        return;
9688    }
9689
9690    /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9691       detector, interned unicode strings are not forcibly deallocated;
9692       rather, we give them their stolen references back, and then clear
9693       and DECREF the interned dict. */
9694
9695    n = PyList_GET_SIZE(keys);
9696    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9697            n);
9698    for (i = 0; i < n; i++) {
9699        s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9700        switch (s->state) {
9701        case SSTATE_NOT_INTERNED:
9702            /* XXX Shouldn't happen */
9703            break;
9704        case SSTATE_INTERNED_IMMORTAL:
9705            Py_REFCNT(s) += 1;
9706            immortal_size += s->length;
9707            break;
9708        case SSTATE_INTERNED_MORTAL:
9709            Py_REFCNT(s) += 2;
9710            mortal_size += s->length;
9711            break;
9712        default:
9713            Py_FatalError("Inconsistent interned string state.");
9714        }
9715        s->state = SSTATE_NOT_INTERNED;
9716    }
9717    fprintf(stderr, "total size of all interned strings: "
9718            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9719            "mortal/immortal\n", mortal_size, immortal_size);
9720    Py_DECREF(keys);
9721    PyDict_Clear(interned);
9722    Py_DECREF(interned);
9723    interned = NULL;
9724}
9725
9726
9727/********************* Unicode Iterator **************************/
9728
9729typedef struct {
9730    PyObject_HEAD
9731    Py_ssize_t it_index;
9732    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
9733} unicodeiterobject;
9734
9735static void
9736unicodeiter_dealloc(unicodeiterobject *it)
9737{
9738    _PyObject_GC_UNTRACK(it);
9739    Py_XDECREF(it->it_seq);
9740    PyObject_GC_Del(it);
9741}
9742
9743static int
9744unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9745{
9746    Py_VISIT(it->it_seq);
9747    return 0;
9748}
9749
9750static PyObject *
9751unicodeiter_next(unicodeiterobject *it)
9752{
9753    PyUnicodeObject *seq;
9754    PyObject *item;
9755
9756    assert(it != NULL);
9757    seq = it->it_seq;
9758    if (seq == NULL)
9759        return NULL;
9760    assert(PyUnicode_Check(seq));
9761
9762    if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9763        item = PyUnicode_FromUnicode(
9764            PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
9765        if (item != NULL)
9766            ++it->it_index;
9767        return item;
9768    }
9769
9770    Py_DECREF(seq);
9771    it->it_seq = NULL;
9772    return NULL;
9773}
9774
9775static PyObject *
9776unicodeiter_len(unicodeiterobject *it)
9777{
9778    Py_ssize_t len = 0;
9779    if (it->it_seq)
9780        len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9781    return PyLong_FromSsize_t(len);
9782}
9783
9784PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9785
9786static PyMethodDef unicodeiter_methods[] = {
9787    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
9788     length_hint_doc},
9789    {NULL,      NULL}       /* sentinel */
9790};
9791
9792PyTypeObject PyUnicodeIter_Type = {
9793    PyVarObject_HEAD_INIT(&PyType_Type, 0)
9794    "str_iterator",         /* tp_name */
9795    sizeof(unicodeiterobject),      /* tp_basicsize */
9796    0,                  /* tp_itemsize */
9797    /* methods */
9798    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
9799    0,                  /* tp_print */
9800    0,                  /* tp_getattr */
9801    0,                  /* tp_setattr */
9802    0,                  /* tp_reserved */
9803    0,                  /* tp_repr */
9804    0,                  /* tp_as_number */
9805    0,                  /* tp_as_sequence */
9806    0,                  /* tp_as_mapping */
9807    0,                  /* tp_hash */
9808    0,                  /* tp_call */
9809    0,                  /* tp_str */
9810    PyObject_GenericGetAttr,        /* tp_getattro */
9811    0,                  /* tp_setattro */
9812    0,                  /* tp_as_buffer */
9813    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9814    0,                  /* tp_doc */
9815    (traverseproc)unicodeiter_traverse, /* tp_traverse */
9816    0,                  /* tp_clear */
9817    0,                  /* tp_richcompare */
9818    0,                  /* tp_weaklistoffset */
9819    PyObject_SelfIter,          /* tp_iter */
9820    (iternextfunc)unicodeiter_next,     /* tp_iternext */
9821    unicodeiter_methods,            /* tp_methods */
9822    0,
9823};
9824
9825static PyObject *
9826unicode_iter(PyObject *seq)
9827{
9828    unicodeiterobject *it;
9829
9830    if (!PyUnicode_Check(seq)) {
9831        PyErr_BadInternalCall();
9832        return NULL;
9833    }
9834    it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9835    if (it == NULL)
9836        return NULL;
9837    it->it_index = 0;
9838    Py_INCREF(seq);
9839    it->it_seq = (PyUnicodeObject *)seq;
9840    _PyObject_GC_TRACK(it);
9841    return (PyObject *)it;
9842}
9843
9844size_t
9845Py_UNICODE_strlen(const Py_UNICODE *u)
9846{
9847    int res = 0;
9848    while(*u++)
9849        res++;
9850    return res;
9851}
9852
9853Py_UNICODE*
9854Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9855{
9856    Py_UNICODE *u = s1;
9857    while ((*u++ = *s2++));
9858    return s1;
9859}
9860
9861Py_UNICODE*
9862Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9863{
9864    Py_UNICODE *u = s1;
9865    while ((*u++ = *s2++))
9866        if (n-- == 0)
9867            break;
9868    return s1;
9869}
9870
9871int
9872Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9873{
9874    while (*s1 && *s2 && *s1 == *s2)
9875        s1++, s2++;
9876    if (*s1 && *s2)
9877        return (*s1 < *s2) ? -1 : +1;
9878    if (*s1)
9879        return 1;
9880    if (*s2)
9881        return -1;
9882    return 0;
9883}
9884
9885Py_UNICODE*
9886Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9887{
9888    const Py_UNICODE *p;
9889    for (p = s; *p; p++)
9890        if (*p == c)
9891            return (Py_UNICODE*)p;
9892    return NULL;
9893}
9894
9895
9896#ifdef __cplusplus
9897}
9898#endif
9899
9900
9901/*
9902  Local variables:
9903  c-basic-offset: 4
9904  indent-tabs-mode: nil
9905  End:
9906*/
9907