unicodeobject.c revision 28a6cfaefc41a4e4bfa6dd0b54318c0465987652
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5Unicode Integration Proposal (see file Misc/unicode.txt).
6
7Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
10Copyright (c) Corporation for National Research Initiatives.
11
12--------------------------------------------------------------------
13The original string type implementation is:
14
15  Copyright (c) 1999 by Secret Labs AB
16  Copyright (c) 1999 by Fredrik Lundh
17
18By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
41
42#define PY_SSIZE_T_CLEAN
43#include "Python.h"
44#include "ucnhash.h"
45
46#ifdef MS_WINDOWS
47#include <windows.h>
48#endif
49
/* Upper bound on the number of released Unicode objects that are kept
   on the free list for reuse. */

#define PyUnicode_MAXFREELIST       1024

/* Threshold for the free list "keep-alive" optimization.

   An object parked on the free list keeps its character buffer as long
   as its length is below this limit; objects whose length is at or
   above the limit have their buffer released when they are parked.
   Keeping the buffer saves a malloc() call when a small Unicode object
   is created from the free list.

   At worst this results in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   A limit of 0 effectively switches the optimization off, since every
   buffer is then released.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Byte order selection: little endian unless WORDS_BIGENDIAN is defined */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
80
81/* --- Globals ------------------------------------------------------------
82
83   The globals are initialized by the _PyUnicode_Init() API and should
84   not be used before calling that API.
85
86*/
87
88
89#ifdef __cplusplus
90extern "C" {
91#endif
92
93/* This dictionary holds all interned unicode strings.  Note that references
94   to strings in this dictionary are *not* counted in the string's ob_refcnt.
95   When the interned string reaches a refcnt of 0 the string deallocation
96   function will delete the reference from this dictionary.
97
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
100*/
101static PyObject *interned;
102
103/* Free list for Unicode objects */
104static PyUnicodeObject *free_list;
105static int numfree;
106
107/* The empty Unicode object is shared to improve performance. */
108static PyUnicodeObject *unicode_empty;
109
110/* Single character Unicode strings in the Latin-1 range are being
111   shared as well. */
112static PyUnicodeObject *unicode_latin1[256];
113
/* Fast detection of the most frequent whitespace characters.

   Lookup table for the ASCII range: entry i is 1 when code point i is
   treated as whitespace and 0 otherwise.  Each row below covers 16
   consecutive code points. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x7F: no whitespace */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
144
145static PyObject *unicode_encode_call_errorhandler(const char *errors,
146       PyObject **errorHandler,const char *encoding, const char *reason,
147       const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
148       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
149
150static void raise_encode_exception(PyObject **exceptionObject,
151                                   const char *encoding,
152                                   const Py_UNICODE *unicode, Py_ssize_t size,
153                                   Py_ssize_t startpos, Py_ssize_t endpos,
154                                   const char *reason);
155
/* Fast detection of line break characters in the ASCII range.

   Entry i is 1 when code point i is treated as a line break and 0
   otherwise.  Each row below covers 16 consecutive code points. */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x7F: no line breaks */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
183
184
185Py_UNICODE
186PyUnicode_GetMax(void)
187{
188#ifdef Py_UNICODE_WIDE
189    return 0x10FFFF;
190#else
191    /* This is actually an illegal character, so it should
192       not be passed to unichr. */
193    return 0xFFFF;
194#endif
195}
196
197/* --- Bloom Filters ----------------------------------------------------- */
198
/* Support for simple "Bloom filters" over Unicode characters.  To keep
   things simple, we use a single bitmask and take the low-order bits of
   each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit index. */
202
/* the linebreak mask is set up by _PyUnicode_Init() below */
204
205#if LONG_BIT >= 128
206#define BLOOM_WIDTH 128
207#elif LONG_BIT >= 64
208#define BLOOM_WIDTH 64
209#elif LONG_BIT >= 32
210#define BLOOM_WIDTH 32
211#else
212#error "LONG_BIT is smaller than 32"
213#endif
214
/* Integer type that holds a Bloom bitmask; BLOOM_WIDTH of its bits are
   used. */
#define BLOOM_MASK unsigned long

/* Bloom bitmask consulted by BLOOM_LINEBREAK() for non-ASCII characters.
   It is zero until code outside this block fills it in. */
static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD(mask, ch) sets, in the lvalue `mask`, the bit selected by the
   low-order bits of `ch`.
   BLOOM(mask, ch) is non-zero when that bit is set in `mask`, i.e. when
   `ch` may be a member; zero means `ch` is definitely not a member.

   `mask` is parenthesized in both expansions so that an expression
   argument (e.g. `a | b`) is combined as a whole rather than being
   re-associated by operator precedence. */
#define BLOOM_ADD(mask, ch) (((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     (((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Non-zero when `ch` is a line break.  Characters below 128 are looked up
   in ascii_linebreak[]; all others go through the Bloom mask first and
   are confirmed with Py_UNICODE_ISLINEBREAK().  `ch` is evaluated more
   than once, so it must not have side effects. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
225
226Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
227{
228    /* calculate simple bloom-style bitmask for a given unicode string */
229
230    BLOOM_MASK mask;
231    Py_ssize_t i;
232
233    mask = 0;
234    for (i = 0; i < len; i++)
235        BLOOM_ADD(mask, ptr[i]);
236
237    return mask;
238}
239
240Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
241{
242    Py_ssize_t i;
243
244    for (i = 0; i < setlen; i++)
245        if (set[i] == chr)
246            return 1;
247
248    return 0;
249}
250
/* Non-zero when `chr` is one of the `setlen` characters in `set`.  The
   Bloom mask is tested first so that the linear search in
   unicode_member() only runs for characters that might be members.

   The expansion is wrapped in parentheses so the macro behaves as a
   single operand (e.g. under `!` or `==`).  `chr` is evaluated up to
   twice, so it must not have side effects. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
253
254/* --- Unicode Object ----------------------------------------------------- */
255
256static
257int unicode_resize(register PyUnicodeObject *unicode,
258                   Py_ssize_t length)
259{
260    void *oldstr;
261
262    /* Shortcut if there's nothing much to do. */
263    if (unicode->length == length)
264        goto reset;
265
266    /* Resizing shared object (unicode_empty or single character
267       objects) in-place is not allowed. Use PyUnicode_Resize()
268       instead ! */
269
270    if (unicode == unicode_empty ||
271        (unicode->length == 1 &&
272         unicode->str[0] < 256U &&
273         unicode_latin1[unicode->str[0]] == unicode)) {
274        PyErr_SetString(PyExc_SystemError,
275                        "can't resize shared str objects");
276        return -1;
277    }
278
279    /* We allocate one more byte to make sure the string is Ux0000 terminated.
280       The overallocation is also used by fastsearch, which assumes that it's
281       safe to look at str[length] (without making any assumptions about what
282       it contains). */
283
284    oldstr = unicode->str;
285    unicode->str = PyObject_REALLOC(unicode->str,
286                                    sizeof(Py_UNICODE) * (length + 1));
287    if (!unicode->str) {
288        unicode->str = (Py_UNICODE *)oldstr;
289        PyErr_NoMemory();
290        return -1;
291    }
292    unicode->str[length] = 0;
293    unicode->length = length;
294
295  reset:
296    /* Reset the object caches */
297    if (unicode->defenc) {
298        Py_CLEAR(unicode->defenc);
299    }
300    unicode->hash = -1;
301
302    return 0;
303}
304
305/* We allocate one more byte to make sure the string is
306   Ux0000 terminated; some code (e.g. new_identifier)
307   relies on that.
308
309   XXX This allocator could further be enhanced by assuring that the
310   free list never reduces its size below 1.
311
312*/
313
314static
315PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
316{
317    register PyUnicodeObject *unicode;
318
319    /* Optimization for empty strings */
320    if (length == 0 && unicode_empty != NULL) {
321        Py_INCREF(unicode_empty);
322        return unicode_empty;
323    }
324
325    /* Ensure we won't overflow the size. */
326    if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
327        return (PyUnicodeObject *)PyErr_NoMemory();
328    }
329
330    /* Unicode freelist & memory allocation */
331    if (free_list) {
332        unicode = free_list;
333        free_list = *(PyUnicodeObject **)unicode;
334        numfree--;
335        if (unicode->str) {
336            /* Keep-Alive optimization: we only upsize the buffer,
337               never downsize it. */
338            if ((unicode->length < length) &&
339                unicode_resize(unicode, length) < 0) {
340                PyObject_DEL(unicode->str);
341                unicode->str = NULL;
342            }
343        }
344        else {
345            size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
346            unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
347        }
348        PyObject_INIT(unicode, &PyUnicode_Type);
349    }
350    else {
351        size_t new_size;
352        unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
353        if (unicode == NULL)
354            return NULL;
355        new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
356        unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
357    }
358
359    if (!unicode->str) {
360        PyErr_NoMemory();
361        goto onError;
362    }
363    /* Initialize the first element to guard against cases where
364     * the caller fails before initializing str -- unicode_resize()
365     * reads str[0], and the Keep-Alive optimization can keep memory
366     * allocated for str alive across a call to unicode_dealloc(unicode).
367     * We don't want unicode_resize to read uninitialized memory in
368     * that case.
369     */
370    unicode->str[0] = 0;
371    unicode->str[length] = 0;
372    unicode->length = length;
373    unicode->hash = -1;
374    unicode->state = 0;
375    unicode->defenc = NULL;
376    return unicode;
377
378  onError:
379    /* XXX UNREF/NEWREF interface should be more symmetrical */
380    _Py_DEC_REFTOTAL;
381    _Py_ForgetReference((PyObject *)unicode);
382    PyObject_Del(unicode);
383    return NULL;
384}
385
386static
387void unicode_dealloc(register PyUnicodeObject *unicode)
388{
389    switch (PyUnicode_CHECK_INTERNED(unicode)) {
390    case SSTATE_NOT_INTERNED:
391        break;
392
393    case SSTATE_INTERNED_MORTAL:
394        /* revive dead object temporarily for DelItem */
395        Py_REFCNT(unicode) = 3;
396        if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
397            Py_FatalError(
398                "deletion of interned string failed");
399        break;
400
401    case SSTATE_INTERNED_IMMORTAL:
402        Py_FatalError("Immortal interned string died.");
403
404    default:
405        Py_FatalError("Inconsistent interned string state.");
406    }
407
408    if (PyUnicode_CheckExact(unicode) &&
409        numfree < PyUnicode_MAXFREELIST) {
410        /* Keep-Alive optimization */
411        if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
412            PyObject_DEL(unicode->str);
413            unicode->str = NULL;
414            unicode->length = 0;
415        }
416        if (unicode->defenc) {
417            Py_CLEAR(unicode->defenc);
418        }
419        /* Add to free list */
420        *(PyUnicodeObject **)unicode = free_list;
421        free_list = unicode;
422        numfree++;
423    }
424    else {
425        PyObject_DEL(unicode->str);
426        Py_XDECREF(unicode->defenc);
427        Py_TYPE(unicode)->tp_free((PyObject *)unicode);
428    }
429}
430
431static
432int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
433{
434    register PyUnicodeObject *v;
435
436    /* Argument checks */
437    if (unicode == NULL) {
438        PyErr_BadInternalCall();
439        return -1;
440    }
441    v = *unicode;
442    if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
443        PyErr_BadInternalCall();
444        return -1;
445    }
446
447    /* Resizing unicode_empty and single character objects is not
448       possible since these are being shared. We simply return a fresh
449       copy with the same Unicode content. */
450    if (v->length != length &&
451        (v == unicode_empty || v->length == 1)) {
452        PyUnicodeObject *w = _PyUnicode_New(length);
453        if (w == NULL)
454            return -1;
455        Py_UNICODE_COPY(w->str, v->str,
456                        length < v->length ? length : v->length);
457        Py_DECREF(*unicode);
458        *unicode = w;
459        return 0;
460    }
461
462    /* Note that we don't have to modify *unicode for unshared Unicode
463       objects, since we can modify them in-place. */
464    return unicode_resize(v, length);
465}
466
467int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
468{
469    return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
470}
471
472PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
473                                Py_ssize_t size)
474{
475    PyUnicodeObject *unicode;
476
477    /* If the Unicode data is known at construction time, we can apply
478       some optimizations which share commonly used objects. */
479    if (u != NULL) {
480
481        /* Optimization for empty strings */
482        if (size == 0 && unicode_empty != NULL) {
483            Py_INCREF(unicode_empty);
484            return (PyObject *)unicode_empty;
485        }
486
487        /* Single character Unicode objects in the Latin-1 range are
488           shared when using this constructor */
489        if (size == 1 && *u < 256) {
490            unicode = unicode_latin1[*u];
491            if (!unicode) {
492                unicode = _PyUnicode_New(1);
493                if (!unicode)
494                    return NULL;
495                unicode->str[0] = *u;
496                unicode_latin1[*u] = unicode;
497            }
498            Py_INCREF(unicode);
499            return (PyObject *)unicode;
500        }
501    }
502
503    unicode = _PyUnicode_New(size);
504    if (!unicode)
505        return NULL;
506
507    /* Copy the Unicode data into the new object */
508    if (u != NULL)
509        Py_UNICODE_COPY(unicode->str, u, size);
510
511    return (PyObject *)unicode;
512}
513
514PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
515{
516    PyUnicodeObject *unicode;
517
518    if (size < 0) {
519        PyErr_SetString(PyExc_SystemError,
520                        "Negative size passed to PyUnicode_FromStringAndSize");
521        return NULL;
522    }
523
524    /* If the Unicode data is known at construction time, we can apply
525       some optimizations which share commonly used objects.
526       Also, this means the input must be UTF-8, so fall back to the
527       UTF-8 decoder at the end. */
528    if (u != NULL) {
529
530        /* Optimization for empty strings */
531        if (size == 0 && unicode_empty != NULL) {
532            Py_INCREF(unicode_empty);
533            return (PyObject *)unicode_empty;
534        }
535
536        /* Single characters are shared when using this constructor.
537           Restrict to ASCII, since the input must be UTF-8. */
538        if (size == 1 && Py_CHARMASK(*u) < 128) {
539            unicode = unicode_latin1[Py_CHARMASK(*u)];
540            if (!unicode) {
541                unicode = _PyUnicode_New(1);
542                if (!unicode)
543                    return NULL;
544                unicode->str[0] = Py_CHARMASK(*u);
545                unicode_latin1[Py_CHARMASK(*u)] = unicode;
546            }
547            Py_INCREF(unicode);
548            return (PyObject *)unicode;
549        }
550
551        return PyUnicode_DecodeUTF8(u, size, NULL);
552    }
553
554    unicode = _PyUnicode_New(size);
555    if (!unicode)
556        return NULL;
557
558    return (PyObject *)unicode;
559}
560
561PyObject *PyUnicode_FromString(const char *u)
562{
563    size_t size = strlen(u);
564    if (size > PY_SSIZE_T_MAX) {
565        PyErr_SetString(PyExc_OverflowError, "input too long");
566        return NULL;
567    }
568
569    return PyUnicode_FromStringAndSize(u, size);
570}
571
572#ifdef HAVE_WCHAR_H
573
574#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
575# define CONVERT_WCHAR_TO_SURROGATES
576#endif
577
578#ifdef CONVERT_WCHAR_TO_SURROGATES
579
580/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
581   to convert from UTF32 to UTF16. */
582
583PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
584                                 Py_ssize_t size)
585{
586    PyUnicodeObject *unicode;
587    register Py_ssize_t i;
588    Py_ssize_t alloc;
589    const wchar_t *orig_w;
590
591    if (w == NULL) {
592        if (size == 0)
593            return PyUnicode_FromStringAndSize(NULL, 0);
594        PyErr_BadInternalCall();
595        return NULL;
596    }
597
598    if (size == -1) {
599        size = wcslen(w);
600    }
601
602    alloc = size;
603    orig_w = w;
604    for (i = size; i > 0; i--) {
605        if (*w > 0xFFFF)
606            alloc++;
607        w++;
608    }
609    w = orig_w;
610    unicode = _PyUnicode_New(alloc);
611    if (!unicode)
612        return NULL;
613
614    /* Copy the wchar_t data into the new object */
615    {
616        register Py_UNICODE *u;
617        u = PyUnicode_AS_UNICODE(unicode);
618        for (i = size; i > 0; i--) {
619            if (*w > 0xFFFF) {
620                wchar_t ordinal = *w++;
621                ordinal -= 0x10000;
622                *u++ = 0xD800 | (ordinal >> 10);
623                *u++ = 0xDC00 | (ordinal & 0x3FF);
624            }
625            else
626                *u++ = *w++;
627        }
628    }
629    return (PyObject *)unicode;
630}
631
632#else
633
634PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
635                                 Py_ssize_t size)
636{
637    PyUnicodeObject *unicode;
638
639    if (w == NULL) {
640        if (size == 0)
641            return PyUnicode_FromStringAndSize(NULL, 0);
642        PyErr_BadInternalCall();
643        return NULL;
644    }
645
646    if (size == -1) {
647        size = wcslen(w);
648    }
649
650    unicode = _PyUnicode_New(size);
651    if (!unicode)
652        return NULL;
653
654    /* Copy the wchar_t data into the new object */
655#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
656    memcpy(unicode->str, w, size * sizeof(wchar_t));
657#else
658    {
659        register Py_UNICODE *u;
660        register Py_ssize_t i;
661        u = PyUnicode_AS_UNICODE(unicode);
662        for (i = size; i > 0; i--)
663            *u++ = *w++;
664    }
665#endif
666
667    return (PyObject *)unicode;
668}
669
670#endif /* CONVERT_WCHAR_TO_SURROGATES */
671
672#undef CONVERT_WCHAR_TO_SURROGATES
673
674static void
675makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
676        int zeropad, int width, int precision, char c)
677{
678    *fmt++ = '%';
679    if (width) {
680        if (zeropad)
681            *fmt++ = '0';
682        fmt += sprintf(fmt, "%d", width);
683    }
684    if (precision)
685        fmt += sprintf(fmt, ".%d", precision);
686    if (longflag)
687        *fmt++ = 'l';
688    else if (longlongflag) {
689        /* longlongflag should only ever be nonzero on machines with
690           HAVE_LONG_LONG defined */
691#ifdef HAVE_LONG_LONG
692        char *f = PY_FORMAT_LONG_LONG;
693        while (*f)
694            *fmt++ = *f++;
695#else
696        /* we shouldn't ever get here */
697        assert(0);
698        *fmt++ = 'l';
699#endif
700    }
701    else if (size_tflag) {
702        char *f = PY_FORMAT_SIZE_T;
703        while (*f)
704            *fmt++ = *f++;
705    }
706    *fmt++ = c;
707    *fmt = '\0';
708}
709
/* Append the NUL-terminated string `string` at the output position `s`,
   one element per character.  The macro relies on the variables `copy`
   and `s` being declared where it is expanded; `s` is left pointing
   just past the last element written. */
#define appendstring(string) {copy = string; while (*copy) *s++ = *copy++;}

/* size of the fixed-size buffer used for formatting single arguments
   (one more byte is reserved for the trailing null byte) */
#define ITEM_BUFFER_LEN 21
/* maximum number of characters required for output of %ld.  21 characters
   allows for 64-bit integers (in decimal) and an optional sign. */
#define MAX_LONG_CHARS 21
/* maximum number of characters required for output of %lld.
   At most ceil(log10(256)*SIZEOF_LONG_LONG) digits are needed,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
721
722PyObject *
723PyUnicode_FromFormatV(const char *format, va_list vargs)
724{
725    va_list count;
726    Py_ssize_t callcount = 0;
727    PyObject **callresults = NULL;
728    PyObject **callresult = NULL;
729    Py_ssize_t n = 0;
730    int width = 0;
731    int precision = 0;
732    int zeropad;
733    const char* f;
734    Py_UNICODE *s;
735    PyObject *string;
736    /* used by sprintf */
737    char buffer[ITEM_BUFFER_LEN+1];
738    /* use abuffer instead of buffer, if we need more space
739     * (which can happen if there's a format specifier with width). */
740    char *abuffer = NULL;
741    char *realbuffer;
742    Py_ssize_t abuffersize = 0;
743    char fmt[61]; /* should be enough for %0width.precisionlld */
744    const char *copy;
745
746    Py_VA_COPY(count, vargs);
747    /* step 1: count the number of %S/%R/%A/%s format specifications
748     * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
749     * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
750     * result in an array) */
751    for (f = format; *f; f++) {
752         if (*f == '%') {
753             if (*(f+1)=='%')
754                 continue;
755             if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A' || *(f+1) == 'V')
756                 ++callcount;
757             while (Py_ISDIGIT((unsigned)*f))
758                 width = (width*10) + *f++ - '0';
759             while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
760                 ;
761             if (*f == 's')
762                 ++callcount;
763         }
764         else if (128 <= (unsigned char)*f) {
765             PyErr_Format(PyExc_ValueError,
766                "PyUnicode_FromFormatV() expects an ASCII-encoded format "
767                "string, got a non-ASCII byte: 0x%02x",
768                (unsigned char)*f);
769             return NULL;
770         }
771    }
772    /* step 2: allocate memory for the results of
773     * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
774    if (callcount) {
775        callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
776        if (!callresults) {
777            PyErr_NoMemory();
778            return NULL;
779        }
780        callresult = callresults;
781    }
782    /* step 3: figure out how large a buffer we need */
783    for (f = format; *f; f++) {
784        if (*f == '%') {
785#ifdef HAVE_LONG_LONG
786            int longlongflag = 0;
787#endif
788            const char* p = f;
789            width = 0;
790            while (Py_ISDIGIT((unsigned)*f))
791                width = (width*10) + *f++ - '0';
792            while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
793                ;
794
795            /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
796             * they don't affect the amount of space we reserve.
797             */
798            if (*f == 'l') {
799                if (f[1] == 'd' || f[1] == 'u') {
800                    ++f;
801                }
802#ifdef HAVE_LONG_LONG
803                else if (f[1] == 'l' &&
804                         (f[2] == 'd' || f[2] == 'u')) {
805                    longlongflag = 1;
806                    f += 2;
807                }
808#endif
809            }
810            else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
811                ++f;
812            }
813
814            switch (*f) {
815            case 'c':
816            {
817#ifndef Py_UNICODE_WIDE
818                int ordinal = va_arg(count, int);
819                if (ordinal > 0xffff)
820                    n += 2;
821                else
822                    n++;
823#else
824                (void)va_arg(count, int);
825                n++;
826#endif
827                break;
828            }
829            case '%':
830                n++;
831                break;
832            case 'd': case 'u': case 'i': case 'x':
833                (void) va_arg(count, int);
834#ifdef HAVE_LONG_LONG
835                if (longlongflag) {
836                    if (width < MAX_LONG_LONG_CHARS)
837                        width = MAX_LONG_LONG_CHARS;
838                }
839                else
840#endif
841                    /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
842                       including sign.  Decimal takes the most space.  This
843                       isn't enough for octal.  If a width is specified we
844                       need more (which we allocate later). */
845                    if (width < MAX_LONG_CHARS)
846                        width = MAX_LONG_CHARS;
847                n += width;
848                /* XXX should allow for large precision here too. */
849                if (abuffersize < width)
850                    abuffersize = width;
851                break;
852            case 's':
853            {
854                /* UTF-8 */
855                const char *s = va_arg(count, const char*);
856                PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
857                if (!str)
858                    goto fail;
859                n += PyUnicode_GET_SIZE(str);
860                /* Remember the str and switch to the next slot */
861                *callresult++ = str;
862                break;
863            }
864            case 'U':
865            {
866                PyObject *obj = va_arg(count, PyObject *);
867                assert(obj && PyUnicode_Check(obj));
868                n += PyUnicode_GET_SIZE(obj);
869                break;
870            }
871            case 'V':
872            {
873                PyObject *obj = va_arg(count, PyObject *);
874                const char *str = va_arg(count, const char *);
875                PyObject *str_obj;
876                assert(obj || str);
877                assert(!obj || PyUnicode_Check(obj));
878                if (obj) {
879                    n += PyUnicode_GET_SIZE(obj);
880                    *callresult++ = NULL;
881                }
882                else {
883                    str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
884                    if (!str_obj)
885                        goto fail;
886                    n += PyUnicode_GET_SIZE(str_obj);
887                    *callresult++ = str_obj;
888                }
889                break;
890            }
891            case 'S':
892            {
893                PyObject *obj = va_arg(count, PyObject *);
894                PyObject *str;
895                assert(obj);
896                str = PyObject_Str(obj);
897                if (!str)
898                    goto fail;
899                n += PyUnicode_GET_SIZE(str);
900                /* Remember the str and switch to the next slot */
901                *callresult++ = str;
902                break;
903            }
904            case 'R':
905            {
906                PyObject *obj = va_arg(count, PyObject *);
907                PyObject *repr;
908                assert(obj);
909                repr = PyObject_Repr(obj);
910                if (!repr)
911                    goto fail;
912                n += PyUnicode_GET_SIZE(repr);
913                /* Remember the repr and switch to the next slot */
914                *callresult++ = repr;
915                break;
916            }
917            case 'A':
918            {
919                PyObject *obj = va_arg(count, PyObject *);
920                PyObject *ascii;
921                assert(obj);
922                ascii = PyObject_ASCII(obj);
923                if (!ascii)
924                    goto fail;
925                n += PyUnicode_GET_SIZE(ascii);
926                /* Remember the repr and switch to the next slot */
927                *callresult++ = ascii;
928                break;
929            }
930            case 'p':
931                (void) va_arg(count, int);
932                /* maximum 64-bit pointer representation:
933                 * 0xffffffffffffffff
934                 * so 19 characters is enough.
935                 * XXX I count 18 -- what's the extra for?
936                 */
937                n += 19;
938                break;
939            default:
940                /* if we stumble upon an unknown
941                   formatting code, copy the rest of
942                   the format string to the output
943                   string. (we cannot just skip the
944                   code, since there's no way to know
945                   what's in the argument list) */
946                n += strlen(p);
947                goto expand;
948            }
949        } else
950            n++;
951    }
952  expand:
953    if (abuffersize > ITEM_BUFFER_LEN) {
954        /* add 1 for sprintf's trailing null byte */
955        abuffer = PyObject_Malloc(abuffersize + 1);
956        if (!abuffer) {
957            PyErr_NoMemory();
958            goto fail;
959        }
960        realbuffer = abuffer;
961    }
962    else
963        realbuffer = buffer;
964    /* step 4: fill the buffer */
965    /* Since we've analyzed how much space we need for the worst case,
966       we don't have to resize the string.
967       There can be no errors beyond this point. */
968    string = PyUnicode_FromUnicode(NULL, n);
969    if (!string)
970        goto fail;
971
972    s = PyUnicode_AS_UNICODE(string);
973    callresult = callresults;
974
975    for (f = format; *f; f++) {
976        if (*f == '%') {
977            const char* p = f++;
978            int longflag = 0;
979            int longlongflag = 0;
980            int size_tflag = 0;
981            zeropad = (*f == '0');
982            /* parse the width.precision part */
983            width = 0;
984            while (Py_ISDIGIT((unsigned)*f))
985                width = (width*10) + *f++ - '0';
986            precision = 0;
987            if (*f == '.') {
988                f++;
989                while (Py_ISDIGIT((unsigned)*f))
990                    precision = (precision*10) + *f++ - '0';
991            }
992            /* Handle %ld, %lu, %lld and %llu. */
993            if (*f == 'l') {
994                if (f[1] == 'd' || f[1] == 'u') {
995                    longflag = 1;
996                    ++f;
997                }
998#ifdef HAVE_LONG_LONG
999                else if (f[1] == 'l' &&
1000                         (f[2] == 'd' || f[2] == 'u')) {
1001                    longlongflag = 1;
1002                    f += 2;
1003                }
1004#endif
1005            }
1006            /* handle the size_t flag. */
1007            if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
1008                size_tflag = 1;
1009                ++f;
1010            }
1011
1012            switch (*f) {
1013            case 'c':
1014            {
1015                int ordinal = va_arg(vargs, int);
1016#ifndef Py_UNICODE_WIDE
1017                if (ordinal > 0xffff) {
1018                    ordinal -= 0x10000;
1019                    *s++ = 0xD800 | (ordinal >> 10);
1020                    *s++ = 0xDC00 | (ordinal & 0x3FF);
1021                } else
1022#endif
1023                *s++ = ordinal;
1024                break;
1025            }
1026            case 'd':
1027                makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1028                        width, precision, 'd');
1029                if (longflag)
1030                    sprintf(realbuffer, fmt, va_arg(vargs, long));
1031#ifdef HAVE_LONG_LONG
1032                else if (longlongflag)
1033                    sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1034#endif
1035                else if (size_tflag)
1036                    sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1037                else
1038                    sprintf(realbuffer, fmt, va_arg(vargs, int));
1039                appendstring(realbuffer);
1040                break;
1041            case 'u':
1042                makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1043                        width, precision, 'u');
1044                if (longflag)
1045                    sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
1046#ifdef HAVE_LONG_LONG
1047                else if (longlongflag)
1048                    sprintf(realbuffer, fmt, va_arg(vargs,
1049                                                    unsigned PY_LONG_LONG));
1050#endif
1051                else if (size_tflag)
1052                    sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1053                else
1054                    sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1055                appendstring(realbuffer);
1056                break;
1057            case 'i':
1058                makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
1059                sprintf(realbuffer, fmt, va_arg(vargs, int));
1060                appendstring(realbuffer);
1061                break;
1062            case 'x':
1063                makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
1064                sprintf(realbuffer, fmt, va_arg(vargs, int));
1065                appendstring(realbuffer);
1066                break;
1067            case 's':
1068            {
1069                /* unused, since we already have the result */
1070                (void) va_arg(vargs, char *);
1071                Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1072                                PyUnicode_GET_SIZE(*callresult));
1073                s += PyUnicode_GET_SIZE(*callresult);
1074                /* We're done with the unicode()/repr() => forget it */
1075                Py_DECREF(*callresult);
1076                /* switch to next unicode()/repr() result */
1077                ++callresult;
1078                break;
1079            }
1080            case 'U':
1081            {
1082                PyObject *obj = va_arg(vargs, PyObject *);
1083                Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1084                Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1085                s += size;
1086                break;
1087            }
1088            case 'V':
1089            {
1090                PyObject *obj = va_arg(vargs, PyObject *);
1091                va_arg(vargs, const char *);
1092                if (obj) {
1093                    Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1094                    Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1095                    s += size;
1096                } else {
1097                    Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1098                                    PyUnicode_GET_SIZE(*callresult));
1099                    s += PyUnicode_GET_SIZE(*callresult);
1100                    Py_DECREF(*callresult);
1101                }
1102                ++callresult;
1103                break;
1104            }
1105            case 'S':
1106            case 'R':
1107            case 'A':
1108            {
1109                Py_UNICODE *ucopy;
1110                Py_ssize_t usize;
1111                Py_ssize_t upos;
1112                /* unused, since we already have the result */
1113                (void) va_arg(vargs, PyObject *);
1114                ucopy = PyUnicode_AS_UNICODE(*callresult);
1115                usize = PyUnicode_GET_SIZE(*callresult);
1116                for (upos = 0; upos<usize;)
1117                    *s++ = ucopy[upos++];
1118                /* We're done with the unicode()/repr() => forget it */
1119                Py_DECREF(*callresult);
1120                /* switch to next unicode()/repr() result */
1121                ++callresult;
1122                break;
1123            }
1124            case 'p':
1125                sprintf(buffer, "%p", va_arg(vargs, void*));
1126                /* %p is ill-defined:  ensure leading 0x. */
1127                if (buffer[1] == 'X')
1128                    buffer[1] = 'x';
1129                else if (buffer[1] != 'x') {
1130                    memmove(buffer+2, buffer, strlen(buffer)+1);
1131                    buffer[0] = '0';
1132                    buffer[1] = 'x';
1133                }
1134                appendstring(buffer);
1135                break;
1136            case '%':
1137                *s++ = '%';
1138                break;
1139            default:
1140                appendstring(p);
1141                goto end;
1142            }
1143        }
1144        else
1145            *s++ = *f;
1146    }
1147
1148  end:
1149    if (callresults)
1150        PyObject_Free(callresults);
1151    if (abuffer)
1152        PyObject_Free(abuffer);
1153    PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1154    return string;
1155  fail:
1156    if (callresults) {
1157        PyObject **callresult2 = callresults;
1158        while (callresult2 < callresult) {
1159            Py_XDECREF(*callresult2);
1160            ++callresult2;
1161        }
1162        PyObject_Free(callresults);
1163    }
1164    if (abuffer)
1165        PyObject_Free(abuffer);
1166    return NULL;
1167}
1168
1169#undef appendstring
1170
1171PyObject *
1172PyUnicode_FromFormat(const char *format, ...)
1173{
1174    PyObject* ret;
1175    va_list vargs;
1176
1177#ifdef HAVE_STDARG_PROTOTYPES
1178    va_start(vargs, format);
1179#else
1180    va_start(vargs);
1181#endif
1182    ret = PyUnicode_FromFormatV(format, vargs);
1183    va_end(vargs);
1184    return ret;
1185}
1186
1187/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1188   convert a Unicode object to a wide character string.
1189
1190   - If w is NULL: return the number of wide characters (including the null
1191     character) required to convert the unicode object. Ignore size argument.
1192
1193   - Otherwise: return the number of wide characters (excluding the null
1194     character) written into w. Write at most size wide characters (including
1195     the null character). */
1196static Py_ssize_t
1197unicode_aswidechar(PyUnicodeObject *unicode,
1198                   wchar_t *w,
1199                   Py_ssize_t size)
1200{
1201#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
1202    Py_ssize_t res;
1203    if (w != NULL) {
1204        res = PyUnicode_GET_SIZE(unicode);
1205        if (size > res)
1206            size = res + 1;
1207        else
1208            res = size;
1209        memcpy(w, unicode->str, size * sizeof(wchar_t));
1210        return res;
1211    }
1212    else
1213        return PyUnicode_GET_SIZE(unicode) + 1;
1214#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4
1215    register const Py_UNICODE *u;
1216    const Py_UNICODE *uend;
1217    const wchar_t *worig, *wend;
1218    Py_ssize_t nchar;
1219
1220    u = PyUnicode_AS_UNICODE(unicode);
1221    uend = u + PyUnicode_GET_SIZE(unicode);
1222    if (w != NULL) {
1223        worig = w;
1224        wend = w + size;
1225        while (u != uend && w != wend) {
1226            if (0xD800 <= u[0] && u[0] <= 0xDBFF
1227                && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1228            {
1229                *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000;
1230                u += 2;
1231            }
1232            else {
1233                *w = *u;
1234                u++;
1235            }
1236            w++;
1237        }
1238        if (w != wend)
1239            *w = L'\0';
1240        return w - worig;
1241    }
1242    else {
1243        nchar = 1; /* null character at the end */
1244        while (u != uend) {
1245            if (0xD800 <= u[0] && u[0] <= 0xDBFF
1246                && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1247                u += 2;
1248            else
1249                u++;
1250            nchar++;
1251        }
1252    }
1253    return nchar;
1254#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2
1255    register Py_UNICODE *u, *uend, ordinal;
1256    register Py_ssize_t i;
1257    wchar_t *worig, *wend;
1258    Py_ssize_t nchar;
1259
1260    u = PyUnicode_AS_UNICODE(unicode);
1261    uend = u + PyUnicode_GET_SIZE(u);
1262    if (w != NULL) {
1263        worig = w;
1264        wend = w + size;
1265        while (u != uend && w != wend) {
1266            ordinal = *u;
1267            if (ordinal > 0xffff) {
1268                ordinal -= 0x10000;
1269                *w++ = 0xD800 | (ordinal >> 10);
1270                *w++ = 0xDC00 | (ordinal & 0x3FF);
1271            }
1272            else
1273                *w++ = ordinal;
1274            u++;
1275        }
1276        if (w != wend)
1277            *w = 0;
1278        return w - worig;
1279    }
1280    else {
1281        nchar = 1; /* null character */
1282        while (u != uend) {
1283            if (*u > 0xffff)
1284                nchar += 2;
1285            else
1286                nchar++;
1287            u++;
1288        }
1289        return nchar;
1290    }
1291#else
1292#  error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670"
1293#endif
1294}
1295
1296Py_ssize_t
1297PyUnicode_AsWideChar(PyObject *unicode,
1298                     wchar_t *w,
1299                     Py_ssize_t size)
1300{
1301    if (unicode == NULL) {
1302        PyErr_BadInternalCall();
1303        return -1;
1304    }
1305    return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
1306}
1307
1308wchar_t*
1309PyUnicode_AsWideCharString(PyObject *unicode,
1310                           Py_ssize_t *size)
1311{
1312    wchar_t* buffer;
1313    Py_ssize_t buflen;
1314
1315    if (unicode == NULL) {
1316        PyErr_BadInternalCall();
1317        return NULL;
1318    }
1319
1320    buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
1321    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
1322        PyErr_NoMemory();
1323        return NULL;
1324    }
1325
1326    buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1327    if (buffer == NULL) {
1328        PyErr_NoMemory();
1329        return NULL;
1330    }
1331    buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
1332    if (size != NULL)
1333        *size = buflen;
1334    return buffer;
1335}
1336
1337#endif
1338
1339PyObject *PyUnicode_FromOrdinal(int ordinal)
1340{
1341    Py_UNICODE s[2];
1342
1343    if (ordinal < 0 || ordinal > 0x10ffff) {
1344        PyErr_SetString(PyExc_ValueError,
1345                        "chr() arg not in range(0x110000)");
1346        return NULL;
1347    }
1348
1349#ifndef Py_UNICODE_WIDE
1350    if (ordinal > 0xffff) {
1351        ordinal -= 0x10000;
1352        s[0] = 0xD800 | (ordinal >> 10);
1353        s[1] = 0xDC00 | (ordinal & 0x3FF);
1354        return PyUnicode_FromUnicode(s, 2);
1355    }
1356#endif
1357
1358    s[0] = (Py_UNICODE)ordinal;
1359    return PyUnicode_FromUnicode(s, 1);
1360}
1361
1362PyObject *PyUnicode_FromObject(register PyObject *obj)
1363{
1364    /* XXX Perhaps we should make this API an alias of
1365       PyObject_Str() instead ?! */
1366    if (PyUnicode_CheckExact(obj)) {
1367        Py_INCREF(obj);
1368        return obj;
1369    }
1370    if (PyUnicode_Check(obj)) {
1371        /* For a Unicode subtype that's not a Unicode object,
1372           return a true Unicode object with the same data. */
1373        return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1374                                     PyUnicode_GET_SIZE(obj));
1375    }
1376    PyErr_Format(PyExc_TypeError,
1377                 "Can't convert '%.100s' object to str implicitly",
1378                 Py_TYPE(obj)->tp_name);
1379    return NULL;
1380}
1381
1382PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1383                                      const char *encoding,
1384                                      const char *errors)
1385{
1386    Py_buffer buffer;
1387    PyObject *v;
1388
1389    if (obj == NULL) {
1390        PyErr_BadInternalCall();
1391        return NULL;
1392    }
1393
1394    /* Decoding bytes objects is the most common case and should be fast */
1395    if (PyBytes_Check(obj)) {
1396        if (PyBytes_GET_SIZE(obj) == 0) {
1397            Py_INCREF(unicode_empty);
1398            v = (PyObject *) unicode_empty;
1399        }
1400        else {
1401            v = PyUnicode_Decode(
1402                    PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1403                    encoding, errors);
1404        }
1405        return v;
1406    }
1407
1408    if (PyUnicode_Check(obj)) {
1409        PyErr_SetString(PyExc_TypeError,
1410                        "decoding str is not supported");
1411        return NULL;
1412    }
1413
1414    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1415    if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1416        PyErr_Format(PyExc_TypeError,
1417                     "coercing to str: need bytes, bytearray "
1418                     "or buffer-like object, %.80s found",
1419                     Py_TYPE(obj)->tp_name);
1420        return NULL;
1421    }
1422
1423    if (buffer.len == 0) {
1424        Py_INCREF(unicode_empty);
1425        v = (PyObject *) unicode_empty;
1426    }
1427    else
1428        v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
1429
1430    PyBuffer_Release(&buffer);
1431    return v;
1432}
1433
/* Write a normalized copy of encoding into lower: upper-case characters
   (per Py_ISUPPER) become lower case and '_' becomes '-', so that e.g.
   UTF_8 is recognized.  lower_len is the capacity of lower, including room
   for the terminating null.

   Return 1 on success, 0 if encoding does not fit (it is longer than
   lower_len-1); in the latter case lower is left without a terminator. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* Last usable slot; reserved for the terminating null. */
    char *const limit = lower + (lower_len - 1);

    for (src = encoding; *src != '\0'; src++) {
        char ch = *src;

        if (dst == limit)
            return 0;

        if (ch == '_')
            ch = '-';
        else if (Py_ISUPPER(ch))
            ch = Py_TOLOWER(ch);
        *dst++ = ch;
    }
    *dst = '\0';
    return 1;
}
1466
1467PyObject *PyUnicode_Decode(const char *s,
1468                           Py_ssize_t size,
1469                           const char *encoding,
1470                           const char *errors)
1471{
1472    PyObject *buffer = NULL, *unicode;
1473    Py_buffer info;
1474    char lower[11];  /* Enough for any encoding shortcut */
1475
1476    if (encoding == NULL)
1477        encoding = PyUnicode_GetDefaultEncoding();
1478
1479    /* Shortcuts for common default encodings */
1480    if (normalize_encoding(encoding, lower, sizeof(lower))) {
1481        if (strcmp(lower, "utf-8") == 0)
1482            return PyUnicode_DecodeUTF8(s, size, errors);
1483        else if ((strcmp(lower, "latin-1") == 0) ||
1484                 (strcmp(lower, "iso-8859-1") == 0))
1485            return PyUnicode_DecodeLatin1(s, size, errors);
1486#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1487        else if (strcmp(lower, "mbcs") == 0)
1488            return PyUnicode_DecodeMBCS(s, size, errors);
1489#endif
1490        else if (strcmp(lower, "ascii") == 0)
1491            return PyUnicode_DecodeASCII(s, size, errors);
1492        else if (strcmp(lower, "utf-16") == 0)
1493            return PyUnicode_DecodeUTF16(s, size, errors, 0);
1494        else if (strcmp(lower, "utf-32") == 0)
1495            return PyUnicode_DecodeUTF32(s, size, errors, 0);
1496    }
1497
1498    /* Decode via the codec registry */
1499    buffer = NULL;
1500    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
1501        goto onError;
1502    buffer = PyMemoryView_FromBuffer(&info);
1503    if (buffer == NULL)
1504        goto onError;
1505    unicode = PyCodec_Decode(buffer, encoding, errors);
1506    if (unicode == NULL)
1507        goto onError;
1508    if (!PyUnicode_Check(unicode)) {
1509        PyErr_Format(PyExc_TypeError,
1510                     "decoder did not return a str object (type=%.400s)",
1511                     Py_TYPE(unicode)->tp_name);
1512        Py_DECREF(unicode);
1513        goto onError;
1514    }
1515    Py_DECREF(buffer);
1516    return unicode;
1517
1518  onError:
1519    Py_XDECREF(buffer);
1520    return NULL;
1521}
1522
1523PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1524                                    const char *encoding,
1525                                    const char *errors)
1526{
1527    PyObject *v;
1528
1529    if (!PyUnicode_Check(unicode)) {
1530        PyErr_BadArgument();
1531        goto onError;
1532    }
1533
1534    if (encoding == NULL)
1535        encoding = PyUnicode_GetDefaultEncoding();
1536
1537    /* Decode via the codec registry */
1538    v = PyCodec_Decode(unicode, encoding, errors);
1539    if (v == NULL)
1540        goto onError;
1541    return v;
1542
1543  onError:
1544    return NULL;
1545}
1546
1547PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1548                                     const char *encoding,
1549                                     const char *errors)
1550{
1551    PyObject *v;
1552
1553    if (!PyUnicode_Check(unicode)) {
1554        PyErr_BadArgument();
1555        goto onError;
1556    }
1557
1558    if (encoding == NULL)
1559        encoding = PyUnicode_GetDefaultEncoding();
1560
1561    /* Decode via the codec registry */
1562    v = PyCodec_Decode(unicode, encoding, errors);
1563    if (v == NULL)
1564        goto onError;
1565    if (!PyUnicode_Check(v)) {
1566        PyErr_Format(PyExc_TypeError,
1567                     "decoder did not return a str object (type=%.400s)",
1568                     Py_TYPE(v)->tp_name);
1569        Py_DECREF(v);
1570        goto onError;
1571    }
1572    return v;
1573
1574  onError:
1575    return NULL;
1576}
1577
1578PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1579                           Py_ssize_t size,
1580                           const char *encoding,
1581                           const char *errors)
1582{
1583    PyObject *v, *unicode;
1584
1585    unicode = PyUnicode_FromUnicode(s, size);
1586    if (unicode == NULL)
1587        return NULL;
1588    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1589    Py_DECREF(unicode);
1590    return v;
1591}
1592
1593PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1594                                    const char *encoding,
1595                                    const char *errors)
1596{
1597    PyObject *v;
1598
1599    if (!PyUnicode_Check(unicode)) {
1600        PyErr_BadArgument();
1601        goto onError;
1602    }
1603
1604    if (encoding == NULL)
1605        encoding = PyUnicode_GetDefaultEncoding();
1606
1607    /* Encode via the codec registry */
1608    v = PyCodec_Encode(unicode, encoding, errors);
1609    if (v == NULL)
1610        goto onError;
1611    return v;
1612
1613  onError:
1614    return NULL;
1615}
1616
1617PyObject *
1618PyUnicode_EncodeFSDefault(PyObject *unicode)
1619{
1620#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1621    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1622                                PyUnicode_GET_SIZE(unicode),
1623                                NULL);
1624#elif defined(__APPLE__)
1625    return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1626                                PyUnicode_GET_SIZE(unicode),
1627                                "surrogateescape");
1628#else
1629    PyInterpreterState *interp = PyThreadState_GET()->interp;
1630    /* Bootstrap check: if the filesystem codec is implemented in Python, we
1631       cannot use it to encode and decode filenames before it is loaded. Load
1632       the Python codec requires to encode at least its own filename. Use the C
1633       version of the locale codec until the codec registry is initialized and
1634       the Python codec is loaded.
1635
1636       Py_FileSystemDefaultEncoding is shared between all interpreters, we
1637       cannot only rely on it: check also interp->fscodec_initialized for
1638       subinterpreters. */
1639    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
1640        return PyUnicode_AsEncodedString(unicode,
1641                                         Py_FileSystemDefaultEncoding,
1642                                         "surrogateescape");
1643    }
1644    else {
1645        /* locale encoding with surrogateescape */
1646        wchar_t *wchar;
1647        char *bytes;
1648        PyObject *bytes_obj;
1649        size_t error_pos;
1650
1651        wchar = PyUnicode_AsWideCharString(unicode, NULL);
1652        if (wchar == NULL)
1653            return NULL;
1654        bytes = _Py_wchar2char(wchar, &error_pos);
1655        if (bytes == NULL) {
1656            if (error_pos != (size_t)-1) {
1657                char *errmsg = strerror(errno);
1658                PyObject *exc = NULL;
1659                if (errmsg == NULL)
1660                    errmsg = "Py_wchar2char() failed";
1661                raise_encode_exception(&exc,
1662                    "filesystemencoding",
1663                    PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
1664                    error_pos, error_pos+1,
1665                    errmsg);
1666                Py_XDECREF(exc);
1667            }
1668            else
1669                PyErr_NoMemory();
1670            PyMem_Free(wchar);
1671            return NULL;
1672        }
1673        PyMem_Free(wchar);
1674
1675        bytes_obj = PyBytes_FromString(bytes);
1676        PyMem_Free(bytes);
1677        return bytes_obj;
1678    }
1679#endif
1680}
1681
1682PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1683                                    const char *encoding,
1684                                    const char *errors)
1685{
1686    PyObject *v;
1687    char lower[11];  /* Enough for any encoding shortcut */
1688
1689    if (!PyUnicode_Check(unicode)) {
1690        PyErr_BadArgument();
1691        return NULL;
1692    }
1693
1694    if (encoding == NULL)
1695        encoding = PyUnicode_GetDefaultEncoding();
1696
1697    /* Shortcuts for common default encodings */
1698    if (normalize_encoding(encoding, lower, sizeof(lower))) {
1699        if (strcmp(lower, "utf-8") == 0)
1700            return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1701                                        PyUnicode_GET_SIZE(unicode),
1702                                        errors);
1703        else if ((strcmp(lower, "latin-1") == 0) ||
1704                 (strcmp(lower, "iso-8859-1") == 0))
1705            return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1706                                          PyUnicode_GET_SIZE(unicode),
1707                                          errors);
1708#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1709        else if (strcmp(lower, "mbcs") == 0)
1710            return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1711                                        PyUnicode_GET_SIZE(unicode),
1712                                        errors);
1713#endif
1714        else if (strcmp(lower, "ascii") == 0)
1715            return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1716                                         PyUnicode_GET_SIZE(unicode),
1717                                         errors);
1718    }
1719    /* During bootstrap, we may need to find the encodings
1720       package, to load the file system encoding, and require the
1721       file system encoding in order to load the encodings
1722       package.
1723
1724       Break out of this dependency by assuming that the path to
1725       the encodings module is ASCII-only.  XXX could try wcstombs
1726       instead, if the file system encoding is the locale's
1727       encoding. */
1728    if (Py_FileSystemDefaultEncoding &&
1729             strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1730             !PyThreadState_GET()->interp->codecs_initialized)
1731        return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1732                                     PyUnicode_GET_SIZE(unicode),
1733                                     errors);
1734
1735    /* Encode via the codec registry */
1736    v = PyCodec_Encode(unicode, encoding, errors);
1737    if (v == NULL)
1738        return NULL;
1739
1740    /* The normal path */
1741    if (PyBytes_Check(v))
1742        return v;
1743
1744    /* If the codec returns a buffer, raise a warning and convert to bytes */
1745    if (PyByteArray_Check(v)) {
1746        int error;
1747        PyObject *b;
1748
1749        error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
1750            "encoder %s returned bytearray instead of bytes",
1751            encoding);
1752        if (error) {
1753            Py_DECREF(v);
1754            return NULL;
1755        }
1756
1757        b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1758        Py_DECREF(v);
1759        return b;
1760    }
1761
1762    PyErr_Format(PyExc_TypeError,
1763                 "encoder did not return a bytes object (type=%.400s)",
1764                 Py_TYPE(v)->tp_name);
1765    Py_DECREF(v);
1766    return NULL;
1767}
1768
1769PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1770                                     const char *encoding,
1771                                     const char *errors)
1772{
1773    PyObject *v;
1774
1775    if (!PyUnicode_Check(unicode)) {
1776        PyErr_BadArgument();
1777        goto onError;
1778    }
1779
1780    if (encoding == NULL)
1781        encoding = PyUnicode_GetDefaultEncoding();
1782
1783    /* Encode via the codec registry */
1784    v = PyCodec_Encode(unicode, encoding, errors);
1785    if (v == NULL)
1786        goto onError;
1787    if (!PyUnicode_Check(v)) {
1788        PyErr_Format(PyExc_TypeError,
1789                     "encoder did not return an str object (type=%.400s)",
1790                     Py_TYPE(v)->tp_name);
1791        Py_DECREF(v);
1792        goto onError;
1793    }
1794    return v;
1795
1796  onError:
1797    return NULL;
1798}
1799
1800PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1801                                            const char *errors)
1802{
1803    PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1804    if (v)
1805        return v;
1806    if (errors != NULL)
1807        Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
1808    v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1809                             PyUnicode_GET_SIZE(unicode),
1810                             NULL);
1811    if (!v)
1812        return NULL;
1813    ((PyUnicodeObject *)unicode)->defenc = v;
1814    return v;
1815}
1816
1817PyObject*
1818PyUnicode_DecodeFSDefault(const char *s) {
1819    Py_ssize_t size = (Py_ssize_t)strlen(s);
1820    return PyUnicode_DecodeFSDefaultAndSize(s, size);
1821}
1822
1823PyObject*
1824PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1825{
1826#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1827    return PyUnicode_DecodeMBCS(s, size, NULL);
1828#elif defined(__APPLE__)
1829    return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
1830#else
1831    PyInterpreterState *interp = PyThreadState_GET()->interp;
1832    /* Bootstrap check: if the filesystem codec is implemented in Python, we
1833       cannot use it to encode and decode filenames before it is loaded. Load
1834       the Python codec requires to encode at least its own filename. Use the C
1835       version of the locale codec until the codec registry is initialized and
1836       the Python codec is loaded.
1837
1838       Py_FileSystemDefaultEncoding is shared between all interpreters, we
1839       cannot only rely on it: check also interp->fscodec_initialized for
1840       subinterpreters. */
1841    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
1842        return PyUnicode_Decode(s, size,
1843                                Py_FileSystemDefaultEncoding,
1844                                "surrogateescape");
1845    }
1846    else {
1847        /* locale encoding with surrogateescape */
1848        wchar_t *wchar;
1849        PyObject *unicode;
1850        size_t len;
1851
1852        if (s[size] != '\0' || size != strlen(s)) {
1853            PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1854            return NULL;
1855        }
1856
1857        wchar = _Py_char2wchar(s, &len);
1858        if (wchar == NULL)
1859            return PyErr_NoMemory();
1860
1861        unicode = PyUnicode_FromWideChar(wchar, len);
1862        PyMem_Free(wchar);
1863        return unicode;
1864    }
1865#endif
1866}
1867
1868
1869int
1870_PyUnicode_HasNULChars(PyObject* s)
1871{
1872    static PyObject *nul = NULL;
1873
1874    if (nul == NULL)
1875        nul = PyUnicode_FromStringAndSize("\0", 1);
1876    if (nul == NULL)
1877        return -1;
1878    return PyUnicode_Contains(s, nul);
1879}
1880
1881
1882int
1883PyUnicode_FSConverter(PyObject* arg, void* addr)
1884{
1885    PyObject *output = NULL;
1886    Py_ssize_t size;
1887    void *data;
1888    if (arg == NULL) {
1889        Py_DECREF(*(PyObject**)addr);
1890        return 1;
1891    }
1892    if (PyBytes_Check(arg)) {
1893        output = arg;
1894        Py_INCREF(output);
1895    }
1896    else {
1897        arg = PyUnicode_FromObject(arg);
1898        if (!arg)
1899            return 0;
1900        output = PyUnicode_EncodeFSDefault(arg);
1901        Py_DECREF(arg);
1902        if (!output)
1903            return 0;
1904        if (!PyBytes_Check(output)) {
1905            Py_DECREF(output);
1906            PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1907            return 0;
1908        }
1909    }
1910    size = PyBytes_GET_SIZE(output);
1911    data = PyBytes_AS_STRING(output);
1912    if (size != strlen(data)) {
1913        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1914        Py_DECREF(output);
1915        return 0;
1916    }
1917    *(PyObject**)addr = output;
1918    return Py_CLEANUP_SUPPORTED;
1919}
1920
1921
1922int
1923PyUnicode_FSDecoder(PyObject* arg, void* addr)
1924{
1925    PyObject *output = NULL;
1926    Py_ssize_t size;
1927    void *data;
1928    if (arg == NULL) {
1929        Py_DECREF(*(PyObject**)addr);
1930        return 1;
1931    }
1932    if (PyUnicode_Check(arg)) {
1933        output = arg;
1934        Py_INCREF(output);
1935    }
1936    else {
1937        arg = PyBytes_FromObject(arg);
1938        if (!arg)
1939            return 0;
1940        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
1941                                                  PyBytes_GET_SIZE(arg));
1942        Py_DECREF(arg);
1943        if (!output)
1944            return 0;
1945        if (!PyUnicode_Check(output)) {
1946            Py_DECREF(output);
1947            PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
1948            return 0;
1949        }
1950    }
1951    size = PyUnicode_GET_SIZE(output);
1952    data = PyUnicode_AS_UNICODE(output);
1953    if (size != Py_UNICODE_strlen(data)) {
1954        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1955        Py_DECREF(output);
1956        return 0;
1957    }
1958    *(PyObject**)addr = output;
1959    return Py_CLEANUP_SUPPORTED;
1960}
1961
1962
1963char*
1964_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
1965{
1966    PyObject *bytes;
1967    if (!PyUnicode_Check(unicode)) {
1968        PyErr_BadArgument();
1969        return NULL;
1970    }
1971    bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1972    if (bytes == NULL)
1973        return NULL;
1974    if (psize != NULL)
1975        *psize = PyBytes_GET_SIZE(bytes);
1976    return PyBytes_AS_STRING(bytes);
1977}
1978
1979char*
1980_PyUnicode_AsString(PyObject *unicode)
1981{
1982    return _PyUnicode_AsStringAndSize(unicode, NULL);
1983}
1984
1985Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1986{
1987    if (!PyUnicode_Check(unicode)) {
1988        PyErr_BadArgument();
1989        goto onError;
1990    }
1991    return PyUnicode_AS_UNICODE(unicode);
1992
1993  onError:
1994    return NULL;
1995}
1996
1997Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1998{
1999    if (!PyUnicode_Check(unicode)) {
2000        PyErr_BadArgument();
2001        goto onError;
2002    }
2003    return PyUnicode_GET_SIZE(unicode);
2004
2005  onError:
2006    return -1;
2007}
2008
/* Return the name of the default encoding, which is always "utf-8". */
const char *PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
2013
2014/* create or adjust a UnicodeDecodeError */
2015static void
2016make_decode_exception(PyObject **exceptionObject,
2017                      const char *encoding,
2018                      const char *input, Py_ssize_t length,
2019                      Py_ssize_t startpos, Py_ssize_t endpos,
2020                      const char *reason)
2021{
2022    if (*exceptionObject == NULL) {
2023        *exceptionObject = PyUnicodeDecodeError_Create(
2024            encoding, input, length, startpos, endpos, reason);
2025    }
2026    else {
2027        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2028            goto onError;
2029        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2030            goto onError;
2031        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2032            goto onError;
2033    }
2034    return;
2035
2036onError:
2037    Py_DECREF(*exceptionObject);
2038    *exceptionObject = NULL;
2039}
2040
2041/* error handling callback helper:
2042   build arguments, call the callback and check the arguments,
2043   if no exception occurred, copy the replacement to the output
2044   and adjust various state variables.
2045   return 0 on success, -1 on error
2046*/
2047
2048static
2049int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
2050                                     const char *encoding, const char *reason,
2051                                     const char **input, const char **inend, Py_ssize_t *startinpos,
2052                                     Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2053                                     PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
2054{
2055    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
2056
2057    PyObject *restuple = NULL;
2058    PyObject *repunicode = NULL;
2059    Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
2060    Py_ssize_t insize;
2061    Py_ssize_t requiredsize;
2062    Py_ssize_t newpos;
2063    Py_UNICODE *repptr;
2064    PyObject *inputobj = NULL;
2065    Py_ssize_t repsize;
2066    int res = -1;
2067
2068    if (*errorHandler == NULL) {
2069        *errorHandler = PyCodec_LookupError(errors);
2070        if (*errorHandler == NULL)
2071            goto onError;
2072    }
2073
2074    make_decode_exception(exceptionObject,
2075        encoding,
2076        *input, *inend - *input,
2077        *startinpos, *endinpos,
2078        reason);
2079    if (*exceptionObject == NULL)
2080        goto onError;
2081
2082    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2083    if (restuple == NULL)
2084        goto onError;
2085    if (!PyTuple_Check(restuple)) {
2086        PyErr_SetString(PyExc_TypeError, &argparse[4]);
2087        goto onError;
2088    }
2089    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
2090        goto onError;
2091
2092    /* Copy back the bytes variables, which might have been modified by the
2093       callback */
2094    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2095    if (!inputobj)
2096        goto onError;
2097    if (!PyBytes_Check(inputobj)) {
2098        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
2099    }
2100    *input = PyBytes_AS_STRING(inputobj);
2101    insize = PyBytes_GET_SIZE(inputobj);
2102    *inend = *input + insize;
2103    /* we can DECREF safely, as the exception has another reference,
2104       so the object won't go away. */
2105    Py_DECREF(inputobj);
2106
2107    if (newpos<0)
2108        newpos = insize+newpos;
2109    if (newpos<0 || newpos>insize) {
2110        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2111        goto onError;
2112    }
2113
2114    /* need more space? (at least enough for what we
2115       have+the replacement+the rest of the string (starting
2116       at the new input position), so we won't have to check space
2117       when there are no errors in the rest of the string) */
2118    repptr = PyUnicode_AS_UNICODE(repunicode);
2119    repsize = PyUnicode_GET_SIZE(repunicode);
2120    requiredsize = *outpos + repsize + insize-newpos;
2121    if (requiredsize > outsize) {
2122        if (requiredsize<2*outsize)
2123            requiredsize = 2*outsize;
2124        if (_PyUnicode_Resize(output, requiredsize) < 0)
2125            goto onError;
2126        *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
2127    }
2128    *endinpos = newpos;
2129    *inptr = *input + newpos;
2130    Py_UNICODE_COPY(*outptr, repptr, repsize);
2131    *outptr += repsize;
2132    *outpos += repsize;
2133
2134    /* we made it! */
2135    res = 0;
2136
2137  onError:
2138    Py_XDECREF(restuple);
2139    return res;
2140}
2141
2142/* --- UTF-7 Codec -------------------------------------------------------- */
2143
2144/* See RFC2152 for details.  We encode conservatively and decode liberally. */
2145
2146/* Three simple macros defining base-64. */
2147
/* Is c one of the 64 base-64 characters (letters, digits, '+' and '/')?
   The argument is evaluated more than once. */

#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' ||       \
     ((c) >= '0' && (c) <= '9') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= 'A' && (c) <= 'Z'))
2155
/* Given that c is a base-64 character, what is its base-64 value?
   'A'-'Z' map to 0-25, 'a'-'z' to 26-51, '0'-'9' to 52-61 and '+' to 62;
   anything else (i.e. '/') yields 63.  The argument is evaluated more
   than once. */

#define FROM_BASE64(c)                                                  \
    ((c) == '+' ? 62 :                                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     63)
2163
/* What is the base-64 character of the bottom 6 bits of n?
   Higher bits of n are masked off before the alphabet lookup. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"      \
     "abcdefghijklmnopqrstuvwxyz"      \
     "0123456789+/"[(n) & 63])
2168
/* DECODE_DIRECT: true when a byte met in a UTF-7 string outside a shift
 * sequence stands for itself.  Decoding is permissive: every value up to
 * 127 qualifies, except '+', which opens a base64 section.  The argument
 * is evaluated more than once. */

#define DECODE_DIRECT(c)                                \
    (!((c) > 127 || (c) == '+'))
2176
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by the ASCII code,
 * identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must lie strictly between 0 and 128 to qualify; the range test comes
 * first so the table is never indexed out of bounds.  Set D characters
 * always qualify, whitespace only if directWS is true and Set O only if
 * directO is true.  Every parameter is parenthesized in the expansion so
 * that compound expressions can be passed safely; c is evaluated more
 * than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
2222
2223PyObject *PyUnicode_DecodeUTF7(const char *s,
2224                               Py_ssize_t size,
2225                               const char *errors)
2226{
2227    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
2228}
2229
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * s, size:  the UTF-7 input bytes and their count.
 * errors:   error handler name, passed to
 *           unicode_decode_call_errorhandler() when bad input is met.
 * consumed: if NULL, input ending inside an inconsistent shift sequence
 *           is reported as "unterminated shift sequence".  If not NULL,
 *           no such error is raised; *consumed receives the number of
 *           input bytes decoded, which is the position of the opening
 *           '+' when the input ends inside a shift sequence.
 *
 * Returns the decoded unicode object, or NULL on error. */

PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
                                       Py_ssize_t size,
                                       const char *errors,
                                       Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const char *errmsg = "";
    int inShift = 0;                    /* inside a '+...' base-64 section? */
    Py_UNICODE *shiftOutStart;          /* output position when the section began */
    unsigned int base64bits = 0;        /* number of pending bits in base64buffer */
    unsigned long base64buffer = 0;     /* bits collected but not yet output */
    Py_UNICODE surrogate = 0;           /* pending high surrogate, 0 if none */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* The output is allocated with one slot per input byte. */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    p = unicode->str;
    shiftOutStart = p;
    e = s + size;

    while (s < e) {
        Py_UNICODE ch;
        /* Re-entry point used after the end-of-input error handler moved s
           back inside the input. */
      restart:
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UNICODE outCh = (Py_UNICODE)
                                       (base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
#ifdef Py_UNICODE_WIDE
                            /* combine the pair into one code point */
                            *p++ = (((surrogate & 0x3FF)<<10)
                                    | (outCh & 0x3FF)) + 0x10000;
#else
                            *p++ = surrogate;
                            *p++ = outCh;
#endif
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* unpaired high surrogate: output it as is */
                            *p++ = surrogate;
                            surrogate = 0;
                        }
                    }
                    if (outCh >= 0xD800 && outCh <= 0xDBFF) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        *p++ = outCh;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                s++;
                if (surrogate) {
                    *p++ = surrogate;
                    surrogate = 0;
                }
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                if (ch != '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    *p++ = ch;
                }
            }
        }
        else if ( ch == '+' ) {
            /* Remember where the sequence starts: used for error reporting
               and for backing off in stateful mode. */
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                *p++ = '+';
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                shiftOutStart = p;
                base64bits = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            *p++ = ch;
            s++;
        }
        else {
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
        /* Common error exit of the loop body: the handler may replace the
           input buffer (starts, e), reposition s and resize the output
           (unicode, p). */
utf7Error:
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &unicode, &outpos, &p))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            outpos = p-PyUnicode_AS_UNICODE(unicode);
            /* NOTE(review): size is the length of the original input; if an
               earlier error handler call replaced the input buffer this may
               not match e - starts -- confirm. */
            endinpos = size;
            if (unicode_decode_call_errorhandler(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &unicode, &outpos, &p))
                goto onError;
            /* The handler may ask to resume before the end of the input. */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            p = shiftOutStart; /* back off output */
            *consumed = startinpos;
        }
        else {
            *consumed = s-starts;
        }
    }

    /* Shrink the result to the number of characters actually written. */
    if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
2421
2422
2423PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
2424                               Py_ssize_t size,
2425                               int base64SetO,
2426                               int base64WhiteSpace,
2427                               const char *errors)
2428{
2429    PyObject *v;
2430    /* It might be possible to tighten this worst case */
2431    Py_ssize_t allocated = 8 * size;
2432    int inShift = 0;
2433    Py_ssize_t i = 0;
2434    unsigned int base64bits = 0;
2435    unsigned long base64buffer = 0;
2436    char * out;
2437    char * start;
2438
2439    if (size == 0)
2440        return PyBytes_FromStringAndSize(NULL, 0);
2441
2442    if (allocated / 8 != size)
2443        return PyErr_NoMemory();
2444
2445    v = PyBytes_FromStringAndSize(NULL, allocated);
2446    if (v == NULL)
2447        return NULL;
2448
2449    start = out = PyBytes_AS_STRING(v);
2450    for (;i < size; ++i) {
2451        Py_UNICODE ch = s[i];
2452
2453        if (inShift) {
2454            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2455                /* shifting out */
2456                if (base64bits) { /* output remaining bits */
2457                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
2458                    base64buffer = 0;
2459                    base64bits = 0;
2460                }
2461                inShift = 0;
2462                /* Characters not in the BASE64 set implicitly unshift the sequence
2463                   so no '-' is required, except if the character is itself a '-' */
2464                if (IS_BASE64(ch) || ch == '-') {
2465                    *out++ = '-';
2466                }
2467                *out++ = (char) ch;
2468            }
2469            else {
2470                goto encode_char;
2471            }
2472        }
2473        else { /* not in a shift sequence */
2474            if (ch == '+') {
2475                *out++ = '+';
2476                        *out++ = '-';
2477            }
2478            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2479                *out++ = (char) ch;
2480            }
2481            else {
2482                *out++ = '+';
2483                inShift = 1;
2484                goto encode_char;
2485            }
2486        }
2487        continue;
2488encode_char:
2489#ifdef Py_UNICODE_WIDE
2490        if (ch >= 0x10000) {
2491            /* code first surrogate */
2492            base64bits += 16;
2493            base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2494            while (base64bits >= 6) {
2495                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2496                base64bits -= 6;
2497            }
2498            /* prepare second surrogate */
2499            ch =  0xDC00 | ((ch-0x10000) & 0x3FF);
2500        }
2501#endif
2502        base64bits += 16;
2503        base64buffer = (base64buffer << 16) | ch;
2504        while (base64bits >= 6) {
2505            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2506            base64bits -= 6;
2507        }
2508    }
2509    if (base64bits)
2510        *out++= TO_BASE64(base64buffer << (6-base64bits) );
2511    if (inShift)
2512        *out++ = '-';
2513    if (_PyBytes_Resize(&v, out - start) < 0)
2514        return NULL;
2515    return v;
2516}
2517
/* The UTF-7 helper macros are not needed past this point: undefine them
   so the names stay local to the UTF-7 codec. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
2523
2524/* --- UTF-8 Codec -------------------------------------------------------- */
2525
/* Lookup table indexed by the first byte of a UTF-8 sequence.
   Bytes 00-7F map to 1 (ASCII).  Bytes 80-BF map to 0, as do C0-C1 and
   F5-FF.  C2-DF map to 2, E0-EF to 3 and F0-F4 to 4. */
static
char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
2547
2548PyObject *PyUnicode_DecodeUTF8(const char *s,
2549                               Py_ssize_t size,
2550                               const char *errors)
2551{
2552    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2553}
2554
/* Mask to check or force alignment of a pointer to C 'long' boundaries:
   AND-ing an address with the mask gives 0 when the address is aligned,
   and AND-ing with the mask's complement rounds the address down to a
   'long' boundary.  NOTE: the expansion is not wrapped in parentheses. */
#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)

/* Mask to quickly check whether a C 'long' contains a
   non-ASCII, UTF8-encoded char: the top bit (0x80) of every byte is set,
   so a non-zero AND with a 'long' read from the input means that at least
   one of its bytes is >= 0x80. */
#if (SIZEOF_LONG == 8)
# define ASCII_CHAR_MASK 0x8080808080808080L
#elif (SIZEOF_LONG == 4)
# define ASCII_CHAR_MASK 0x80808080L
#else
# error C 'long' size should be either 4 or 8!
#endif
2567
/* Decode size bytes of UTF-8 data starting at s.

   errors:   error handler name, passed to
             unicode_decode_call_errorhandler() when invalid input is met.
   consumed: if NULL, a sequence truncated by the end of the input is
             reported as "unexpected end of data".  If not NULL, decoding
             stops silently in front of such a sequence and *consumed
             receives the number of input bytes decoded.

   Returns the decoded unicode object, or NULL on error. */
PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
                                       Py_ssize_t size,
                                       const char *errors,
                                       Py_ssize_t *consumed)
{
    const char *starts = s;
    int n;
    int k;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e, *aligned_end;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    /* Unpack UTF-8 encoded data */
    p = unicode->str;
    e = s + size;
    /* End of input rounded down to a 'long' boundary: whole-long reads in
       the fast path below must stop before this address. */
    aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);

    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        if (ch < 0x80) {
            /* Fast path for runs of ASCII characters. Given that common UTF-8
               input will consist of an overwhelming majority of ASCII
               characters, we try to optimize for this case by checking
               as many characters as a C 'long' can contain.
               First, check if we can do an aligned read, as most CPUs have
               a penalty for unaligned reads.
            */
            if (!((size_t) s & LONG_PTR_MASK)) {
                /* Help register allocation */
                register const char *_s = s;
                register Py_UNICODE *_p = p;
                while (_s < aligned_end) {
                    /* Read a whole long at a time (either 4 or 8 bytes),
                       and do a fast unrolled copy if it only contains ASCII
                       characters. */
                    unsigned long data = *(unsigned long *) _s;
                    if (data & ASCII_CHAR_MASK)
                        break;
                    _p[0] = (unsigned char) _s[0];
                    _p[1] = (unsigned char) _s[1];
                    _p[2] = (unsigned char) _s[2];
                    _p[3] = (unsigned char) _s[3];
#if (SIZEOF_LONG == 8)
                    _p[4] = (unsigned char) _s[4];
                    _p[5] = (unsigned char) _s[5];
                    _p[6] = (unsigned char) _s[6];
                    _p[7] = (unsigned char) _s[7];
#endif
                    _s += SIZEOF_LONG;
                    _p += SIZEOF_LONG;
                }
                s = _s;
                p = _p;
                if (s == e)
                    break;
                /* Reload: the fast path may have advanced s. */
                ch = (unsigned char)*s;
            }
        }

        /* Single ASCII byte (slow path). */
        if (ch < 0x80) {
            *p++ = (Py_UNICODE)ch;
            s++;
            continue;
        }

        /* Sequence length announced by the lead byte (0 if invalid). */
        n = utf8_code_length[ch];

        if (s + n > e) {
            /* The sequence is cut off by the end of the input. */
            if (consumed)
                break;
            else {
                errmsg = "unexpected end of data";
                startinpos = s-starts;
                endinpos = startinpos+1;
                /* Extend the error range over the continuation bytes that
                   are present.
                   NOTE(review): size is the length of the original input;
                   if an earlier error handler call replaced the input
                   buffer this bound may not match e - starts -- confirm. */
                for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
                    endinpos++;
                goto utf8Error;
            }
        }

        switch (n) {

        case 0:
            errmsg = "invalid start byte";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;

        case 1:
            /* Not expected: bytes >= 0x80 never map to length 1 in
               utf8_code_length, and smaller bytes were handled above. */
            errmsg = "internal error";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;

        case 2:
            if ((s[1] & 0xc0) != 0x80) {
                errmsg = "invalid continuation byte";
                startinpos = s-starts;
                endinpos = startinpos + 1;
                goto utf8Error;
            }
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *p++ = (Py_UNICODE)ch;
            break;

        case 3:
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            /* The E0 test additionally rejects second bytes below A0. */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0) ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)) {
                errmsg = "invalid continuation byte";
                startinpos = s-starts;
                endinpos = startinpos + 1;

                /* if s[1] first two bits are 1 and 0, then the invalid
                   continuation byte is s[2], so increment endinpos by 1,
                   if not, s[1] is invalid and endinpos doesn't need to
                   be incremented. */
                if ((s[1] & 0xC0) == 0x80)
                    endinpos++;
                goto utf8Error;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            *p++ = (Py_UNICODE)ch;
            break;

        case 4:
            /* Besides malformed continuation bytes, reject F0 followed by
               a byte below 90 and F4 followed by a byte above 8F. */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                errmsg = "invalid continuation byte";
                startinpos = s-starts;
                endinpos = startinpos + 1;
                /* Extend the error range over the leading continuation
                   bytes that look well-formed. */
                if ((s[1] & 0xC0) == 0x80) {
                    endinpos++;
                    if ((s[2] & 0xC0) == 0x80)
                        endinpos++;
                }
                goto utf8Error;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#ifdef Py_UNICODE_WIDE
            *p++ = (Py_UNICODE)ch;
#else
            /*  compute and append the two surrogates: */

            /*  translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /*  high surrogate = top 10 bits added to D800 */
            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));

            /*  low surrogate = bottom 10 bits added to DC00 */
            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
#endif
            break;
        }
        s += n;
        continue;

        /* Common error exit: the handler may replace the input buffer
           (starts, e), reposition s and resize the output (unicode, p). */
      utf8Error:
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf-8", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &unicode, &outpos, &p))
            goto onError;
        /* e may have changed: recompute the aligned end for the fast path. */
        aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
    }
    if (consumed)
        *consumed = s-starts;

    /* Adjust length */
    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
2789
2790#undef ASCII_CHAR_MASK
2791
2792#ifdef __APPLE__
2793
2794/* Simplified UTF-8 decoder using surrogateescape error handler,
2795   used to decode the command line arguments on Mac OS X. */
2796
2797wchar_t*
2798_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
2799{
2800    int n;
2801    const char *e;
2802    wchar_t *unicode, *p;
2803
2804    /* Note: size will always be longer than the resulting Unicode
2805       character count */
2806    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
2807        PyErr_NoMemory();
2808        return NULL;
2809    }
2810    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
2811    if (!unicode)
2812        return NULL;
2813
2814    /* Unpack UTF-8 encoded data */
2815    p = unicode;
2816    e = s + size;
2817    while (s < e) {
2818        Py_UCS4 ch = (unsigned char)*s;
2819
2820        if (ch < 0x80) {
2821            *p++ = (wchar_t)ch;
2822            s++;
2823            continue;
2824        }
2825
2826        n = utf8_code_length[ch];
2827        if (s + n > e) {
2828            goto surrogateescape;
2829        }
2830
2831        switch (n) {
2832        case 0:
2833        case 1:
2834            goto surrogateescape;
2835
2836        case 2:
2837            if ((s[1] & 0xc0) != 0x80)
2838                goto surrogateescape;
2839            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
2840            assert ((ch > 0x007F) && (ch <= 0x07FF));
2841            *p++ = (wchar_t)ch;
2842            break;
2843
2844        case 3:
2845            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2846               will result in surrogates in range d800-dfff. Surrogates are
2847               not valid UTF-8 so they are rejected.
2848               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2849               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
2850            if ((s[1] & 0xc0) != 0x80 ||
2851                (s[2] & 0xc0) != 0x80 ||
2852                ((unsigned char)s[0] == 0xE0 &&
2853                 (unsigned char)s[1] < 0xA0) ||
2854                ((unsigned char)s[0] == 0xED &&
2855                 (unsigned char)s[1] > 0x9F)) {
2856
2857                goto surrogateescape;
2858            }
2859            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
2860            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2861            *p++ = (Py_UNICODE)ch;
2862            break;
2863
2864        case 4:
2865            if ((s[1] & 0xc0) != 0x80 ||
2866                (s[2] & 0xc0) != 0x80 ||
2867                (s[3] & 0xc0) != 0x80 ||
2868                ((unsigned char)s[0] == 0xF0 &&
2869                 (unsigned char)s[1] < 0x90) ||
2870                ((unsigned char)s[0] == 0xF4 &&
2871                 (unsigned char)s[1] > 0x8F)) {
2872                goto surrogateescape;
2873            }
2874            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2875                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2876            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2877
2878#if SIZEOF_WCHAR_T == 4
2879            *p++ = (wchar_t)ch;
2880#else
2881            /*  compute and append the two surrogates: */
2882
2883            /*  translate from 10000..10FFFF to 0..FFFF */
2884            ch -= 0x10000;
2885
2886            /*  high surrogate = top 10 bits added to D800 */
2887            *p++ = (wchar_t)(0xD800 + (ch >> 10));
2888
2889            /*  low surrogate = bottom 10 bits added to DC00 */
2890            *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
2891#endif
2892            break;
2893        }
2894        s += n;
2895        continue;
2896
2897      surrogateescape:
2898        *p++ = 0xDC00 + ch;
2899        s++;
2900    }
2901    *p = L'\0';
2902    return unicode;
2903}
2904
2905#endif /* __APPLE__ */
2906
2907/* Allocation strategy:  if the string is short, convert into a stack buffer
2908   and allocate exactly as much space needed at the end.  Else allocate the
2909   maximum possible needed (4 result bytes per Unicode character), and return
2910   the excess memory at the end.
2911*/
2912PyObject *
2913PyUnicode_EncodeUTF8(const Py_UNICODE *s,
2914                     Py_ssize_t size,
2915                     const char *errors)
2916{
2917#define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
2918
2919    Py_ssize_t i;                /* index into s of next input byte */
2920    PyObject *result;            /* result string object */
2921    char *p;                     /* next free byte in output buffer */
2922    Py_ssize_t nallocated;      /* number of result bytes allocated */
2923    Py_ssize_t nneeded;            /* number of result bytes needed */
2924    char stackbuf[MAX_SHORT_UNICHARS * 4];
2925    PyObject *errorHandler = NULL;
2926    PyObject *exc = NULL;
2927
2928    assert(s != NULL);
2929    assert(size >= 0);
2930
2931    if (size <= MAX_SHORT_UNICHARS) {
2932        /* Write into the stack buffer; nallocated can't overflow.
2933         * At the end, we'll allocate exactly as much heap space as it
2934         * turns out we need.
2935         */
2936        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2937        result = NULL;   /* will allocate after we're done */
2938        p = stackbuf;
2939    }
2940    else {
2941        /* Overallocate on the heap, and give the excess back at the end. */
2942        nallocated = size * 4;
2943        if (nallocated / 4 != size)  /* overflow! */
2944            return PyErr_NoMemory();
2945        result = PyBytes_FromStringAndSize(NULL, nallocated);
2946        if (result == NULL)
2947            return NULL;
2948        p = PyBytes_AS_STRING(result);
2949    }
2950
2951    for (i = 0; i < size;) {
2952        Py_UCS4 ch = s[i++];
2953
2954        if (ch < 0x80)
2955            /* Encode ASCII */
2956            *p++ = (char) ch;
2957
2958        else if (ch < 0x0800) {
2959            /* Encode Latin-1 */
2960            *p++ = (char)(0xc0 | (ch >> 6));
2961            *p++ = (char)(0x80 | (ch & 0x3f));
2962        } else if (0xD800 <= ch && ch <= 0xDFFF) {
2963#ifndef Py_UNICODE_WIDE
2964            /* Special case: check for high and low surrogate */
2965            if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2966                Py_UCS4 ch2 = s[i];
2967                /* Combine the two surrogates to form a UCS4 value */
2968                ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2969                i++;
2970
2971                /* Encode UCS4 Unicode ordinals */
2972                *p++ = (char)(0xf0 | (ch >> 18));
2973                *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2974                *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2975                *p++ = (char)(0x80 | (ch & 0x3f));
2976            } else {
2977#endif
2978                Py_ssize_t newpos;
2979                PyObject *rep;
2980                Py_ssize_t repsize, k;
2981                rep = unicode_encode_call_errorhandler
2982                    (errors, &errorHandler, "utf-8", "surrogates not allowed",
2983                     s, size, &exc, i-1, i, &newpos);
2984                if (!rep)
2985                    goto error;
2986
2987                if (PyBytes_Check(rep))
2988                    repsize = PyBytes_GET_SIZE(rep);
2989                else
2990                    repsize = PyUnicode_GET_SIZE(rep);
2991
2992                if (repsize > 4) {
2993                    Py_ssize_t offset;
2994
2995                    if (result == NULL)
2996                        offset = p - stackbuf;
2997                    else
2998                        offset = p - PyBytes_AS_STRING(result);
2999
3000                    if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
3001                        /* integer overflow */
3002                        PyErr_NoMemory();
3003                        goto error;
3004                    }
3005                    nallocated += repsize - 4;
3006                    if (result != NULL) {
3007                        if (_PyBytes_Resize(&result, nallocated) < 0)
3008                            goto error;
3009                    } else {
3010                        result = PyBytes_FromStringAndSize(NULL, nallocated);
3011                        if (result == NULL)
3012                            goto error;
3013                        Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
3014                    }
3015                    p = PyBytes_AS_STRING(result) + offset;
3016                }
3017
3018                if (PyBytes_Check(rep)) {
3019                    char *prep = PyBytes_AS_STRING(rep);
3020                    for(k = repsize; k > 0; k--)
3021                        *p++ = *prep++;
3022                } else /* rep is unicode */ {
3023                    Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
3024                    Py_UNICODE c;
3025
3026                    for(k=0; k<repsize; k++) {
3027                        c = prep[k];
3028                        if (0x80 <= c) {
3029                            raise_encode_exception(&exc, "utf-8", s, size,
3030                                                   i-1, i, "surrogates not allowed");
3031                            goto error;
3032                        }
3033                        *p++ = (char)prep[k];
3034                    }
3035                }
3036                Py_DECREF(rep);
3037#ifndef Py_UNICODE_WIDE
3038            }
3039#endif
3040        } else if (ch < 0x10000) {
3041            *p++ = (char)(0xe0 | (ch >> 12));
3042            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3043            *p++ = (char)(0x80 | (ch & 0x3f));
3044        } else /* ch >= 0x10000 */ {
3045            /* Encode UCS4 Unicode ordinals */
3046            *p++ = (char)(0xf0 | (ch >> 18));
3047            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
3048            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3049            *p++ = (char)(0x80 | (ch & 0x3f));
3050        }
3051    }
3052
3053    if (result == NULL) {
3054        /* This was stack allocated. */
3055        nneeded = p - stackbuf;
3056        assert(nneeded <= nallocated);
3057        result = PyBytes_FromStringAndSize(stackbuf, nneeded);
3058    }
3059    else {
3060        /* Cut back to size actually needed. */
3061        nneeded = p - PyBytes_AS_STRING(result);
3062        assert(nneeded <= nallocated);
3063        _PyBytes_Resize(&result, nneeded);
3064    }
3065    Py_XDECREF(errorHandler);
3066    Py_XDECREF(exc);
3067    return result;
3068 error:
3069    Py_XDECREF(errorHandler);
3070    Py_XDECREF(exc);
3071    Py_XDECREF(result);
3072    return NULL;
3073
3074#undef MAX_SHORT_UNICHARS
3075}
3076
3077PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
3078{
3079    if (!PyUnicode_Check(unicode)) {
3080        PyErr_BadArgument();
3081        return NULL;
3082    }
3083    return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
3084                                PyUnicode_GET_SIZE(unicode),
3085                                NULL);
3086}
3087
3088/* --- UTF-32 Codec ------------------------------------------------------- */
3089
3090PyObject *
3091PyUnicode_DecodeUTF32(const char *s,
3092                      Py_ssize_t size,
3093                      const char *errors,
3094                      int *byteorder)
3095{
3096    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
3097}
3098
3099PyObject *
3100PyUnicode_DecodeUTF32Stateful(const char *s,
3101                              Py_ssize_t size,
3102                              const char *errors,
3103                              int *byteorder,
3104                              Py_ssize_t *consumed)
3105{
3106    const char *starts = s;
3107    Py_ssize_t startinpos;
3108    Py_ssize_t endinpos;
3109    Py_ssize_t outpos;
3110    PyUnicodeObject *unicode;
3111    Py_UNICODE *p;
3112#ifndef Py_UNICODE_WIDE
3113    int pairs = 0;
3114    const unsigned char *qq;
3115#else
3116    const int pairs = 0;
3117#endif
3118    const unsigned char *q, *e;
3119    int bo = 0;       /* assume native ordering by default */
3120    const char *errmsg = "";
3121    /* Offsets from q for retrieving bytes in the right order. */
3122#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3123    int iorder[] = {0, 1, 2, 3};
3124#else
3125    int iorder[] = {3, 2, 1, 0};
3126#endif
3127    PyObject *errorHandler = NULL;
3128    PyObject *exc = NULL;
3129
3130    q = (unsigned char *)s;
3131    e = q + size;
3132
3133    if (byteorder)
3134        bo = *byteorder;
3135
3136    /* Check for BOM marks (U+FEFF) in the input and adjust current
3137       byte order setting accordingly. In native mode, the leading BOM
3138       mark is skipped, in all other modes, it is copied to the output
3139       stream as-is (giving a ZWNBSP character). */
3140    if (bo == 0) {
3141        if (size >= 4) {
3142            const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
3143                (q[iorder[1]] << 8) | q[iorder[0]];
3144#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3145            if (bom == 0x0000FEFF) {
3146                q += 4;
3147                bo = -1;
3148            }
3149            else if (bom == 0xFFFE0000) {
3150                q += 4;
3151                bo = 1;
3152            }
3153#else
3154            if (bom == 0x0000FEFF) {
3155                q += 4;
3156                bo = 1;
3157            }
3158            else if (bom == 0xFFFE0000) {
3159                q += 4;
3160                bo = -1;
3161            }
3162#endif
3163        }
3164    }
3165
3166    if (bo == -1) {
3167        /* force LE */
3168        iorder[0] = 0;
3169        iorder[1] = 1;
3170        iorder[2] = 2;
3171        iorder[3] = 3;
3172    }
3173    else if (bo == 1) {
3174        /* force BE */
3175        iorder[0] = 3;
3176        iorder[1] = 2;
3177        iorder[2] = 1;
3178        iorder[3] = 0;
3179    }
3180
3181    /* On narrow builds we split characters outside the BMP into two
3182       codepoints => count how much extra space we need. */
3183#ifndef Py_UNICODE_WIDE
3184    for (qq = q; qq < e; qq += 4)
3185        if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
3186            pairs++;
3187#endif
3188
3189    /* This might be one to much, because of a BOM */
3190    unicode = _PyUnicode_New((size+3)/4+pairs);
3191    if (!unicode)
3192        return NULL;
3193    if (size == 0)
3194        return (PyObject *)unicode;
3195
3196    /* Unpack UTF-32 encoded data */
3197    p = unicode->str;
3198
3199    while (q < e) {
3200        Py_UCS4 ch;
3201        /* remaining bytes at the end? (size should be divisible by 4) */
3202        if (e-q<4) {
3203            if (consumed)
3204                break;
3205            errmsg = "truncated data";
3206            startinpos = ((const char *)q)-starts;
3207            endinpos = ((const char *)e)-starts;
3208            goto utf32Error;
3209            /* The remaining input chars are ignored if the callback
3210               chooses to skip the input */
3211        }
3212        ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
3213            (q[iorder[1]] << 8) | q[iorder[0]];
3214
3215        if (ch >= 0x110000)
3216        {
3217            errmsg = "codepoint not in range(0x110000)";
3218            startinpos = ((const char *)q)-starts;
3219            endinpos = startinpos+4;
3220            goto utf32Error;
3221        }
3222#ifndef Py_UNICODE_WIDE
3223        if (ch >= 0x10000)
3224        {
3225            *p++ = 0xD800 | ((ch-0x10000) >> 10);
3226            *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
3227        }
3228        else
3229#endif
3230            *p++ = ch;
3231        q += 4;
3232        continue;
3233      utf32Error:
3234        outpos = p-PyUnicode_AS_UNICODE(unicode);
3235        if (unicode_decode_call_errorhandler(
3236                errors, &errorHandler,
3237                "utf32", errmsg,
3238                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
3239                &unicode, &outpos, &p))
3240            goto onError;
3241    }
3242
3243    if (byteorder)
3244        *byteorder = bo;
3245
3246    if (consumed)
3247        *consumed = (const char *)q-starts;
3248
3249    /* Adjust length */
3250    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
3251        goto onError;
3252
3253    Py_XDECREF(errorHandler);
3254    Py_XDECREF(exc);
3255    return (PyObject *)unicode;
3256
3257  onError:
3258    Py_DECREF(unicode);
3259    Py_XDECREF(errorHandler);
3260    Py_XDECREF(exc);
3261    return NULL;
3262}
3263
3264PyObject *
3265PyUnicode_EncodeUTF32(const Py_UNICODE *s,
3266                      Py_ssize_t size,
3267                      const char *errors,
3268                      int byteorder)
3269{
3270    PyObject *v;
3271    unsigned char *p;
3272    Py_ssize_t nsize, bytesize;
3273#ifndef Py_UNICODE_WIDE
3274    Py_ssize_t i, pairs;
3275#else
3276    const int pairs = 0;
3277#endif
3278    /* Offsets from p for storing byte pairs in the right order. */
3279#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3280    int iorder[] = {0, 1, 2, 3};
3281#else
3282    int iorder[] = {3, 2, 1, 0};
3283#endif
3284
3285#define STORECHAR(CH)                           \
3286    do {                                        \
3287        p[iorder[3]] = ((CH) >> 24) & 0xff;     \
3288        p[iorder[2]] = ((CH) >> 16) & 0xff;     \
3289        p[iorder[1]] = ((CH) >> 8) & 0xff;      \
3290        p[iorder[0]] = (CH) & 0xff;             \
3291        p += 4;                                 \
3292    } while(0)
3293
3294    /* In narrow builds we can output surrogate pairs as one codepoint,
3295       so we need less space. */
3296#ifndef Py_UNICODE_WIDE
3297    for (i = pairs = 0; i < size-1; i++)
3298        if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
3299            0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
3300            pairs++;
3301#endif
3302    nsize = (size - pairs + (byteorder == 0));
3303    bytesize = nsize * 4;
3304    if (bytesize / 4 != nsize)
3305        return PyErr_NoMemory();
3306    v = PyBytes_FromStringAndSize(NULL, bytesize);
3307    if (v == NULL)
3308        return NULL;
3309
3310    p = (unsigned char *)PyBytes_AS_STRING(v);
3311    if (byteorder == 0)
3312        STORECHAR(0xFEFF);
3313    if (size == 0)
3314        goto done;
3315
3316    if (byteorder == -1) {
3317        /* force LE */
3318        iorder[0] = 0;
3319        iorder[1] = 1;
3320        iorder[2] = 2;
3321        iorder[3] = 3;
3322    }
3323    else if (byteorder == 1) {
3324        /* force BE */
3325        iorder[0] = 3;
3326        iorder[1] = 2;
3327        iorder[2] = 1;
3328        iorder[3] = 0;
3329    }
3330
3331    while (size-- > 0) {
3332        Py_UCS4 ch = *s++;
3333#ifndef Py_UNICODE_WIDE
3334        if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
3335            Py_UCS4 ch2 = *s;
3336            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3337                ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3338                s++;
3339                size--;
3340            }
3341        }
3342#endif
3343        STORECHAR(ch);
3344    }
3345
3346  done:
3347    return v;
3348#undef STORECHAR
3349}
3350
3351PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
3352{
3353    if (!PyUnicode_Check(unicode)) {
3354        PyErr_BadArgument();
3355        return NULL;
3356    }
3357    return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
3358                                 PyUnicode_GET_SIZE(unicode),
3359                                 NULL,
3360                                 0);
3361}
3362
3363/* --- UTF-16 Codec ------------------------------------------------------- */
3364
3365PyObject *
3366PyUnicode_DecodeUTF16(const char *s,
3367                      Py_ssize_t size,
3368                      const char *errors,
3369                      int *byteorder)
3370{
3371    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3372}
3373
3374/* Two masks for fast checking of whether a C 'long' may contain
3375   UTF16-encoded surrogate characters. This is an efficient heuristic,
3376   assuming that non-surrogate characters with a code point >= 0x8000 are
3377   rare in most input.
3378   FAST_CHAR_MASK is used when the input is in native byte ordering,
3379   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
3380*/
3381#if (SIZEOF_LONG == 8)
3382# define FAST_CHAR_MASK         0x8000800080008000L
3383# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3384#elif (SIZEOF_LONG == 4)
3385# define FAST_CHAR_MASK         0x80008000L
3386# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3387#else
3388# error C 'long' size should be either 4 or 8!
3389#endif
3390
3391PyObject *
3392PyUnicode_DecodeUTF16Stateful(const char *s,
3393                              Py_ssize_t size,
3394                              const char *errors,
3395                              int *byteorder,
3396                              Py_ssize_t *consumed)
3397{
3398    const char *starts = s;
3399    Py_ssize_t startinpos;
3400    Py_ssize_t endinpos;
3401    Py_ssize_t outpos;
3402    PyUnicodeObject *unicode;
3403    Py_UNICODE *p;
3404    const unsigned char *q, *e, *aligned_end;
3405    int bo = 0;       /* assume native ordering by default */
3406    int native_ordering = 0;
3407    const char *errmsg = "";
3408    /* Offsets from q for retrieving byte pairs in the right order. */
3409#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3410    int ihi = 1, ilo = 0;
3411#else
3412    int ihi = 0, ilo = 1;
3413#endif
3414    PyObject *errorHandler = NULL;
3415    PyObject *exc = NULL;
3416
3417    /* Note: size will always be longer than the resulting Unicode
3418       character count */
3419    unicode = _PyUnicode_New(size);
3420    if (!unicode)
3421        return NULL;
3422    if (size == 0)
3423        return (PyObject *)unicode;
3424
3425    /* Unpack UTF-16 encoded data */
3426    p = unicode->str;
3427    q = (unsigned char *)s;
3428    e = q + size;
3429
3430    if (byteorder)
3431        bo = *byteorder;
3432
3433    /* Check for BOM marks (U+FEFF) in the input and adjust current
3434       byte order setting accordingly. In native mode, the leading BOM
3435       mark is skipped, in all other modes, it is copied to the output
3436       stream as-is (giving a ZWNBSP character). */
3437    if (bo == 0) {
3438        if (size >= 2) {
3439            const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
3440#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3441            if (bom == 0xFEFF) {
3442                q += 2;
3443                bo = -1;
3444            }
3445            else if (bom == 0xFFFE) {
3446                q += 2;
3447                bo = 1;
3448            }
3449#else
3450            if (bom == 0xFEFF) {
3451                q += 2;
3452                bo = 1;
3453            }
3454            else if (bom == 0xFFFE) {
3455                q += 2;
3456                bo = -1;
3457            }
3458#endif
3459        }
3460    }
3461
3462    if (bo == -1) {
3463        /* force LE */
3464        ihi = 1;
3465        ilo = 0;
3466    }
3467    else if (bo == 1) {
3468        /* force BE */
3469        ihi = 0;
3470        ilo = 1;
3471    }
3472#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3473    native_ordering = ilo < ihi;
3474#else
3475    native_ordering = ilo > ihi;
3476#endif
3477
3478    aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
3479    while (1) {
3480        Py_UNICODE ch;
3481        if (e - q < 2) {
3482            /* remaining byte at the end? (size should be even) */
3483            if (q == e || consumed)
3484                break;
3485            errmsg = "truncated data";
3486            startinpos = ((const char *)q) - starts;
3487            endinpos = ((const char *)e) - starts;
3488            outpos = p - PyUnicode_AS_UNICODE(unicode);
3489            goto utf16Error;
3490            /* The remaining input chars are ignored if the callback
3491               chooses to skip the input */
3492        }
3493        /* First check for possible aligned read of a C 'long'. Unaligned
3494           reads are more expensive, better to defer to another iteration. */
3495        if (!((size_t) q & LONG_PTR_MASK)) {
3496            /* Fast path for runs of non-surrogate chars. */
3497            register const unsigned char *_q = q;
3498            Py_UNICODE *_p = p;
3499            if (native_ordering) {
3500                /* Native ordering is simple: as long as the input cannot
3501                   possibly contain a surrogate char, do an unrolled copy
3502                   of several 16-bit code points to the target object.
3503                   The non-surrogate check is done on several input bytes
3504                   at a time (as many as a C 'long' can contain). */
3505                while (_q < aligned_end) {
3506                    unsigned long data = * (unsigned long *) _q;
3507                    if (data & FAST_CHAR_MASK)
3508                        break;
3509                    _p[0] = ((unsigned short *) _q)[0];
3510                    _p[1] = ((unsigned short *) _q)[1];
3511#if (SIZEOF_LONG == 8)
3512                    _p[2] = ((unsigned short *) _q)[2];
3513                    _p[3] = ((unsigned short *) _q)[3];
3514#endif
3515                    _q += SIZEOF_LONG;
3516                    _p += SIZEOF_LONG / 2;
3517                }
3518            }
3519            else {
3520                /* Byteswapped ordering is similar, but we must decompose
3521                   the copy bytewise, and take care of zero'ing out the
3522                   upper bytes if the target object is in 32-bit units
3523                   (that is, in UCS-4 builds). */
3524                while (_q < aligned_end) {
3525                    unsigned long data = * (unsigned long *) _q;
3526                    if (data & SWAPPED_FAST_CHAR_MASK)
3527                        break;
3528                    /* Zero upper bytes in UCS-4 builds */
3529#if (Py_UNICODE_SIZE > 2)
3530                    _p[0] = 0;
3531                    _p[1] = 0;
3532#if (SIZEOF_LONG == 8)
3533                    _p[2] = 0;
3534                    _p[3] = 0;
3535#endif
3536#endif
3537                    /* Issue #4916; UCS-4 builds on big endian machines must
3538                       fill the two last bytes of each 4-byte unit. */
3539#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3540# define OFF 2
3541#else
3542# define OFF 0
3543#endif
3544                    ((unsigned char *) _p)[OFF + 1] = _q[0];
3545                    ((unsigned char *) _p)[OFF + 0] = _q[1];
3546                    ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3547                    ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3548#if (SIZEOF_LONG == 8)
3549                    ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3550                    ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3551                    ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3552                    ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3553#endif
3554#undef OFF
3555                    _q += SIZEOF_LONG;
3556                    _p += SIZEOF_LONG / 2;
3557                }
3558            }
3559            p = _p;
3560            q = _q;
3561            if (e - q < 2)
3562                continue;
3563        }
3564        ch = (q[ihi] << 8) | q[ilo];
3565
3566        q += 2;
3567
3568        if (ch < 0xD800 || ch > 0xDFFF) {
3569            *p++ = ch;
3570            continue;
3571        }
3572
3573        /* UTF-16 code pair: */
3574        if (e - q < 2) {
3575            errmsg = "unexpected end of data";
3576            startinpos = (((const char *)q) - 2) - starts;
3577            endinpos = ((const char *)e) - starts;
3578            goto utf16Error;
3579        }
3580        if (0xD800 <= ch && ch <= 0xDBFF) {
3581            Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3582            q += 2;
3583            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3584#ifndef Py_UNICODE_WIDE
3585                *p++ = ch;
3586                *p++ = ch2;
3587#else
3588                *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3589#endif
3590                continue;
3591            }
3592            else {
3593                errmsg = "illegal UTF-16 surrogate";
3594                startinpos = (((const char *)q)-4)-starts;
3595                endinpos = startinpos+2;
3596                goto utf16Error;
3597            }
3598
3599        }
3600        errmsg = "illegal encoding";
3601        startinpos = (((const char *)q)-2)-starts;
3602        endinpos = startinpos+2;
3603        /* Fall through to report the error */
3604
3605      utf16Error:
3606        outpos = p - PyUnicode_AS_UNICODE(unicode);
3607        if (unicode_decode_call_errorhandler(
3608                errors,
3609                &errorHandler,
3610                "utf16", errmsg,
3611                &starts,
3612                (const char **)&e,
3613                &startinpos,
3614                &endinpos,
3615                &exc,
3616                (const char **)&q,
3617                &unicode,
3618                &outpos,
3619                &p))
3620            goto onError;
3621        /* Update data because unicode_decode_call_errorhandler might have
3622           changed the input object. */
3623        aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
3624    }
3625
3626    if (byteorder)
3627        *byteorder = bo;
3628
3629    if (consumed)
3630        *consumed = (const char *)q-starts;
3631
3632    /* Adjust length */
3633    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
3634        goto onError;
3635
3636    Py_XDECREF(errorHandler);
3637    Py_XDECREF(exc);
3638    return (PyObject *)unicode;
3639
3640  onError:
3641    Py_DECREF(unicode);
3642    Py_XDECREF(errorHandler);
3643    Py_XDECREF(exc);
3644    return NULL;
3645}
3646
3647#undef FAST_CHAR_MASK
3648#undef SWAPPED_FAST_CHAR_MASK
3649
3650PyObject *
3651PyUnicode_EncodeUTF16(const Py_UNICODE *s,
3652                      Py_ssize_t size,
3653                      const char *errors,
3654                      int byteorder)
3655{
3656    PyObject *v;
3657    unsigned char *p;
3658    Py_ssize_t nsize, bytesize;
3659#ifdef Py_UNICODE_WIDE
3660    Py_ssize_t i, pairs;
3661#else
3662    const int pairs = 0;
3663#endif
3664    /* Offsets from p for storing byte pairs in the right order. */
3665#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3666    int ihi = 1, ilo = 0;
3667#else
3668    int ihi = 0, ilo = 1;
3669#endif
3670
3671#define STORECHAR(CH)                           \
3672    do {                                        \
3673        p[ihi] = ((CH) >> 8) & 0xff;            \
3674        p[ilo] = (CH) & 0xff;                   \
3675        p += 2;                                 \
3676    } while(0)
3677
3678#ifdef Py_UNICODE_WIDE
3679    for (i = pairs = 0; i < size; i++)
3680        if (s[i] >= 0x10000)
3681            pairs++;
3682#endif
3683    /* 2 * (size + pairs + (byteorder == 0)) */
3684    if (size > PY_SSIZE_T_MAX ||
3685        size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
3686        return PyErr_NoMemory();
3687    nsize = size + pairs + (byteorder == 0);
3688    bytesize = nsize * 2;
3689    if (bytesize / 2 != nsize)
3690        return PyErr_NoMemory();
3691    v = PyBytes_FromStringAndSize(NULL, bytesize);
3692    if (v == NULL)
3693        return NULL;
3694
3695    p = (unsigned char *)PyBytes_AS_STRING(v);
3696    if (byteorder == 0)
3697        STORECHAR(0xFEFF);
3698    if (size == 0)
3699        goto done;
3700
3701    if (byteorder == -1) {
3702        /* force LE */
3703        ihi = 1;
3704        ilo = 0;
3705    }
3706    else if (byteorder == 1) {
3707        /* force BE */
3708        ihi = 0;
3709        ilo = 1;
3710    }
3711
3712    while (size-- > 0) {
3713        Py_UNICODE ch = *s++;
3714        Py_UNICODE ch2 = 0;
3715#ifdef Py_UNICODE_WIDE
3716        if (ch >= 0x10000) {
3717            ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3718            ch  = 0xD800 | ((ch-0x10000) >> 10);
3719        }
3720#endif
3721        STORECHAR(ch);
3722        if (ch2)
3723            STORECHAR(ch2);
3724    }
3725
3726  done:
3727    return v;
3728#undef STORECHAR
3729}
3730
3731PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3732{
3733    if (!PyUnicode_Check(unicode)) {
3734        PyErr_BadArgument();
3735        return NULL;
3736    }
3737    return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
3738                                 PyUnicode_GET_SIZE(unicode),
3739                                 NULL,
3740                                 0);
3741}
3742
3743/* --- Unicode Escape Codec ----------------------------------------------- */
3744
/* Cached pointer to the character-name lookup C API.  It starts out NULL
   and is filled in by PyUnicode_DecodeUnicodeEscape() through
   PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1) the first time a
   "\N" escape is decoded. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
3746
3747PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
3748                                        Py_ssize_t size,
3749                                        const char *errors)
3750{
3751    const char *starts = s;
3752    Py_ssize_t startinpos;
3753    Py_ssize_t endinpos;
3754    Py_ssize_t outpos;
3755    int i;
3756    PyUnicodeObject *v;
3757    Py_UNICODE *p;
3758    const char *end;
3759    char* message;
3760    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
3761    PyObject *errorHandler = NULL;
3762    PyObject *exc = NULL;
3763
3764    /* Escaped strings will always be longer than the resulting
3765       Unicode string, so we start with size here and then reduce the
3766       length after conversion to the true value.
3767       (but if the error callback returns a long replacement string
3768       we'll have to allocate more space) */
3769    v = _PyUnicode_New(size);
3770    if (v == NULL)
3771        goto onError;
3772    if (size == 0)
3773        return (PyObject *)v;
3774
3775    p = PyUnicode_AS_UNICODE(v);
3776    end = s + size;
3777
3778    while (s < end) {
3779        unsigned char c;
3780        Py_UNICODE x;
3781        int digits;
3782
3783        /* Non-escape characters are interpreted as Unicode ordinals */
3784        if (*s != '\\') {
3785            *p++ = (unsigned char) *s++;
3786            continue;
3787        }
3788
3789        startinpos = s-starts;
3790        /* \ - Escapes */
3791        s++;
3792        c = *s++;
3793        if (s > end)
3794            c = '\0'; /* Invalid after \ */
3795        switch (c) {
3796
3797            /* \x escapes */
3798        case '\n': break;
3799        case '\\': *p++ = '\\'; break;
3800        case '\'': *p++ = '\''; break;
3801        case '\"': *p++ = '\"'; break;
3802        case 'b': *p++ = '\b'; break;
3803        case 'f': *p++ = '\014'; break; /* FF */
3804        case 't': *p++ = '\t'; break;
3805        case 'n': *p++ = '\n'; break;
3806        case 'r': *p++ = '\r'; break;
3807        case 'v': *p++ = '\013'; break; /* VT */
3808        case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3809
3810            /* \OOO (octal) escapes */
3811        case '0': case '1': case '2': case '3':
3812        case '4': case '5': case '6': case '7':
3813            x = s[-1] - '0';
3814            if (s < end && '0' <= *s && *s <= '7') {
3815                x = (x<<3) + *s++ - '0';
3816                if (s < end && '0' <= *s && *s <= '7')
3817                    x = (x<<3) + *s++ - '0';
3818            }
3819            *p++ = x;
3820            break;
3821
3822            /* hex escapes */
3823            /* \xXX */
3824        case 'x':
3825            digits = 2;
3826            message = "truncated \\xXX escape";
3827            goto hexescape;
3828
3829            /* \uXXXX */
3830        case 'u':
3831            digits = 4;
3832            message = "truncated \\uXXXX escape";
3833            goto hexescape;
3834
3835            /* \UXXXXXXXX */
3836        case 'U':
3837            digits = 8;
3838            message = "truncated \\UXXXXXXXX escape";
3839        hexescape:
3840            chr = 0;
3841            outpos = p-PyUnicode_AS_UNICODE(v);
3842            if (s+digits>end) {
3843                endinpos = size;
3844                if (unicode_decode_call_errorhandler(
3845                        errors, &errorHandler,
3846                        "unicodeescape", "end of string in escape sequence",
3847                        &starts, &end, &startinpos, &endinpos, &exc, &s,
3848                        &v, &outpos, &p))
3849                    goto onError;
3850                goto nextByte;
3851            }
3852            for (i = 0; i < digits; ++i) {
3853                c = (unsigned char) s[i];
3854                if (!Py_ISXDIGIT(c)) {
3855                    endinpos = (s+i+1)-starts;
3856                    if (unicode_decode_call_errorhandler(
3857                            errors, &errorHandler,
3858                            "unicodeescape", message,
3859                            &starts, &end, &startinpos, &endinpos, &exc, &s,
3860                            &v, &outpos, &p))
3861                        goto onError;
3862                    goto nextByte;
3863                }
3864                chr = (chr<<4) & ~0xF;
3865                if (c >= '0' && c <= '9')
3866                    chr += c - '0';
3867                else if (c >= 'a' && c <= 'f')
3868                    chr += 10 + c - 'a';
3869                else
3870                    chr += 10 + c - 'A';
3871            }
3872            s += i;
3873            if (chr == 0xffffffff && PyErr_Occurred())
3874                /* _decoding_error will have already written into the
3875                   target buffer. */
3876                break;
3877        store:
3878            /* when we get here, chr is a 32-bit unicode character */
3879            if (chr <= 0xffff)
3880                /* UCS-2 character */
3881                *p++ = (Py_UNICODE) chr;
3882            else if (chr <= 0x10ffff) {
3883                /* UCS-4 character. Either store directly, or as
3884                   surrogate pair. */
3885#ifdef Py_UNICODE_WIDE
3886                *p++ = chr;
3887#else
3888                chr -= 0x10000L;
3889                *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
3890                *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
3891#endif
3892            } else {
3893                endinpos = s-starts;
3894                outpos = p-PyUnicode_AS_UNICODE(v);
3895                if (unicode_decode_call_errorhandler(
3896                        errors, &errorHandler,
3897                        "unicodeescape", "illegal Unicode character",
3898                        &starts, &end, &startinpos, &endinpos, &exc, &s,
3899                        &v, &outpos, &p))
3900                    goto onError;
3901            }
3902            break;
3903
3904            /* \N{name} */
3905        case 'N':
3906            message = "malformed \\N character escape";
3907            if (ucnhash_CAPI == NULL) {
3908                /* load the unicode data module */
3909                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
3910                if (ucnhash_CAPI == NULL)
3911                    goto ucnhashError;
3912            }
3913            if (*s == '{') {
3914                const char *start = s+1;
3915                /* look for the closing brace */
3916                while (*s != '}' && s < end)
3917                    s++;
3918                if (s > start && s < end && *s == '}') {
3919                    /* found a name.  look it up in the unicode database */
3920                    message = "unknown Unicode character name";
3921                    s++;
3922                    if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
3923                        goto store;
3924                }
3925            }
3926            endinpos = s-starts;
3927            outpos = p-PyUnicode_AS_UNICODE(v);
3928            if (unicode_decode_call_errorhandler(
3929                    errors, &errorHandler,
3930                    "unicodeescape", message,
3931                    &starts, &end, &startinpos, &endinpos, &exc, &s,
3932                    &v, &outpos, &p))
3933                goto onError;
3934            break;
3935
3936        default:
3937            if (s > end) {
3938                message = "\\ at end of string";
3939                s--;
3940                endinpos = s-starts;
3941                outpos = p-PyUnicode_AS_UNICODE(v);
3942                if (unicode_decode_call_errorhandler(
3943                        errors, &errorHandler,
3944                        "unicodeescape", message,
3945                        &starts, &end, &startinpos, &endinpos, &exc, &s,
3946                        &v, &outpos, &p))
3947                    goto onError;
3948            }
3949            else {
3950                *p++ = '\\';
3951                *p++ = (unsigned char)s[-1];
3952            }
3953            break;
3954        }
3955      nextByte:
3956        ;
3957    }
3958    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3959        goto onError;
3960    Py_XDECREF(errorHandler);
3961    Py_XDECREF(exc);
3962    return (PyObject *)v;
3963
3964  ucnhashError:
3965    PyErr_SetString(
3966        PyExc_UnicodeError,
3967        "\\N escapes not supported (can't load unicodedata module)"
3968        );
3969    Py_XDECREF(v);
3970    Py_XDECREF(errorHandler);
3971    Py_XDECREF(exc);
3972    return NULL;
3973
3974  onError:
3975    Py_XDECREF(v);
3976    Py_XDECREF(errorHandler);
3977    Py_XDECREF(exc);
3978    return NULL;
3979}
3980
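/* Return a Unicode-Escape string version of the Unicode object.

   NOTE: this comment is historical.  It describes the encoder,
   PyUnicode_EncodeUnicodeEscape(), which is defined after the findchar()
   helper below.  That function takes no "quotes" argument and never
   encloses its result in u"" or u'' quotes.

*/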
3987
3988Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
3989                                             Py_ssize_t size,
3990                                             Py_UNICODE ch)
3991{
3992    /* like wcschr, but doesn't stop at NULL characters */
3993
3994    while (size-- > 0) {
3995        if (*s == ch)
3996            return s;
3997        s++;
3998    }
3999
4000    return NULL;
4001}
4002
/* Lowercase hexadecimal digits, indexed by a 4-bit value.  Used by the
   escape encoders in this file to format the digits of their escapes. */
static const char *hexdigits = "0123456789abcdef";
4004
4005PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
4006                                        Py_ssize_t size)
4007{
4008    PyObject *repr;
4009    char *p;
4010
4011#ifdef Py_UNICODE_WIDE
4012    const Py_ssize_t expandsize = 10;
4013#else
4014    const Py_ssize_t expandsize = 6;
4015#endif
4016
4017    /* XXX(nnorwitz): rather than over-allocating, it would be
4018       better to choose a different scheme.  Perhaps scan the
4019       first N-chars of the string and allocate based on that size.
4020    */
4021    /* Initial allocation is based on the longest-possible unichr
4022       escape.
4023
4024       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
4025       unichr, so in this case it's the longest unichr escape. In
4026       narrow (UTF-16) builds this is five chars per source unichr
4027       since there are two unichrs in the surrogate pair, so in narrow
4028       (UTF-16) builds it's not the longest unichr escape.
4029
4030       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
4031       so in the narrow (UTF-16) build case it's the longest unichr
4032       escape.
4033    */
4034
4035    if (size == 0)
4036        return PyBytes_FromStringAndSize(NULL, 0);
4037
4038    if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
4039        return PyErr_NoMemory();
4040
4041    repr = PyBytes_FromStringAndSize(NULL,
4042                                     2
4043                                     + expandsize*size
4044                                     + 1);
4045    if (repr == NULL)
4046        return NULL;
4047
4048    p = PyBytes_AS_STRING(repr);
4049
4050    while (size-- > 0) {
4051        Py_UNICODE ch = *s++;
4052
4053        /* Escape backslashes */
4054        if (ch == '\\') {
4055            *p++ = '\\';
4056            *p++ = (char) ch;
4057            continue;
4058        }
4059
4060#ifdef Py_UNICODE_WIDE
4061        /* Map 21-bit characters to '\U00xxxxxx' */
4062        else if (ch >= 0x10000) {
4063            *p++ = '\\';
4064            *p++ = 'U';
4065            *p++ = hexdigits[(ch >> 28) & 0x0000000F];
4066            *p++ = hexdigits[(ch >> 24) & 0x0000000F];
4067            *p++ = hexdigits[(ch >> 20) & 0x0000000F];
4068            *p++ = hexdigits[(ch >> 16) & 0x0000000F];
4069            *p++ = hexdigits[(ch >> 12) & 0x0000000F];
4070            *p++ = hexdigits[(ch >> 8) & 0x0000000F];
4071            *p++ = hexdigits[(ch >> 4) & 0x0000000F];
4072            *p++ = hexdigits[ch & 0x0000000F];
4073            continue;
4074        }
4075#else
4076        /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4077        else if (ch >= 0xD800 && ch < 0xDC00) {
4078            Py_UNICODE ch2;
4079            Py_UCS4 ucs;
4080
4081            ch2 = *s++;
4082            size--;
4083            if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
4084                ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4085                *p++ = '\\';
4086                *p++ = 'U';
4087                *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
4088                *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
4089                *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
4090                *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
4091                *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
4092                *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
4093                *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
4094                *p++ = hexdigits[ucs & 0x0000000F];
4095                continue;
4096            }
4097            /* Fall through: isolated surrogates are copied as-is */
4098            s--;
4099            size++;
4100        }
4101#endif
4102
4103        /* Map 16-bit characters to '\uxxxx' */
4104        if (ch >= 256) {
4105            *p++ = '\\';
4106            *p++ = 'u';
4107            *p++ = hexdigits[(ch >> 12) & 0x000F];
4108            *p++ = hexdigits[(ch >> 8) & 0x000F];
4109            *p++ = hexdigits[(ch >> 4) & 0x000F];
4110            *p++ = hexdigits[ch & 0x000F];
4111        }
4112
4113        /* Map special whitespace to '\t', \n', '\r' */
4114        else if (ch == '\t') {
4115            *p++ = '\\';
4116            *p++ = 't';
4117        }
4118        else if (ch == '\n') {
4119            *p++ = '\\';
4120            *p++ = 'n';
4121        }
4122        else if (ch == '\r') {
4123            *p++ = '\\';
4124            *p++ = 'r';
4125        }
4126
4127        /* Map non-printable US ASCII to '\xhh' */
4128        else if (ch < ' ' || ch >= 0x7F) {
4129            *p++ = '\\';
4130            *p++ = 'x';
4131            *p++ = hexdigits[(ch >> 4) & 0x000F];
4132            *p++ = hexdigits[ch & 0x000F];
4133        }
4134
4135        /* Copy everything else as-is */
4136        else
4137            *p++ = (char) ch;
4138    }
4139
4140    assert(p - PyBytes_AS_STRING(repr) > 0);
4141    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
4142        return NULL;
4143    return repr;
4144}
4145
4146PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
4147{
4148    PyObject *s;
4149    if (!PyUnicode_Check(unicode)) {
4150        PyErr_BadArgument();
4151        return NULL;
4152    }
4153    s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4154                                      PyUnicode_GET_SIZE(unicode));
4155    return s;
4156}
4157
4158/* --- Raw Unicode Escape Codec ------------------------------------------- */
4159
4160PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
4161                                           Py_ssize_t size,
4162                                           const char *errors)
4163{
4164    const char *starts = s;
4165    Py_ssize_t startinpos;
4166    Py_ssize_t endinpos;
4167    Py_ssize_t outpos;
4168    PyUnicodeObject *v;
4169    Py_UNICODE *p;
4170    const char *end;
4171    const char *bs;
4172    PyObject *errorHandler = NULL;
4173    PyObject *exc = NULL;
4174
4175    /* Escaped strings will always be longer than the resulting
4176       Unicode string, so we start with size here and then reduce the
4177       length after conversion to the true value. (But decoding error
4178       handler might have to resize the string) */
4179    v = _PyUnicode_New(size);
4180    if (v == NULL)
4181        goto onError;
4182    if (size == 0)
4183        return (PyObject *)v;
4184    p = PyUnicode_AS_UNICODE(v);
4185    end = s + size;
4186    while (s < end) {
4187        unsigned char c;
4188        Py_UCS4 x;
4189        int i;
4190        int count;
4191
4192        /* Non-escape characters are interpreted as Unicode ordinals */
4193        if (*s != '\\') {
4194            *p++ = (unsigned char)*s++;
4195            continue;
4196        }
4197        startinpos = s-starts;
4198
4199        /* \u-escapes are only interpreted iff the number of leading
4200           backslashes if odd */
4201        bs = s;
4202        for (;s < end;) {
4203            if (*s != '\\')
4204                break;
4205            *p++ = (unsigned char)*s++;
4206        }
4207        if (((s - bs) & 1) == 0 ||
4208            s >= end ||
4209            (*s != 'u' && *s != 'U')) {
4210            continue;
4211        }
4212        p--;
4213        count = *s=='u' ? 4 : 8;
4214        s++;
4215
4216        /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
4217        outpos = p-PyUnicode_AS_UNICODE(v);
4218        for (x = 0, i = 0; i < count; ++i, ++s) {
4219            c = (unsigned char)*s;
4220            if (!Py_ISXDIGIT(c)) {
4221                endinpos = s-starts;
4222                if (unicode_decode_call_errorhandler(
4223                        errors, &errorHandler,
4224                        "rawunicodeescape", "truncated \\uXXXX",
4225                        &starts, &end, &startinpos, &endinpos, &exc, &s,
4226                        &v, &outpos, &p))
4227                    goto onError;
4228                goto nextByte;
4229            }
4230            x = (x<<4) & ~0xF;
4231            if (c >= '0' && c <= '9')
4232                x += c - '0';
4233            else if (c >= 'a' && c <= 'f')
4234                x += 10 + c - 'a';
4235            else
4236                x += 10 + c - 'A';
4237        }
4238        if (x <= 0xffff)
4239            /* UCS-2 character */
4240            *p++ = (Py_UNICODE) x;
4241        else if (x <= 0x10ffff) {
4242            /* UCS-4 character. Either store directly, or as
4243               surrogate pair. */
4244#ifdef Py_UNICODE_WIDE
4245            *p++ = (Py_UNICODE) x;
4246#else
4247            x -= 0x10000L;
4248            *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
4249            *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
4250#endif
4251        } else {
4252            endinpos = s-starts;
4253            outpos = p-PyUnicode_AS_UNICODE(v);
4254            if (unicode_decode_call_errorhandler(
4255                    errors, &errorHandler,
4256                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
4257                    &starts, &end, &startinpos, &endinpos, &exc, &s,
4258                    &v, &outpos, &p))
4259                goto onError;
4260        }
4261      nextByte:
4262        ;
4263    }
4264    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4265        goto onError;
4266    Py_XDECREF(errorHandler);
4267    Py_XDECREF(exc);
4268    return (PyObject *)v;
4269
4270  onError:
4271    Py_XDECREF(v);
4272    Py_XDECREF(errorHandler);
4273    Py_XDECREF(exc);
4274    return NULL;
4275}
4276
4277PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
4278                                           Py_ssize_t size)
4279{
4280    PyObject *repr;
4281    char *p;
4282    char *q;
4283
4284#ifdef Py_UNICODE_WIDE
4285    const Py_ssize_t expandsize = 10;
4286#else
4287    const Py_ssize_t expandsize = 6;
4288#endif
4289
4290    if (size > PY_SSIZE_T_MAX / expandsize)
4291        return PyErr_NoMemory();
4292
4293    repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
4294    if (repr == NULL)
4295        return NULL;
4296    if (size == 0)
4297        return repr;
4298
4299    p = q = PyBytes_AS_STRING(repr);
4300    while (size-- > 0) {
4301        Py_UNICODE ch = *s++;
4302#ifdef Py_UNICODE_WIDE
4303        /* Map 32-bit characters to '\Uxxxxxxxx' */
4304        if (ch >= 0x10000) {
4305            *p++ = '\\';
4306            *p++ = 'U';
4307            *p++ = hexdigits[(ch >> 28) & 0xf];
4308            *p++ = hexdigits[(ch >> 24) & 0xf];
4309            *p++ = hexdigits[(ch >> 20) & 0xf];
4310            *p++ = hexdigits[(ch >> 16) & 0xf];
4311            *p++ = hexdigits[(ch >> 12) & 0xf];
4312            *p++ = hexdigits[(ch >> 8) & 0xf];
4313            *p++ = hexdigits[(ch >> 4) & 0xf];
4314            *p++ = hexdigits[ch & 15];
4315        }
4316        else
4317#else
4318            /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4319            if (ch >= 0xD800 && ch < 0xDC00) {
4320                Py_UNICODE ch2;
4321                Py_UCS4 ucs;
4322
4323                ch2 = *s++;
4324                size--;
4325                if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
4326                    ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4327                    *p++ = '\\';
4328                    *p++ = 'U';
4329                    *p++ = hexdigits[(ucs >> 28) & 0xf];
4330                    *p++ = hexdigits[(ucs >> 24) & 0xf];
4331                    *p++ = hexdigits[(ucs >> 20) & 0xf];
4332                    *p++ = hexdigits[(ucs >> 16) & 0xf];
4333                    *p++ = hexdigits[(ucs >> 12) & 0xf];
4334                    *p++ = hexdigits[(ucs >> 8) & 0xf];
4335                    *p++ = hexdigits[(ucs >> 4) & 0xf];
4336                    *p++ = hexdigits[ucs & 0xf];
4337                    continue;
4338                }
4339                /* Fall through: isolated surrogates are copied as-is */
4340                s--;
4341                size++;
4342            }
4343#endif
4344        /* Map 16-bit characters to '\uxxxx' */
4345        if (ch >= 256) {
4346            *p++ = '\\';
4347            *p++ = 'u';
4348            *p++ = hexdigits[(ch >> 12) & 0xf];
4349            *p++ = hexdigits[(ch >> 8) & 0xf];
4350            *p++ = hexdigits[(ch >> 4) & 0xf];
4351            *p++ = hexdigits[ch & 15];
4352        }
4353        /* Copy everything else as-is */
4354        else
4355            *p++ = (char) ch;
4356    }
4357    size = p - q;
4358
4359    assert(size > 0);
4360    if (_PyBytes_Resize(&repr, size) < 0)
4361        return NULL;
4362    return repr;
4363}
4364
4365PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
4366{
4367    PyObject *s;
4368    if (!PyUnicode_Check(unicode)) {
4369        PyErr_BadArgument();
4370        return NULL;
4371    }
4372    s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4373                                         PyUnicode_GET_SIZE(unicode));
4374
4375    return s;
4376}
4377
4378/* --- Unicode Internal Codec ------------------------------------------- */
4379
4380PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
4381                                           Py_ssize_t size,
4382                                           const char *errors)
4383{
4384    const char *starts = s;
4385    Py_ssize_t startinpos;
4386    Py_ssize_t endinpos;
4387    Py_ssize_t outpos;
4388    PyUnicodeObject *v;
4389    Py_UNICODE *p;
4390    const char *end;
4391    const char *reason;
4392    PyObject *errorHandler = NULL;
4393    PyObject *exc = NULL;
4394
4395#ifdef Py_UNICODE_WIDE
4396    Py_UNICODE unimax = PyUnicode_GetMax();
4397#endif
4398
4399    /* XXX overflow detection missing */
4400    v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4401    if (v == NULL)
4402        goto onError;
4403    if (PyUnicode_GetSize((PyObject *)v) == 0)
4404        return (PyObject *)v;
4405    p = PyUnicode_AS_UNICODE(v);
4406    end = s + size;
4407
4408    while (s < end) {
4409        memcpy(p, s, sizeof(Py_UNICODE));
4410        /* We have to sanity check the raw data, otherwise doom looms for
4411           some malformed UCS-4 data. */
4412        if (
4413#ifdef Py_UNICODE_WIDE
4414            *p > unimax || *p < 0 ||
4415#endif
4416            end-s < Py_UNICODE_SIZE
4417            )
4418        {
4419            startinpos = s - starts;
4420            if (end-s < Py_UNICODE_SIZE) {
4421                endinpos = end-starts;
4422                reason = "truncated input";
4423            }
4424            else {
4425                endinpos = s - starts + Py_UNICODE_SIZE;
4426                reason = "illegal code point (> 0x10FFFF)";
4427            }
4428            outpos = p - PyUnicode_AS_UNICODE(v);
4429            if (unicode_decode_call_errorhandler(
4430                    errors, &errorHandler,
4431                    "unicode_internal", reason,
4432                    &starts, &end, &startinpos, &endinpos, &exc, &s,
4433                    &v, &outpos, &p)) {
4434                goto onError;
4435            }
4436        }
4437        else {
4438            p++;
4439            s += Py_UNICODE_SIZE;
4440        }
4441    }
4442
4443    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4444        goto onError;
4445    Py_XDECREF(errorHandler);
4446    Py_XDECREF(exc);
4447    return (PyObject *)v;
4448
4449  onError:
4450    Py_XDECREF(v);
4451    Py_XDECREF(errorHandler);
4452    Py_XDECREF(exc);
4453    return NULL;
4454}
4455
4456/* --- Latin-1 Codec ------------------------------------------------------ */
4457
4458PyObject *PyUnicode_DecodeLatin1(const char *s,
4459                                 Py_ssize_t size,
4460                                 const char *errors)
4461{
4462    PyUnicodeObject *v;
4463    Py_UNICODE *p;
4464    const char *e, *unrolled_end;
4465
4466    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
4467    if (size == 1) {
4468        Py_UNICODE r = *(unsigned char*)s;
4469        return PyUnicode_FromUnicode(&r, 1);
4470    }
4471
4472    v = _PyUnicode_New(size);
4473    if (v == NULL)
4474        goto onError;
4475    if (size == 0)
4476        return (PyObject *)v;
4477    p = PyUnicode_AS_UNICODE(v);
4478    e = s + size;
4479    /* Unrolling the copy makes it much faster by reducing the looping
4480       overhead. This is similar to what many memcpy() implementations do. */
4481    unrolled_end = e - 4;
4482    while (s < unrolled_end) {
4483        p[0] = (unsigned char) s[0];
4484        p[1] = (unsigned char) s[1];
4485        p[2] = (unsigned char) s[2];
4486        p[3] = (unsigned char) s[3];
4487        s += 4;
4488        p += 4;
4489    }
4490    while (s < e)
4491        *p++ = (unsigned char) *s++;
4492    return (PyObject *)v;
4493
4494  onError:
4495    Py_XDECREF(v);
4496    return NULL;
4497}
4498
4499/* create or adjust a UnicodeEncodeError */
4500static void make_encode_exception(PyObject **exceptionObject,
4501                                  const char *encoding,
4502                                  const Py_UNICODE *unicode, Py_ssize_t size,
4503                                  Py_ssize_t startpos, Py_ssize_t endpos,
4504                                  const char *reason)
4505{
4506    if (*exceptionObject == NULL) {
4507        *exceptionObject = PyUnicodeEncodeError_Create(
4508            encoding, unicode, size, startpos, endpos, reason);
4509    }
4510    else {
4511        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4512            goto onError;
4513        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4514            goto onError;
4515        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4516            goto onError;
4517        return;
4518      onError:
4519        Py_DECREF(*exceptionObject);
4520        *exceptionObject = NULL;
4521    }
4522}
4523
4524/* raises a UnicodeEncodeError */
4525static void raise_encode_exception(PyObject **exceptionObject,
4526                                   const char *encoding,
4527                                   const Py_UNICODE *unicode, Py_ssize_t size,
4528                                   Py_ssize_t startpos, Py_ssize_t endpos,
4529                                   const char *reason)
4530{
4531    make_encode_exception(exceptionObject,
4532                          encoding, unicode, size, startpos, endpos, reason);
4533    if (*exceptionObject != NULL)
4534        PyCodec_StrictErrors(*exceptionObject);
4535}
4536
4537/* error handling callback helper:
4538   build arguments, call the callback and check the arguments,
4539   put the result into newpos and return the replacement string, which
4540   has to be freed by the caller */
4541static PyObject *unicode_encode_call_errorhandler(const char *errors,
4542                                                  PyObject **errorHandler,
4543                                                  const char *encoding, const char *reason,
4544                                                  const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4545                                                  Py_ssize_t startpos, Py_ssize_t endpos,
4546                                                  Py_ssize_t *newpos)
4547{
4548    static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
4549
4550    PyObject *restuple;
4551    PyObject *resunicode;
4552
4553    if (*errorHandler == NULL) {
4554        *errorHandler = PyCodec_LookupError(errors);
4555        if (*errorHandler == NULL)
4556            return NULL;
4557    }
4558
4559    make_encode_exception(exceptionObject,
4560                          encoding, unicode, size, startpos, endpos, reason);
4561    if (*exceptionObject == NULL)
4562        return NULL;
4563
4564    restuple = PyObject_CallFunctionObjArgs(
4565        *errorHandler, *exceptionObject, NULL);
4566    if (restuple == NULL)
4567        return NULL;
4568    if (!PyTuple_Check(restuple)) {
4569        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4570        Py_DECREF(restuple);
4571        return NULL;
4572    }
4573    if (!PyArg_ParseTuple(restuple, argparse,
4574                          &resunicode, newpos)) {
4575        Py_DECREF(restuple);
4576        return NULL;
4577    }
4578    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4579        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4580        Py_DECREF(restuple);
4581        return NULL;
4582    }
4583    if (*newpos<0)
4584        *newpos = size+*newpos;
4585    if (*newpos<0 || *newpos>size) {
4586        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4587        Py_DECREF(restuple);
4588        return NULL;
4589    }
4590    Py_INCREF(resunicode);
4591    Py_DECREF(restuple);
4592    return resunicode;
4593}
4594
4595static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
4596                                     Py_ssize_t size,
4597                                     const char *errors,
4598                                     int limit)
4599{
4600    /* output object */
4601    PyObject *res;
4602    /* pointers to the beginning and end+1 of input */
4603    const Py_UNICODE *startp = p;
4604    const Py_UNICODE *endp = p + size;
4605    /* pointer to the beginning of the unencodable characters */
4606    /* const Py_UNICODE *badp = NULL; */
4607    /* pointer into the output */
4608    char *str;
4609    /* current output position */
4610    Py_ssize_t ressize;
4611    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4612    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
4613    PyObject *errorHandler = NULL;
4614    PyObject *exc = NULL;
4615    /* the following variable is used for caching string comparisons
4616     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4617    int known_errorHandler = -1;
4618
4619    /* allocate enough for a simple encoding without
4620       replacements, if we need more, we'll resize */
4621    if (size == 0)
4622        return PyBytes_FromStringAndSize(NULL, 0);
4623    res = PyBytes_FromStringAndSize(NULL, size);
4624    if (res == NULL)
4625        return NULL;
4626    str = PyBytes_AS_STRING(res);
4627    ressize = size;
4628
4629    while (p<endp) {
4630        Py_UNICODE c = *p;
4631
4632        /* can we encode this? */
4633        if (c<limit) {
4634            /* no overflow check, because we know that the space is enough */
4635            *str++ = (char)c;
4636            ++p;
4637        }
4638        else {
4639            Py_ssize_t unicodepos = p-startp;
4640            Py_ssize_t requiredsize;
4641            PyObject *repunicode;
4642            Py_ssize_t repsize;
4643            Py_ssize_t newpos;
4644            Py_ssize_t respos;
4645            Py_UNICODE *uni2;
4646            /* startpos for collecting unencodable chars */
4647            const Py_UNICODE *collstart = p;
4648            const Py_UNICODE *collend = p;
4649            /* find all unecodable characters */
4650            while ((collend < endp) && ((*collend)>=limit))
4651                ++collend;
4652            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4653            if (known_errorHandler==-1) {
4654                if ((errors==NULL) || (!strcmp(errors, "strict")))
4655                    known_errorHandler = 1;
4656                else if (!strcmp(errors, "replace"))
4657                    known_errorHandler = 2;
4658                else if (!strcmp(errors, "ignore"))
4659                    known_errorHandler = 3;
4660                else if (!strcmp(errors, "xmlcharrefreplace"))
4661                    known_errorHandler = 4;
4662                else
4663                    known_errorHandler = 0;
4664            }
4665            switch (known_errorHandler) {
4666            case 1: /* strict */
4667                raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4668                goto onError;
4669            case 2: /* replace */
4670                while (collstart++<collend)
4671                    *str++ = '?'; /* fall through */
4672            case 3: /* ignore */
4673                p = collend;
4674                break;
4675            case 4: /* xmlcharrefreplace */
4676                respos = str - PyBytes_AS_STRING(res);
4677                /* determine replacement size (temporarily (mis)uses p) */
4678                for (p = collstart, repsize = 0; p < collend; ++p) {
4679                    if (*p<10)
4680                        repsize += 2+1+1;
4681                    else if (*p<100)
4682                        repsize += 2+2+1;
4683                    else if (*p<1000)
4684                        repsize += 2+3+1;
4685                    else if (*p<10000)
4686                        repsize += 2+4+1;
4687#ifndef Py_UNICODE_WIDE
4688                    else
4689                        repsize += 2+5+1;
4690#else
4691                    else if (*p<100000)
4692                        repsize += 2+5+1;
4693                    else if (*p<1000000)
4694                        repsize += 2+6+1;
4695                    else
4696                        repsize += 2+7+1;
4697#endif
4698                }
4699                requiredsize = respos+repsize+(endp-collend);
4700                if (requiredsize > ressize) {
4701                    if (requiredsize<2*ressize)
4702                        requiredsize = 2*ressize;
4703                    if (_PyBytes_Resize(&res, requiredsize))
4704                        goto onError;
4705                    str = PyBytes_AS_STRING(res) + respos;
4706                    ressize = requiredsize;
4707                }
4708                /* generate replacement (temporarily (mis)uses p) */
4709                for (p = collstart; p < collend; ++p) {
4710                    str += sprintf(str, "&#%d;", (int)*p);
4711                }
4712                p = collend;
4713                break;
4714            default:
4715                repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4716                                                              encoding, reason, startp, size, &exc,
4717                                                              collstart-startp, collend-startp, &newpos);
4718                if (repunicode == NULL)
4719                    goto onError;
4720                if (PyBytes_Check(repunicode)) {
4721                    /* Directly copy bytes result to output. */
4722                    repsize = PyBytes_Size(repunicode);
4723                    if (repsize > 1) {
4724                        /* Make room for all additional bytes. */
4725                        respos = str - PyBytes_AS_STRING(res);
4726                        if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4727                            Py_DECREF(repunicode);
4728                            goto onError;
4729                        }
4730                        str = PyBytes_AS_STRING(res) + respos;
4731                        ressize += repsize-1;
4732                    }
4733                    memcpy(str, PyBytes_AsString(repunicode), repsize);
4734                    str += repsize;
4735                    p = startp + newpos;
4736                    Py_DECREF(repunicode);
4737                    break;
4738                }
4739                /* need more space? (at least enough for what we
4740                   have+the replacement+the rest of the string, so
4741                   we won't have to check space for encodable characters) */
4742                respos = str - PyBytes_AS_STRING(res);
4743                repsize = PyUnicode_GET_SIZE(repunicode);
4744                requiredsize = respos+repsize+(endp-collend);
4745                if (requiredsize > ressize) {
4746                    if (requiredsize<2*ressize)
4747                        requiredsize = 2*ressize;
4748                    if (_PyBytes_Resize(&res, requiredsize)) {
4749                        Py_DECREF(repunicode);
4750                        goto onError;
4751                    }
4752                    str = PyBytes_AS_STRING(res) + respos;
4753                    ressize = requiredsize;
4754                }
4755                /* check if there is anything unencodable in the replacement
4756                   and copy it to the output */
4757                for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4758                    c = *uni2;
4759                    if (c >= limit) {
4760                        raise_encode_exception(&exc, encoding, startp, size,
4761                                               unicodepos, unicodepos+1, reason);
4762                        Py_DECREF(repunicode);
4763                        goto onError;
4764                    }
4765                    *str = (char)c;
4766                }
4767                p = startp + newpos;
4768                Py_DECREF(repunicode);
4769            }
4770        }
4771    }
4772    /* Resize if we allocated to much */
4773    size = str - PyBytes_AS_STRING(res);
4774    if (size < ressize) { /* If this falls res will be NULL */
4775        assert(size >= 0);
4776        if (_PyBytes_Resize(&res, size) < 0)
4777            goto onError;
4778    }
4779
4780    Py_XDECREF(errorHandler);
4781    Py_XDECREF(exc);
4782    return res;
4783
4784  onError:
4785    Py_XDECREF(res);
4786    Py_XDECREF(errorHandler);
4787    Py_XDECREF(exc);
4788    return NULL;
4789}
4790
4791PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
4792                                 Py_ssize_t size,
4793                                 const char *errors)
4794{
4795    return unicode_encode_ucs1(p, size, errors, 256);
4796}
4797
4798PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4799{
4800    if (!PyUnicode_Check(unicode)) {
4801        PyErr_BadArgument();
4802        return NULL;
4803    }
4804    return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
4805                                  PyUnicode_GET_SIZE(unicode),
4806                                  NULL);
4807}
4808
4809/* --- 7-bit ASCII Codec -------------------------------------------------- */
4810
4811PyObject *PyUnicode_DecodeASCII(const char *s,
4812                                Py_ssize_t size,
4813                                const char *errors)
4814{
4815    const char *starts = s;
4816    PyUnicodeObject *v;
4817    Py_UNICODE *p;
4818    Py_ssize_t startinpos;
4819    Py_ssize_t endinpos;
4820    Py_ssize_t outpos;
4821    const char *e;
4822    PyObject *errorHandler = NULL;
4823    PyObject *exc = NULL;
4824
4825    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4826    if (size == 1 && *(unsigned char*)s < 128) {
4827        Py_UNICODE r = *(unsigned char*)s;
4828        return PyUnicode_FromUnicode(&r, 1);
4829    }
4830
4831    v = _PyUnicode_New(size);
4832    if (v == NULL)
4833        goto onError;
4834    if (size == 0)
4835        return (PyObject *)v;
4836    p = PyUnicode_AS_UNICODE(v);
4837    e = s + size;
4838    while (s < e) {
4839        register unsigned char c = (unsigned char)*s;
4840        if (c < 128) {
4841            *p++ = c;
4842            ++s;
4843        }
4844        else {
4845            startinpos = s-starts;
4846            endinpos = startinpos + 1;
4847            outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4848            if (unicode_decode_call_errorhandler(
4849                    errors, &errorHandler,
4850                    "ascii", "ordinal not in range(128)",
4851                    &starts, &e, &startinpos, &endinpos, &exc, &s,
4852                    &v, &outpos, &p))
4853                goto onError;
4854        }
4855    }
4856    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
4857        if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4858            goto onError;
4859    Py_XDECREF(errorHandler);
4860    Py_XDECREF(exc);
4861    return (PyObject *)v;
4862
4863  onError:
4864    Py_XDECREF(v);
4865    Py_XDECREF(errorHandler);
4866    Py_XDECREF(exc);
4867    return NULL;
4868}
4869
4870PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
4871                                Py_ssize_t size,
4872                                const char *errors)
4873{
4874    return unicode_encode_ucs1(p, size, errors, 128);
4875}
4876
4877PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4878{
4879    if (!PyUnicode_Check(unicode)) {
4880        PyErr_BadArgument();
4881        return NULL;
4882    }
4883    return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
4884                                 PyUnicode_GET_SIZE(unicode),
4885                                 NULL);
4886}
4887
4888#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
4889
4890/* --- MBCS codecs for Windows -------------------------------------------- */
4891
4892#if SIZEOF_INT < SIZEOF_SIZE_T
4893#define NEED_RETRY
4894#endif
4895
4896/* XXX This code is limited to "true" double-byte encodings, as
4897   a) it assumes an incomplete character consists of a single byte, and
4898   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
4899   encodings, see IsDBCSLeadByteEx documentation. */
4900
/* Return 1 if the byte at s[offset] is to be treated as a DBCS lead
   byte, 0 otherwise.  IsDBCSLeadByte() alone is not enough, so the
   character in front of it (as located by CharPrev()) is examined too. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    /* CharPrev() did not move */
    if (prev == curr)
        return 1;
    /* the preceding character does not start with a lead byte */
    if (!IsDBCSLeadByte(*prev))
        return 1;
    /* the preceding character is two bytes long and ends right before
       curr */
    return curr - prev == 2;
}
4911
4912/*
4913 * Decode MBCS string into unicode object. If 'final' is set, converts
4914 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4915 */
4916static int decode_mbcs(PyUnicodeObject **v,
4917                       const char *s, /* MBCS string */
4918                       int size, /* sizeof MBCS string */
4919                       int final,
4920                       const char *errors)
4921{
4922    Py_UNICODE *p;
4923    Py_ssize_t n;
4924    DWORD usize;
4925    DWORD flags;
4926
4927    assert(size >= 0);
4928
4929    /* check and handle 'errors' arg */
4930    if (errors==NULL || strcmp(errors, "strict")==0)
4931        flags = MB_ERR_INVALID_CHARS;
4932    else if (strcmp(errors, "ignore")==0)
4933        flags = 0;
4934    else {
4935        PyErr_Format(PyExc_ValueError,
4936                     "mbcs encoding does not support errors='%s'",
4937                     errors);
4938        return -1;
4939    }
4940
4941    /* Skip trailing lead-byte unless 'final' is set */
4942    if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
4943        --size;
4944
4945    /* First get the size of the result */
4946    if (size > 0) {
4947        usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
4948        if (usize==0)
4949            goto mbcs_decode_error;
4950    } else
4951        usize = 0;
4952
4953    if (*v == NULL) {
4954        /* Create unicode object */
4955        *v = _PyUnicode_New(usize);
4956        if (*v == NULL)
4957            return -1;
4958        n = 0;
4959    }
4960    else {
4961        /* Extend unicode object */
4962        n = PyUnicode_GET_SIZE(*v);
4963        if (_PyUnicode_Resize(v, n + usize) < 0)
4964            return -1;
4965    }
4966
4967    /* Do the conversion */
4968    if (usize > 0) {
4969        p = PyUnicode_AS_UNICODE(*v) + n;
4970        if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
4971            goto mbcs_decode_error;
4972        }
4973    }
4974    return size;
4975
4976mbcs_decode_error:
4977    /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
4978       we raise a UnicodeDecodeError - else it is a 'generic'
4979       windows error
4980     */
4981    if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
4982        /* Ideally, we should get reason from FormatMessage - this
4983           is the Windows 2000 English version of the message
4984        */
4985        PyObject *exc = NULL;
4986        const char *reason = "No mapping for the Unicode character exists "
4987                             "in the target multi-byte code page.";
4988        make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
4989        if (exc != NULL) {
4990            PyCodec_StrictErrors(exc);
4991            Py_DECREF(exc);
4992        }
4993    } else {
4994        PyErr_SetFromWindowsErrWithFilename(0, NULL);
4995    }
4996    return -1;
4997}
4998
4999PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
5000                                       Py_ssize_t size,
5001                                       const char *errors,
5002                                       Py_ssize_t *consumed)
5003{
5004    PyUnicodeObject *v = NULL;
5005    int done;
5006
5007    if (consumed)
5008        *consumed = 0;
5009
5010#ifdef NEED_RETRY
5011  retry:
5012    if (size > INT_MAX)
5013        done = decode_mbcs(&v, s, INT_MAX, 0, errors);
5014    else
5015#endif
5016        done = decode_mbcs(&v, s, (int)size, !consumed, errors);
5017
5018    if (done < 0) {
5019        Py_XDECREF(v);
5020        return NULL;
5021    }
5022
5023    if (consumed)
5024        *consumed += done;
5025
5026#ifdef NEED_RETRY
5027    if (size > INT_MAX) {
5028        s += done;
5029        size -= done;
5030        goto retry;
5031    }
5032#endif
5033
5034    return (PyObject *)v;
5035}
5036
5037PyObject *PyUnicode_DecodeMBCS(const char *s,
5038                               Py_ssize_t size,
5039                               const char *errors)
5040{
5041    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
5042}
5043
5044/*
5045 * Convert unicode into string object (MBCS).
5046 * Returns 0 if succeed, -1 otherwise.
5047 */
5048static int encode_mbcs(PyObject **repr,
5049                       const Py_UNICODE *p, /* unicode */
5050                       int size, /* size of unicode */
5051                       const char* errors)
5052{
5053    BOOL usedDefaultChar = FALSE;
5054    BOOL *pusedDefaultChar;
5055    int mbcssize;
5056    Py_ssize_t n;
5057    PyObject *exc = NULL;
5058    DWORD flags;
5059
5060    assert(size >= 0);
5061
5062    /* check and handle 'errors' arg */
5063    if (errors==NULL || strcmp(errors, "strict")==0) {
5064        flags = WC_NO_BEST_FIT_CHARS;
5065        pusedDefaultChar = &usedDefaultChar;
5066    } else if (strcmp(errors, "replace")==0) {
5067        flags = 0;
5068        pusedDefaultChar = NULL;
5069    } else {
5070         PyErr_Format(PyExc_ValueError,
5071                      "mbcs encoding does not support errors='%s'",
5072                      errors);
5073         return -1;
5074    }
5075
5076    /* First get the size of the result */
5077    if (size > 0) {
5078        mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
5079                                       NULL, pusedDefaultChar);
5080        if (mbcssize == 0) {
5081            PyErr_SetFromWindowsErrWithFilename(0, NULL);
5082            return -1;
5083        }
5084        /* If we used a default char, then we failed! */
5085        if (pusedDefaultChar && *pusedDefaultChar)
5086            goto mbcs_encode_error;
5087    } else {
5088        mbcssize = 0;
5089    }
5090
5091    if (*repr == NULL) {
5092        /* Create string object */
5093        *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
5094        if (*repr == NULL)
5095            return -1;
5096        n = 0;
5097    }
5098    else {
5099        /* Extend string object */
5100        n = PyBytes_Size(*repr);
5101        if (_PyBytes_Resize(repr, n + mbcssize) < 0)
5102            return -1;
5103    }
5104
5105    /* Do the conversion */
5106    if (size > 0) {
5107        char *s = PyBytes_AS_STRING(*repr) + n;
5108        if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
5109                                     NULL, pusedDefaultChar)) {
5110            PyErr_SetFromWindowsErrWithFilename(0, NULL);
5111            return -1;
5112        }
5113        if (pusedDefaultChar && *pusedDefaultChar)
5114            goto mbcs_encode_error;
5115    }
5116    return 0;
5117
5118mbcs_encode_error:
5119    raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
5120    Py_XDECREF(exc);
5121    return -1;
5122}
5123
5124PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
5125                               Py_ssize_t size,
5126                               const char *errors)
5127{
5128    PyObject *repr = NULL;
5129    int ret;
5130
5131#ifdef NEED_RETRY
5132  retry:
5133    if (size > INT_MAX)
5134        ret = encode_mbcs(&repr, p, INT_MAX, errors);
5135    else
5136#endif
5137        ret = encode_mbcs(&repr, p, (int)size, errors);
5138
5139    if (ret < 0) {
5140        Py_XDECREF(repr);
5141        return NULL;
5142    }
5143
5144#ifdef NEED_RETRY
5145    if (size > INT_MAX) {
5146        p += INT_MAX;
5147        size -= INT_MAX;
5148        goto retry;
5149    }
5150#endif
5151
5152    return repr;
5153}
5154
5155PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
5156{
5157    if (!PyUnicode_Check(unicode)) {
5158        PyErr_BadArgument();
5159        return NULL;
5160    }
5161    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
5162                                PyUnicode_GET_SIZE(unicode),
5163                                NULL);
5164}
5165
5166#undef NEED_RETRY
5167
5168#endif /* MS_WINDOWS */
5169
5170/* --- Character Mapping Codec -------------------------------------------- */
5171
5172PyObject *PyUnicode_DecodeCharmap(const char *s,
5173                                  Py_ssize_t size,
5174                                  PyObject *mapping,
5175                                  const char *errors)
5176{
5177    const char *starts = s;
5178    Py_ssize_t startinpos;
5179    Py_ssize_t endinpos;
5180    Py_ssize_t outpos;
5181    const char *e;
5182    PyUnicodeObject *v;
5183    Py_UNICODE *p;
5184    Py_ssize_t extrachars = 0;
5185    PyObject *errorHandler = NULL;
5186    PyObject *exc = NULL;
5187    Py_UNICODE *mapstring = NULL;
5188    Py_ssize_t maplen = 0;
5189
5190    /* Default to Latin-1 */
5191    if (mapping == NULL)
5192        return PyUnicode_DecodeLatin1(s, size, errors);
5193
5194    v = _PyUnicode_New(size);
5195    if (v == NULL)
5196        goto onError;
5197    if (size == 0)
5198        return (PyObject *)v;
5199    p = PyUnicode_AS_UNICODE(v);
5200    e = s + size;
5201    if (PyUnicode_CheckExact(mapping)) {
5202        mapstring = PyUnicode_AS_UNICODE(mapping);
5203        maplen = PyUnicode_GET_SIZE(mapping);
5204        while (s < e) {
5205            unsigned char ch = *s;
5206            Py_UNICODE x = 0xfffe; /* illegal value */
5207
5208            if (ch < maplen)
5209                x = mapstring[ch];
5210
5211            if (x == 0xfffe) {
5212                /* undefined mapping */
5213                outpos = p-PyUnicode_AS_UNICODE(v);
5214                startinpos = s-starts;
5215                endinpos = startinpos+1;
5216                if (unicode_decode_call_errorhandler(
5217                        errors, &errorHandler,
5218                        "charmap", "character maps to <undefined>",
5219                        &starts, &e, &startinpos, &endinpos, &exc, &s,
5220                        &v, &outpos, &p)) {
5221                    goto onError;
5222                }
5223                continue;
5224            }
5225            *p++ = x;
5226            ++s;
5227        }
5228    }
5229    else {
5230        while (s < e) {
5231            unsigned char ch = *s;
5232            PyObject *w, *x;
5233
5234            /* Get mapping (char ordinal -> integer, Unicode char or None) */
5235            w = PyLong_FromLong((long)ch);
5236            if (w == NULL)
5237                goto onError;
5238            x = PyObject_GetItem(mapping, w);
5239            Py_DECREF(w);
5240            if (x == NULL) {
5241                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5242                    /* No mapping found means: mapping is undefined. */
5243                    PyErr_Clear();
5244                    x = Py_None;
5245                    Py_INCREF(x);
5246                } else
5247                    goto onError;
5248            }
5249
5250            /* Apply mapping */
5251            if (PyLong_Check(x)) {
5252                long value = PyLong_AS_LONG(x);
5253                if (value < 0 || value > 65535) {
5254                    PyErr_SetString(PyExc_TypeError,
5255                                    "character mapping must be in range(65536)");
5256                    Py_DECREF(x);
5257                    goto onError;
5258                }
5259                *p++ = (Py_UNICODE)value;
5260            }
5261            else if (x == Py_None) {
5262                /* undefined mapping */
5263                outpos = p-PyUnicode_AS_UNICODE(v);
5264                startinpos = s-starts;
5265                endinpos = startinpos+1;
5266                if (unicode_decode_call_errorhandler(
5267                        errors, &errorHandler,
5268                        "charmap", "character maps to <undefined>",
5269                        &starts, &e, &startinpos, &endinpos, &exc, &s,
5270                        &v, &outpos, &p)) {
5271                    Py_DECREF(x);
5272                    goto onError;
5273                }
5274                Py_DECREF(x);
5275                continue;
5276            }
5277            else if (PyUnicode_Check(x)) {
5278                Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
5279
5280                if (targetsize == 1)
5281                    /* 1-1 mapping */
5282                    *p++ = *PyUnicode_AS_UNICODE(x);
5283
5284                else if (targetsize > 1) {
5285                    /* 1-n mapping */
5286                    if (targetsize > extrachars) {
5287                        /* resize first */
5288                        Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
5289                        Py_ssize_t needed = (targetsize - extrachars) + \
5290                            (targetsize << 2);
5291                        extrachars += needed;
5292                        /* XXX overflow detection missing */
5293                        if (_PyUnicode_Resize(&v,
5294                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
5295                            Py_DECREF(x);
5296                            goto onError;
5297                        }
5298                        p = PyUnicode_AS_UNICODE(v) + oldpos;
5299                    }
5300                    Py_UNICODE_COPY(p,
5301                                    PyUnicode_AS_UNICODE(x),
5302                                    targetsize);
5303                    p += targetsize;
5304                    extrachars -= targetsize;
5305                }
5306                /* 1-0 mapping: skip the character */
5307            }
5308            else {
5309                /* wrong return value */
5310                PyErr_SetString(PyExc_TypeError,
5311                                "character mapping must return integer, None or str");
5312                Py_DECREF(x);
5313                goto onError;
5314            }
5315            Py_DECREF(x);
5316            ++s;
5317        }
5318    }
5319    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
5320        if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
5321            goto onError;
5322    Py_XDECREF(errorHandler);
5323    Py_XDECREF(exc);
5324    return (PyObject *)v;
5325
5326  onError:
5327    Py_XDECREF(errorHandler);
5328    Py_XDECREF(exc);
5329    Py_XDECREF(v);
5330    return NULL;
5331}
5332
5333/* Charmap encoding: the lookup table */
5334
5335struct encoding_map{
5336    PyObject_HEAD
5337    unsigned char level1[32];
5338    int count2, count3;
5339    unsigned char level23[1];
5340};
5341
5342static PyObject*
5343encoding_map_size(PyObject *obj, PyObject* args)
5344{
5345    struct encoding_map *map = (struct encoding_map*)obj;
5346    return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
5347                           128*map->count3);
5348}
5349
5350static PyMethodDef encoding_map_methods[] = {
5351    {"size", encoding_map_size, METH_NOARGS,
5352     PyDoc_STR("Return the size (in bytes) of this object") },
5353    { 0 }
5354};
5355
5356static void
5357encoding_map_dealloc(PyObject* o)
5358{
5359    PyObject_FREE(o);
5360}
5361
5362static PyTypeObject EncodingMapType = {
5363    PyVarObject_HEAD_INIT(NULL, 0)
5364    "EncodingMap",          /*tp_name*/
5365    sizeof(struct encoding_map),   /*tp_basicsize*/
5366    0,                      /*tp_itemsize*/
5367    /* methods */
5368    encoding_map_dealloc,   /*tp_dealloc*/
5369    0,                      /*tp_print*/
5370    0,                      /*tp_getattr*/
5371    0,                      /*tp_setattr*/
5372    0,                      /*tp_reserved*/
5373    0,                      /*tp_repr*/
5374    0,                      /*tp_as_number*/
5375    0,                      /*tp_as_sequence*/
5376    0,                      /*tp_as_mapping*/
5377    0,                      /*tp_hash*/
5378    0,                      /*tp_call*/
5379    0,                      /*tp_str*/
5380    0,                      /*tp_getattro*/
5381    0,                      /*tp_setattro*/
5382    0,                      /*tp_as_buffer*/
5383    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
5384    0,                      /*tp_doc*/
5385    0,                      /*tp_traverse*/
5386    0,                      /*tp_clear*/
5387    0,                      /*tp_richcompare*/
5388    0,                      /*tp_weaklistoffset*/
5389    0,                      /*tp_iter*/
5390    0,                      /*tp_iternext*/
5391    encoding_map_methods,   /*tp_methods*/
5392    0,                      /*tp_members*/
5393    0,                      /*tp_getset*/
5394    0,                      /*tp_base*/
5395    0,                      /*tp_dict*/
5396    0,                      /*tp_descr_get*/
5397    0,                      /*tp_descr_set*/
5398    0,                      /*tp_dictoffset*/
5399    0,                      /*tp_init*/
5400    0,                      /*tp_alloc*/
5401    0,                      /*tp_new*/
5402    0,                      /*tp_free*/
5403    0,                      /*tp_is_gc*/
5404};
5405
5406PyObject*
5407PyUnicode_BuildEncodingMap(PyObject* string)
5408{
5409    Py_UNICODE *decode;
5410    PyObject *result;
5411    struct encoding_map *mresult;
5412    int i;
5413    int need_dict = 0;
5414    unsigned char level1[32];
5415    unsigned char level2[512];
5416    unsigned char *mlevel1, *mlevel2, *mlevel3;
5417    int count2 = 0, count3 = 0;
5418
5419    if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5420        PyErr_BadArgument();
5421        return NULL;
5422    }
5423    decode = PyUnicode_AS_UNICODE(string);
5424    memset(level1, 0xFF, sizeof level1);
5425    memset(level2, 0xFF, sizeof level2);
5426
5427    /* If there isn't a one-to-one mapping of NULL to \0,
5428       or if there are non-BMP characters, we need to use
5429       a mapping dictionary. */
5430    if (decode[0] != 0)
5431        need_dict = 1;
5432    for (i = 1; i < 256; i++) {
5433        int l1, l2;
5434        if (decode[i] == 0
5435#ifdef Py_UNICODE_WIDE
5436            || decode[i] > 0xFFFF
5437#endif
5438            ) {
5439            need_dict = 1;
5440            break;
5441        }
5442        if (decode[i] == 0xFFFE)
5443            /* unmapped character */
5444            continue;
5445        l1 = decode[i] >> 11;
5446        l2 = decode[i] >> 7;
5447        if (level1[l1] == 0xFF)
5448            level1[l1] = count2++;
5449        if (level2[l2] == 0xFF)
5450            level2[l2] = count3++;
5451    }
5452
5453    if (count2 >= 0xFF || count3 >= 0xFF)
5454        need_dict = 1;
5455
5456    if (need_dict) {
5457        PyObject *result = PyDict_New();
5458        PyObject *key, *value;
5459        if (!result)
5460            return NULL;
5461        for (i = 0; i < 256; i++) {
5462            key = value = NULL;
5463            key = PyLong_FromLong(decode[i]);
5464            value = PyLong_FromLong(i);
5465            if (!key || !value)
5466                goto failed1;
5467            if (PyDict_SetItem(result, key, value) == -1)
5468                goto failed1;
5469            Py_DECREF(key);
5470            Py_DECREF(value);
5471        }
5472        return result;
5473      failed1:
5474        Py_XDECREF(key);
5475        Py_XDECREF(value);
5476        Py_DECREF(result);
5477        return NULL;
5478    }
5479
5480    /* Create a three-level trie */
5481    result = PyObject_MALLOC(sizeof(struct encoding_map) +
5482                             16*count2 + 128*count3 - 1);
5483    if (!result)
5484        return PyErr_NoMemory();
5485    PyObject_Init(result, &EncodingMapType);
5486    mresult = (struct encoding_map*)result;
5487    mresult->count2 = count2;
5488    mresult->count3 = count3;
5489    mlevel1 = mresult->level1;
5490    mlevel2 = mresult->level23;
5491    mlevel3 = mresult->level23 + 16*count2;
5492    memcpy(mlevel1, level1, 32);
5493    memset(mlevel2, 0xFF, 16*count2);
5494    memset(mlevel3, 0, 128*count3);
5495    count3 = 0;
5496    for (i = 1; i < 256; i++) {
5497        int o1, o2, o3, i2, i3;
5498        if (decode[i] == 0xFFFE)
5499            /* unmapped character */
5500            continue;
5501        o1 = decode[i]>>11;
5502        o2 = (decode[i]>>7) & 0xF;
5503        i2 = 16*mlevel1[o1] + o2;
5504        if (mlevel2[i2] == 0xFF)
5505            mlevel2[i2] = count3++;
5506        o3 = decode[i] & 0x7F;
5507        i3 = 128*mlevel2[i2] + o3;
5508        mlevel3[i3] = i;
5509    }
5510    return result;
5511}
5512
5513static int
5514encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5515{
5516    struct encoding_map *map = (struct encoding_map*)mapping;
5517    int l1 = c>>11;
5518    int l2 = (c>>7) & 0xF;
5519    int l3 = c & 0x7F;
5520    int i;
5521
5522#ifdef Py_UNICODE_WIDE
5523    if (c > 0xFFFF) {
5524        return -1;
5525    }
5526#endif
5527    if (c == 0)
5528        return 0;
5529    /* level 1*/
5530    i = map->level1[l1];
5531    if (i == 0xFF) {
5532        return -1;
5533    }
5534    /* level 2*/
5535    i = map->level23[16*i+l2];
5536    if (i == 0xFF) {
5537        return -1;
5538    }
5539    /* level 3 */
5540    i = map->level23[16*map->count2 + 128*i + l3];
5541    if (i == 0) {
5542        return -1;
5543    }
5544    return i;
5545}
5546
5547/* Lookup the character ch in the mapping. If the character
5548   can't be found, Py_None is returned (or NULL, if another
5549   error occurred). */
5550static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
5551{
5552    PyObject *w = PyLong_FromLong((long)c);
5553    PyObject *x;
5554
5555    if (w == NULL)
5556        return NULL;
5557    x = PyObject_GetItem(mapping, w);
5558    Py_DECREF(w);
5559    if (x == NULL) {
5560        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5561            /* No mapping found means: mapping is undefined. */
5562            PyErr_Clear();
5563            x = Py_None;
5564            Py_INCREF(x);
5565            return x;
5566        } else
5567            return NULL;
5568    }
5569    else if (x == Py_None)
5570        return x;
5571    else if (PyLong_Check(x)) {
5572        long value = PyLong_AS_LONG(x);
5573        if (value < 0 || value > 255) {
5574            PyErr_SetString(PyExc_TypeError,
5575                            "character mapping must be in range(256)");
5576            Py_DECREF(x);
5577            return NULL;
5578        }
5579        return x;
5580    }
5581    else if (PyBytes_Check(x))
5582        return x;
5583    else {
5584        /* wrong return value */
5585        PyErr_Format(PyExc_TypeError,
5586                     "character mapping must return integer, bytes or None, not %.400s",
5587                     x->ob_type->tp_name);
5588        Py_DECREF(x);
5589        return NULL;
5590    }
5591}
5592
5593static int
5594charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
5595{
5596    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5597    /* exponentially overallocate to minimize reallocations */
5598    if (requiredsize < 2*outsize)
5599        requiredsize = 2*outsize;
5600    if (_PyBytes_Resize(outobj, requiredsize))
5601        return -1;
5602    return 0;
5603}
5604
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* the lookup or an output resize failed */
} charmapencode_result;
5608/* lookup the character, put the result in the output string and adjust
5609   various state variables. Resize the output bytes object if not enough
5610   space is available. Return a new reference to the object that
5611   was put in the output buffer, or Py_None, if the mapping was undefined
5612   (in which case no character was written) or NULL, if a
5613   reallocation error occurred. The caller must decref the result */
5614static
5615charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
5616                                          PyObject **outobj, Py_ssize_t *outpos)
5617{
5618    PyObject *rep;
5619    char *outstart;
5620    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5621
5622    if (Py_TYPE(mapping) == &EncodingMapType) {
5623        int res = encoding_map_lookup(c, mapping);
5624        Py_ssize_t requiredsize = *outpos+1;
5625        if (res == -1)
5626            return enc_FAILED;
5627        if (outsize<requiredsize)
5628            if (charmapencode_resize(outobj, outpos, requiredsize))
5629                return enc_EXCEPTION;
5630        outstart = PyBytes_AS_STRING(*outobj);
5631        outstart[(*outpos)++] = (char)res;
5632        return enc_SUCCESS;
5633    }
5634
5635    rep = charmapencode_lookup(c, mapping);
5636    if (rep==NULL)
5637        return enc_EXCEPTION;
5638    else if (rep==Py_None) {
5639        Py_DECREF(rep);
5640        return enc_FAILED;
5641    } else {
5642        if (PyLong_Check(rep)) {
5643            Py_ssize_t requiredsize = *outpos+1;
5644            if (outsize<requiredsize)
5645                if (charmapencode_resize(outobj, outpos, requiredsize)) {
5646                    Py_DECREF(rep);
5647                    return enc_EXCEPTION;
5648                }
5649            outstart = PyBytes_AS_STRING(*outobj);
5650            outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
5651        }
5652        else {
5653            const char *repchars = PyBytes_AS_STRING(rep);
5654            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5655            Py_ssize_t requiredsize = *outpos+repsize;
5656            if (outsize<requiredsize)
5657                if (charmapencode_resize(outobj, outpos, requiredsize)) {
5658                    Py_DECREF(rep);
5659                    return enc_EXCEPTION;
5660                }
5661            outstart = PyBytes_AS_STRING(*outobj);
5662            memcpy(outstart + *outpos, repchars, repsize);
5663            *outpos += repsize;
5664        }
5665    }
5666    Py_DECREF(rep);
5667    return enc_SUCCESS;
5668}
5669
5670/* handle an error in PyUnicode_EncodeCharmap
5671   Return 0 on success, -1 on error */
5672static
5673int charmap_encoding_error(
5674    const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
5675    PyObject **exceptionObject,
5676    int *known_errorHandler, PyObject **errorHandler, const char *errors,
5677    PyObject **res, Py_ssize_t *respos)
5678{
5679    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5680    Py_ssize_t repsize;
5681    Py_ssize_t newpos;
5682    Py_UNICODE *uni2;
5683    /* startpos for collecting unencodable chars */
5684    Py_ssize_t collstartpos = *inpos;
5685    Py_ssize_t collendpos = *inpos+1;
5686    Py_ssize_t collpos;
5687    char *encoding = "charmap";
5688    char *reason = "character maps to <undefined>";
5689    charmapencode_result x;
5690
5691    /* find all unencodable characters */
5692    while (collendpos < size) {
5693        PyObject *rep;
5694        if (Py_TYPE(mapping) == &EncodingMapType) {
5695            int res = encoding_map_lookup(p[collendpos], mapping);
5696            if (res != -1)
5697                break;
5698            ++collendpos;
5699            continue;
5700        }
5701
5702        rep = charmapencode_lookup(p[collendpos], mapping);
5703        if (rep==NULL)
5704            return -1;
5705        else if (rep!=Py_None) {
5706            Py_DECREF(rep);
5707            break;
5708        }
5709        Py_DECREF(rep);
5710        ++collendpos;
5711    }
5712    /* cache callback name lookup
5713     * (if not done yet, i.e. it's the first error) */
5714    if (*known_errorHandler==-1) {
5715        if ((errors==NULL) || (!strcmp(errors, "strict")))
5716            *known_errorHandler = 1;
5717        else if (!strcmp(errors, "replace"))
5718            *known_errorHandler = 2;
5719        else if (!strcmp(errors, "ignore"))
5720            *known_errorHandler = 3;
5721        else if (!strcmp(errors, "xmlcharrefreplace"))
5722            *known_errorHandler = 4;
5723        else
5724            *known_errorHandler = 0;
5725    }
5726    switch (*known_errorHandler) {
5727    case 1: /* strict */
5728        raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5729        return -1;
5730    case 2: /* replace */
5731        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
5732            x = charmapencode_output('?', mapping, res, respos);
5733            if (x==enc_EXCEPTION) {
5734                return -1;
5735            }
5736            else if (x==enc_FAILED) {
5737                raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5738                return -1;
5739            }
5740        }
5741        /* fall through */
5742    case 3: /* ignore */
5743        *inpos = collendpos;
5744        break;
5745    case 4: /* xmlcharrefreplace */
5746        /* generate replacement (temporarily (mis)uses p) */
5747        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
5748            char buffer[2+29+1+1];
5749            char *cp;
5750            sprintf(buffer, "&#%d;", (int)p[collpos]);
5751            for (cp = buffer; *cp; ++cp) {
5752                x = charmapencode_output(*cp, mapping, res, respos);
5753                if (x==enc_EXCEPTION)
5754                    return -1;
5755                else if (x==enc_FAILED) {
5756                    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5757                    return -1;
5758                }
5759            }
5760        }
5761        *inpos = collendpos;
5762        break;
5763    default:
5764        repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
5765                                                      encoding, reason, p, size, exceptionObject,
5766                                                      collstartpos, collendpos, &newpos);
5767        if (repunicode == NULL)
5768            return -1;
5769        if (PyBytes_Check(repunicode)) {
5770            /* Directly copy bytes result to output. */
5771            Py_ssize_t outsize = PyBytes_Size(*res);
5772            Py_ssize_t requiredsize;
5773            repsize = PyBytes_Size(repunicode);
5774            requiredsize = *respos + repsize;
5775            if (requiredsize > outsize)
5776                /* Make room for all additional bytes. */
5777                if (charmapencode_resize(res, respos, requiredsize)) {
5778                    Py_DECREF(repunicode);
5779                    return -1;
5780                }
5781            memcpy(PyBytes_AsString(*res) + *respos,
5782                   PyBytes_AsString(repunicode),  repsize);
5783            *respos += repsize;
5784            *inpos = newpos;
5785            Py_DECREF(repunicode);
5786            break;
5787        }
5788        /* generate replacement  */
5789        repsize = PyUnicode_GET_SIZE(repunicode);
5790        for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5791            x = charmapencode_output(*uni2, mapping, res, respos);
5792            if (x==enc_EXCEPTION) {
5793                return -1;
5794            }
5795            else if (x==enc_FAILED) {
5796                Py_DECREF(repunicode);
5797                raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5798                return -1;
5799            }
5800        }
5801        *inpos = newpos;
5802        Py_DECREF(repunicode);
5803    }
5804    return 0;
5805}
5806
5807PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
5808                                  Py_ssize_t size,
5809                                  PyObject *mapping,
5810                                  const char *errors)
5811{
5812    /* output object */
5813    PyObject *res = NULL;
5814    /* current input position */
5815    Py_ssize_t inpos = 0;
5816    /* current output position */
5817    Py_ssize_t respos = 0;
5818    PyObject *errorHandler = NULL;
5819    PyObject *exc = NULL;
5820    /* the following variable is used for caching string comparisons
5821     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5822     * 3=ignore, 4=xmlcharrefreplace */
5823    int known_errorHandler = -1;
5824
5825    /* Default to Latin-1 */
5826    if (mapping == NULL)
5827        return PyUnicode_EncodeLatin1(p, size, errors);
5828
5829    /* allocate enough for a simple encoding without
5830       replacements, if we need more, we'll resize */
5831    res = PyBytes_FromStringAndSize(NULL, size);
5832    if (res == NULL)
5833        goto onError;
5834    if (size == 0)
5835        return res;
5836
5837    while (inpos<size) {
5838        /* try to encode it */
5839        charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5840        if (x==enc_EXCEPTION) /* error */
5841            goto onError;
5842        if (x==enc_FAILED) { /* unencodable character */
5843            if (charmap_encoding_error(p, size, &inpos, mapping,
5844                                       &exc,
5845                                       &known_errorHandler, &errorHandler, errors,
5846                                       &res, &respos)) {
5847                goto onError;
5848            }
5849        }
5850        else
5851            /* done with this character => adjust input position */
5852            ++inpos;
5853    }
5854
5855    /* Resize if we allocated to much */
5856    if (respos<PyBytes_GET_SIZE(res))
5857        if (_PyBytes_Resize(&res, respos) < 0)
5858            goto onError;
5859
5860    Py_XDECREF(exc);
5861    Py_XDECREF(errorHandler);
5862    return res;
5863
5864  onError:
5865    Py_XDECREF(res);
5866    Py_XDECREF(exc);
5867    Py_XDECREF(errorHandler);
5868    return NULL;
5869}
5870
5871PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
5872                                    PyObject *mapping)
5873{
5874    if (!PyUnicode_Check(unicode) || mapping == NULL) {
5875        PyErr_BadArgument();
5876        return NULL;
5877    }
5878    return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
5879                                   PyUnicode_GET_SIZE(unicode),
5880                                   mapping,
5881                                   NULL);
5882}
5883
5884/* create or adjust a UnicodeTranslateError */
5885static void make_translate_exception(PyObject **exceptionObject,
5886                                     const Py_UNICODE *unicode, Py_ssize_t size,
5887                                     Py_ssize_t startpos, Py_ssize_t endpos,
5888                                     const char *reason)
5889{
5890    if (*exceptionObject == NULL) {
5891        *exceptionObject = PyUnicodeTranslateError_Create(
5892            unicode, size, startpos, endpos, reason);
5893    }
5894    else {
5895        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5896            goto onError;
5897        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5898            goto onError;
5899        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5900            goto onError;
5901        return;
5902      onError:
5903        Py_DECREF(*exceptionObject);
5904        *exceptionObject = NULL;
5905    }
5906}
5907
5908/* raises a UnicodeTranslateError */
5909static void raise_translate_exception(PyObject **exceptionObject,
5910                                      const Py_UNICODE *unicode, Py_ssize_t size,
5911                                      Py_ssize_t startpos, Py_ssize_t endpos,
5912                                      const char *reason)
5913{
5914    make_translate_exception(exceptionObject,
5915                             unicode, size, startpos, endpos, reason);
5916    if (*exceptionObject != NULL)
5917        PyCodec_StrictErrors(*exceptionObject);
5918}
5919
5920/* error handling callback helper:
5921   build arguments, call the callback and check the arguments,
5922   put the result into newpos and return the replacement string, which
5923   has to be freed by the caller */
5924static PyObject *unicode_translate_call_errorhandler(const char *errors,
5925                                                     PyObject **errorHandler,
5926                                                     const char *reason,
5927                                                     const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5928                                                     Py_ssize_t startpos, Py_ssize_t endpos,
5929                                                     Py_ssize_t *newpos)
5930{
5931    static char *argparse = "O!n;translating error handler must return (str, int) tuple";
5932
5933    Py_ssize_t i_newpos;
5934    PyObject *restuple;
5935    PyObject *resunicode;
5936
5937    if (*errorHandler == NULL) {
5938        *errorHandler = PyCodec_LookupError(errors);
5939        if (*errorHandler == NULL)
5940            return NULL;
5941    }
5942
5943    make_translate_exception(exceptionObject,
5944                             unicode, size, startpos, endpos, reason);
5945    if (*exceptionObject == NULL)
5946        return NULL;
5947
5948    restuple = PyObject_CallFunctionObjArgs(
5949        *errorHandler, *exceptionObject, NULL);
5950    if (restuple == NULL)
5951        return NULL;
5952    if (!PyTuple_Check(restuple)) {
5953        PyErr_SetString(PyExc_TypeError, &argparse[4]);
5954        Py_DECREF(restuple);
5955        return NULL;
5956    }
5957    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
5958                          &resunicode, &i_newpos)) {
5959        Py_DECREF(restuple);
5960        return NULL;
5961    }
5962    if (i_newpos<0)
5963        *newpos = size+i_newpos;
5964    else
5965        *newpos = i_newpos;
5966    if (*newpos<0 || *newpos>size) {
5967        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5968        Py_DECREF(restuple);
5969        return NULL;
5970    }
5971    Py_INCREF(resunicode);
5972    Py_DECREF(restuple);
5973    return resunicode;
5974}
5975
5976/* Lookup the character ch in the mapping and put the result in result,
5977   which must be decrefed by the caller.
5978   Return 0 on success, -1 on error */
5979static
5980int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5981{
5982    PyObject *w = PyLong_FromLong((long)c);
5983    PyObject *x;
5984
5985    if (w == NULL)
5986        return -1;
5987    x = PyObject_GetItem(mapping, w);
5988    Py_DECREF(w);
5989    if (x == NULL) {
5990        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5991            /* No mapping found means: use 1:1 mapping. */
5992            PyErr_Clear();
5993            *result = NULL;
5994            return 0;
5995        } else
5996            return -1;
5997    }
5998    else if (x == Py_None) {
5999        *result = x;
6000        return 0;
6001    }
6002    else if (PyLong_Check(x)) {
6003        long value = PyLong_AS_LONG(x);
6004        long max = PyUnicode_GetMax();
6005        if (value < 0 || value > max) {
6006            PyErr_Format(PyExc_TypeError,
6007                         "character mapping must be in range(0x%x)", max+1);
6008            Py_DECREF(x);
6009            return -1;
6010        }
6011        *result = x;
6012        return 0;
6013    }
6014    else if (PyUnicode_Check(x)) {
6015        *result = x;
6016        return 0;
6017    }
6018    else {
6019        /* wrong return value */
6020        PyErr_SetString(PyExc_TypeError,
6021                        "character mapping must return integer, None or str");
6022        Py_DECREF(x);
6023        return -1;
6024    }
6025}
6026/* ensure that *outobj is at least requiredsize characters long,
6027   if not reallocate and adjust various state variables.
6028   Return 0 on success, -1 on error */
6029static
6030int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
6031                               Py_ssize_t requiredsize)
6032{
6033    Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
6034    if (requiredsize > oldsize) {
6035        /* remember old output position */
6036        Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
6037        /* exponentially overallocate to minimize reallocations */
6038        if (requiredsize < 2 * oldsize)
6039            requiredsize = 2 * oldsize;
6040        if (PyUnicode_Resize(outobj, requiredsize) < 0)
6041            return -1;
6042        *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
6043    }
6044    return 0;
6045}
6046/* lookup the character, put the result in the output string and adjust
6047   various state variables. Return a new reference to the object that
6048   was put in the output buffer in *result, or Py_None, if the mapping was
6049   undefined (in which case no character was written).
6050   The called must decref result.
6051   Return 0 on success, -1 on error. */
6052static
6053int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
6054                            Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
6055                            PyObject **res)
6056{
6057    if (charmaptranslate_lookup(*curinp, mapping, res))
6058        return -1;
6059    if (*res==NULL) {
6060        /* not found => default to 1:1 mapping */
6061        *(*outp)++ = *curinp;
6062    }
6063    else if (*res==Py_None)
6064        ;
6065    else if (PyLong_Check(*res)) {
6066        /* no overflow check, because we know that the space is enough */
6067        *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
6068    }
6069    else if (PyUnicode_Check(*res)) {
6070        Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
6071        if (repsize==1) {
6072            /* no overflow check, because we know that the space is enough */
6073            *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
6074        }
6075        else if (repsize!=0) {
6076            /* more than one character */
6077            Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
6078                (insize - (curinp-startinp)) +
6079                repsize - 1;
6080            if (charmaptranslate_makespace(outobj, outp, requiredsize))
6081                return -1;
6082            memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
6083            *outp += repsize;
6084        }
6085    }
6086    else
6087        return -1;
6088    return 0;
6089}
6090
6091PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
6092                                     Py_ssize_t size,
6093                                     PyObject *mapping,
6094                                     const char *errors)
6095{
6096    /* output object */
6097    PyObject *res = NULL;
6098    /* pointers to the beginning and end+1 of input */
6099    const Py_UNICODE *startp = p;
6100    const Py_UNICODE *endp = p + size;
6101    /* pointer into the output */
6102    Py_UNICODE *str;
6103    /* current output position */
6104    Py_ssize_t respos = 0;
6105    char *reason = "character maps to <undefined>";
6106    PyObject *errorHandler = NULL;
6107    PyObject *exc = NULL;
6108    /* the following variable is used for caching string comparisons
6109     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
6110     * 3=ignore, 4=xmlcharrefreplace */
6111    int known_errorHandler = -1;
6112
6113    if (mapping == NULL) {
6114        PyErr_BadArgument();
6115        return NULL;
6116    }
6117
6118    /* allocate enough for a simple 1:1 translation without
6119       replacements, if we need more, we'll resize */
6120    res = PyUnicode_FromUnicode(NULL, size);
6121    if (res == NULL)
6122        goto onError;
6123    if (size == 0)
6124        return res;
6125    str = PyUnicode_AS_UNICODE(res);
6126
6127    while (p<endp) {
6128        /* try to encode it */
6129        PyObject *x = NULL;
6130        if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
6131            Py_XDECREF(x);
6132            goto onError;
6133        }
6134        Py_XDECREF(x);
6135        if (x!=Py_None) /* it worked => adjust input pointer */
6136            ++p;
6137        else { /* untranslatable character */
6138            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
6139            Py_ssize_t repsize;
6140            Py_ssize_t newpos;
6141            Py_UNICODE *uni2;
6142            /* startpos for collecting untranslatable chars */
6143            const Py_UNICODE *collstart = p;
6144            const Py_UNICODE *collend = p+1;
6145            const Py_UNICODE *coll;
6146
6147            /* find all untranslatable characters */
6148            while (collend < endp) {
6149                if (charmaptranslate_lookup(*collend, mapping, &x))
6150                    goto onError;
6151                Py_XDECREF(x);
6152                if (x!=Py_None)
6153                    break;
6154                ++collend;
6155            }
6156            /* cache callback name lookup
6157             * (if not done yet, i.e. it's the first error) */
6158            if (known_errorHandler==-1) {
6159                if ((errors==NULL) || (!strcmp(errors, "strict")))
6160                    known_errorHandler = 1;
6161                else if (!strcmp(errors, "replace"))
6162                    known_errorHandler = 2;
6163                else if (!strcmp(errors, "ignore"))
6164                    known_errorHandler = 3;
6165                else if (!strcmp(errors, "xmlcharrefreplace"))
6166                    known_errorHandler = 4;
6167                else
6168                    known_errorHandler = 0;
6169            }
6170            switch (known_errorHandler) {
6171            case 1: /* strict */
6172                raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
6173                goto onError;
6174            case 2: /* replace */
6175                /* No need to check for space, this is a 1:1 replacement */
6176                for (coll = collstart; coll<collend; ++coll)
6177                    *str++ = '?';
6178                /* fall through */
6179            case 3: /* ignore */
6180                p = collend;
6181                break;
6182            case 4: /* xmlcharrefreplace */
6183                /* generate replacement (temporarily (mis)uses p) */
6184                for (p = collstart; p < collend; ++p) {
6185                    char buffer[2+29+1+1];
6186                    char *cp;
6187                    sprintf(buffer, "&#%d;", (int)*p);
6188                    if (charmaptranslate_makespace(&res, &str,
6189                                                   (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
6190                        goto onError;
6191                    for (cp = buffer; *cp; ++cp)
6192                        *str++ = *cp;
6193                }
6194                p = collend;
6195                break;
6196            default:
6197                repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
6198                                                                 reason, startp, size, &exc,
6199                                                                 collstart-startp, collend-startp, &newpos);
6200                if (repunicode == NULL)
6201                    goto onError;
6202                /* generate replacement  */
6203                repsize = PyUnicode_GET_SIZE(repunicode);
6204                if (charmaptranslate_makespace(&res, &str,
6205                                               (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
6206                    Py_DECREF(repunicode);
6207                    goto onError;
6208                }
6209                for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
6210                    *str++ = *uni2;
6211                p = startp + newpos;
6212                Py_DECREF(repunicode);
6213            }
6214        }
6215    }
6216    /* Resize if we allocated to much */
6217    respos = str-PyUnicode_AS_UNICODE(res);
6218    if (respos<PyUnicode_GET_SIZE(res)) {
6219        if (PyUnicode_Resize(&res, respos) < 0)
6220            goto onError;
6221    }
6222    Py_XDECREF(exc);
6223    Py_XDECREF(errorHandler);
6224    return res;
6225
6226  onError:
6227    Py_XDECREF(res);
6228    Py_XDECREF(exc);
6229    Py_XDECREF(errorHandler);
6230    return NULL;
6231}
6232
6233PyObject *PyUnicode_Translate(PyObject *str,
6234                              PyObject *mapping,
6235                              const char *errors)
6236{
6237    PyObject *result;
6238
6239    str = PyUnicode_FromObject(str);
6240    if (str == NULL)
6241        goto onError;
6242    result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
6243                                        PyUnicode_GET_SIZE(str),
6244                                        mapping,
6245                                        errors);
6246    Py_DECREF(str);
6247    return result;
6248
6249  onError:
6250    Py_XDECREF(str);
6251    return NULL;
6252}
6253
6254PyObject *
6255PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
6256                                  Py_ssize_t length)
6257{
6258    PyObject *result;
6259    Py_UNICODE *p; /* write pointer into result */
6260    Py_ssize_t i;
6261    /* Copy to a new string */
6262    result = (PyObject *)_PyUnicode_New(length);
6263    Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
6264    if (result == NULL)
6265        return result;
6266    p = PyUnicode_AS_UNICODE(result);
6267    /* Iterate over code points */
6268    for (i = 0; i < length; i++) {
6269        Py_UNICODE ch =s[i];
6270        if (ch > 127) {
6271            int decimal = Py_UNICODE_TODECIMAL(ch);
6272            if (decimal >= 0)
6273                p[i] = '0' + decimal;
6274        }
6275    }
6276    return result;
6277}
6278/* --- Decimal Encoder ---------------------------------------------------- */
6279
6280int PyUnicode_EncodeDecimal(Py_UNICODE *s,
6281                            Py_ssize_t length,
6282                            char *output,
6283                            const char *errors)
6284{
6285    Py_UNICODE *p, *end;
6286    PyObject *errorHandler = NULL;
6287    PyObject *exc = NULL;
6288    const char *encoding = "decimal";
6289    const char *reason = "invalid decimal Unicode string";
6290    /* the following variable is used for caching string comparisons
6291     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6292    int known_errorHandler = -1;
6293
6294    if (output == NULL) {
6295        PyErr_BadArgument();
6296        return -1;
6297    }
6298
6299    p = s;
6300    end = s + length;
6301    while (p < end) {
6302        register Py_UNICODE ch = *p;
6303        int decimal;
6304        PyObject *repunicode;
6305        Py_ssize_t repsize;
6306        Py_ssize_t newpos;
6307        Py_UNICODE *uni2;
6308        Py_UNICODE *collstart;
6309        Py_UNICODE *collend;
6310
6311        if (Py_UNICODE_ISSPACE(ch)) {
6312            *output++ = ' ';
6313            ++p;
6314            continue;
6315        }
6316        decimal = Py_UNICODE_TODECIMAL(ch);
6317        if (decimal >= 0) {
6318            *output++ = '0' + decimal;
6319            ++p;
6320            continue;
6321        }
6322        if (0 < ch && ch < 256) {
6323            *output++ = (char)ch;
6324            ++p;
6325            continue;
6326        }
6327        /* All other characters are considered unencodable */
6328        collstart = p;
6329        for (collend = p+1; collend < end; collend++) {
6330            if ((0 < *collend && *collend < 256) ||
6331                Py_UNICODE_ISSPACE(*collend) ||
6332                0 <= Py_UNICODE_TODECIMAL(*collend))
6333                break;
6334        }
6335        /* cache callback name lookup
6336         * (if not done yet, i.e. it's the first error) */
6337        if (known_errorHandler==-1) {
6338            if ((errors==NULL) || (!strcmp(errors, "strict")))
6339                known_errorHandler = 1;
6340            else if (!strcmp(errors, "replace"))
6341                known_errorHandler = 2;
6342            else if (!strcmp(errors, "ignore"))
6343                known_errorHandler = 3;
6344            else if (!strcmp(errors, "xmlcharrefreplace"))
6345                known_errorHandler = 4;
6346            else
6347                known_errorHandler = 0;
6348        }
6349        switch (known_errorHandler) {
6350        case 1: /* strict */
6351            raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
6352            goto onError;
6353        case 2: /* replace */
6354            for (p = collstart; p < collend; ++p)
6355                *output++ = '?';
6356            /* fall through */
6357        case 3: /* ignore */
6358            p = collend;
6359            break;
6360        case 4: /* xmlcharrefreplace */
6361            /* generate replacement (temporarily (mis)uses p) */
6362            for (p = collstart; p < collend; ++p)
6363                output += sprintf(output, "&#%d;", (int)*p);
6364            p = collend;
6365            break;
6366        default:
6367            repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6368                                                          encoding, reason, s, length, &exc,
6369                                                          collstart-s, collend-s, &newpos);
6370            if (repunicode == NULL)
6371                goto onError;
6372            if (!PyUnicode_Check(repunicode)) {
6373                /* Byte results not supported, since they have no decimal property. */
6374                PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
6375                Py_DECREF(repunicode);
6376                goto onError;
6377            }
6378            /* generate replacement  */
6379            repsize = PyUnicode_GET_SIZE(repunicode);
6380            for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6381                Py_UNICODE ch = *uni2;
6382                if (Py_UNICODE_ISSPACE(ch))
6383                    *output++ = ' ';
6384                else {
6385                    decimal = Py_UNICODE_TODECIMAL(ch);
6386                    if (decimal >= 0)
6387                        *output++ = '0' + decimal;
6388                    else if (0 < ch && ch < 256)
6389                        *output++ = (char)ch;
6390                    else {
6391                        Py_DECREF(repunicode);
6392                        raise_encode_exception(&exc, encoding,
6393                                               s, length, collstart-s, collend-s, reason);
6394                        goto onError;
6395                    }
6396                }
6397            }
6398            p = s + newpos;
6399            Py_DECREF(repunicode);
6400        }
6401    }
6402    /* 0-terminate the output string */
6403    *output++ = '\0';
6404    Py_XDECREF(exc);
6405    Py_XDECREF(errorHandler);
6406    return 0;
6407
6408  onError:
6409    Py_XDECREF(exc);
6410    Py_XDECREF(errorHandler);
6411    return -1;
6412}
6413
6414/* --- Helpers ------------------------------------------------------------ */
6415
6416#include "stringlib/unicodedefs.h"
6417#include "stringlib/fastsearch.h"
6418
6419#include "stringlib/count.h"
6420#include "stringlib/find.h"
6421#include "stringlib/partition.h"
6422#include "stringlib/split.h"
6423
6424#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
6425#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
6426#include "stringlib/localeutil.h"
6427
/* Helper macro to fix up start/end slice values in place.
 *
 * `end` is clamped to at most `len`; a negative `end` or `start` is
 * first offset by `len` and, if still negative, set to 0.  `start` is
 * not compared against `len` or `end`, so start > end is still possible
 * after the adjustment.
 *
 * `start` and `end` must be modifiable lvalues, and every argument is
 * evaluated more than once, so pass side-effect free expressions.  The
 * expansion is a plain sequence of if statements (it is not wrapped in
 * do { } while (0)); use it only as a statement of its own, never as the
 * unbraced body of another if/else.
 */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len)                              \
        end = len;                              \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0)                            \
            end = 0;                            \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0)                          \
            start = 0;                          \
    }
6442
/* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed
 * to by 'ptr', possibly combining surrogate pairs on narrow builds.
 * 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character
 * that should be returned and 'end' pointing one past the last character of
 * the buffer.  The caller must guarantee ptr < end.
 * ('end' is used on narrow builds to detect a lone surrogate at the
 * end of the buffer that should be returned unchanged: a high surrogate is
 * only combined when the following code unit also lies before 'end', so
 * nothing at or beyond 'end' is ever read.)
 * The ptr and end arguments should be side-effect free and ptr must be an
 * lvalue.
 * The type of the returned char is always Py_UCS4.
 *
 * Note: the macro advances ptr to the next char, so it might have
 *       side-effects (especially if used with other macros).
 */

/* helper macros used by _Py_UNICODE_NEXT; 'ch' is evaluated twice */
#define _Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define _Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value. */
#define _Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)

#ifdef Py_UNICODE_WIDE
#define _Py_UNICODE_NEXT(ptr, end) ((Py_UCS4)*(ptr)++)
#else
/* The bounds test comes first so that (ptr)[1] is only inspected when it
 * is inside the buffer. */
#define _Py_UNICODE_NEXT(ptr, end)                                      \
     (((ptr) + 1 < (end) &&                                             \
       _Py_UNICODE_IS_HIGH_SURROGATE((ptr)[0]) &&                       \
       _Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ?                        \
      ((ptr) += 2, _Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) : \
      (Py_UCS4)*(ptr)++)
#endif
6473
6474Py_ssize_t PyUnicode_Count(PyObject *str,
6475                           PyObject *substr,
6476                           Py_ssize_t start,
6477                           Py_ssize_t end)
6478{
6479    Py_ssize_t result;
6480    PyUnicodeObject* str_obj;
6481    PyUnicodeObject* sub_obj;
6482
6483    str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6484    if (!str_obj)
6485        return -1;
6486    sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6487    if (!sub_obj) {
6488        Py_DECREF(str_obj);
6489        return -1;
6490    }
6491
6492    ADJUST_INDICES(start, end, str_obj->length);
6493    result = stringlib_count(
6494        str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6495        PY_SSIZE_T_MAX
6496        );
6497
6498    Py_DECREF(sub_obj);
6499    Py_DECREF(str_obj);
6500
6501    return result;
6502}
6503
6504Py_ssize_t PyUnicode_Find(PyObject *str,
6505                          PyObject *sub,
6506                          Py_ssize_t start,
6507                          Py_ssize_t end,
6508                          int direction)
6509{
6510    Py_ssize_t result;
6511
6512    str = PyUnicode_FromObject(str);
6513    if (!str)
6514        return -2;
6515    sub = PyUnicode_FromObject(sub);
6516    if (!sub) {
6517        Py_DECREF(str);
6518        return -2;
6519    }
6520
6521    if (direction > 0)
6522        result = stringlib_find_slice(
6523            PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6524            PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6525            start, end
6526            );
6527    else
6528        result = stringlib_rfind_slice(
6529            PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6530            PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6531            start, end
6532            );
6533
6534    Py_DECREF(str);
6535    Py_DECREF(sub);
6536
6537    return result;
6538}
6539
6540static
6541int tailmatch(PyUnicodeObject *self,
6542              PyUnicodeObject *substring,
6543              Py_ssize_t start,
6544              Py_ssize_t end,
6545              int direction)
6546{
6547    if (substring->length == 0)
6548        return 1;
6549
6550    ADJUST_INDICES(start, end, self->length);
6551    end -= substring->length;
6552    if (end < start)
6553        return 0;
6554
6555    if (direction > 0) {
6556        if (Py_UNICODE_MATCH(self, end, substring))
6557            return 1;
6558    } else {
6559        if (Py_UNICODE_MATCH(self, start, substring))
6560            return 1;
6561    }
6562
6563    return 0;
6564}
6565
6566Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
6567                               PyObject *substr,
6568                               Py_ssize_t start,
6569                               Py_ssize_t end,
6570                               int direction)
6571{
6572    Py_ssize_t result;
6573
6574    str = PyUnicode_FromObject(str);
6575    if (str == NULL)
6576        return -1;
6577    substr = PyUnicode_FromObject(substr);
6578    if (substr == NULL) {
6579        Py_DECREF(str);
6580        return -1;
6581    }
6582
6583    result = tailmatch((PyUnicodeObject *)str,
6584                       (PyUnicodeObject *)substr,
6585                       start, end, direction);
6586    Py_DECREF(str);
6587    Py_DECREF(substr);
6588    return result;
6589}
6590
6591/* Apply fixfct filter to the Unicode object self and return a
6592   reference to the modified object */
6593
6594static
6595PyObject *fixup(PyUnicodeObject *self,
6596                int (*fixfct)(PyUnicodeObject *s))
6597{
6598
6599    PyUnicodeObject *u;
6600
6601    u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6602    if (u == NULL)
6603        return NULL;
6604
6605    Py_UNICODE_COPY(u->str, self->str, self->length);
6606
6607    if (!fixfct(u) && PyUnicode_CheckExact(self)) {
6608        /* fixfct should return TRUE if it modified the buffer. If
6609           FALSE, return a reference to the original buffer instead
6610           (to save space, not time) */
6611        Py_INCREF(self);
6612        Py_DECREF(u);
6613        return (PyObject*) self;
6614    }
6615    return (PyObject*) u;
6616}
6617
6618static
6619int fixupper(PyUnicodeObject *self)
6620{
6621    Py_ssize_t len = self->length;
6622    Py_UNICODE *s = self->str;
6623    int status = 0;
6624
6625    while (len-- > 0) {
6626        register Py_UNICODE ch;
6627
6628        ch = Py_UNICODE_TOUPPER(*s);
6629        if (ch != *s) {
6630            status = 1;
6631            *s = ch;
6632        }
6633        s++;
6634    }
6635
6636    return status;
6637}
6638
6639static
6640int fixlower(PyUnicodeObject *self)
6641{
6642    Py_ssize_t len = self->length;
6643    Py_UNICODE *s = self->str;
6644    int status = 0;
6645
6646    while (len-- > 0) {
6647        register Py_UNICODE ch;
6648
6649        ch = Py_UNICODE_TOLOWER(*s);
6650        if (ch != *s) {
6651            status = 1;
6652            *s = ch;
6653        }
6654        s++;
6655    }
6656
6657    return status;
6658}
6659
6660static
6661int fixswapcase(PyUnicodeObject *self)
6662{
6663    Py_ssize_t len = self->length;
6664    Py_UNICODE *s = self->str;
6665    int status = 0;
6666
6667    while (len-- > 0) {
6668        if (Py_UNICODE_ISUPPER(*s)) {
6669            *s = Py_UNICODE_TOLOWER(*s);
6670            status = 1;
6671        } else if (Py_UNICODE_ISLOWER(*s)) {
6672            *s = Py_UNICODE_TOUPPER(*s);
6673            status = 1;
6674        }
6675        s++;
6676    }
6677
6678    return status;
6679}
6680
6681static
6682int fixcapitalize(PyUnicodeObject *self)
6683{
6684    Py_ssize_t len = self->length;
6685    Py_UNICODE *s = self->str;
6686    int status = 0;
6687
6688    if (len == 0)
6689        return 0;
6690    if (!Py_UNICODE_ISUPPER(*s)) {
6691        *s = Py_UNICODE_TOUPPER(*s);
6692        status = 1;
6693    }
6694    s++;
6695    while (--len > 0) {
6696        if (!Py_UNICODE_ISLOWER(*s)) {
6697            *s = Py_UNICODE_TOLOWER(*s);
6698            status = 1;
6699        }
6700        s++;
6701    }
6702    return status;
6703}
6704
6705static
6706int fixtitle(PyUnicodeObject *self)
6707{
6708    register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6709    register Py_UNICODE *e;
6710    int previous_is_cased;
6711
6712    /* Shortcut for single character strings */
6713    if (PyUnicode_GET_SIZE(self) == 1) {
6714        Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6715        if (*p != ch) {
6716            *p = ch;
6717            return 1;
6718        }
6719        else
6720            return 0;
6721    }
6722
6723    e = p + PyUnicode_GET_SIZE(self);
6724    previous_is_cased = 0;
6725    for (; p < e; p++) {
6726        register const Py_UNICODE ch = *p;
6727
6728        if (previous_is_cased)
6729            *p = Py_UNICODE_TOLOWER(ch);
6730        else
6731            *p = Py_UNICODE_TOTITLE(ch);
6732
6733        if (Py_UNICODE_ISLOWER(ch) ||
6734            Py_UNICODE_ISUPPER(ch) ||
6735            Py_UNICODE_ISTITLE(ch))
6736            previous_is_cased = 1;
6737        else
6738            previous_is_cased = 0;
6739    }
6740    return 1;
6741}
6742
6743PyObject *
6744PyUnicode_Join(PyObject *separator, PyObject *seq)
6745{
6746    const Py_UNICODE blank = ' ';
6747    const Py_UNICODE *sep = &blank;
6748    Py_ssize_t seplen = 1;
6749    PyUnicodeObject *res = NULL; /* the result */
6750    Py_UNICODE *res_p;       /* pointer to free byte in res's string area */
6751    PyObject *fseq;          /* PySequence_Fast(seq) */
6752    Py_ssize_t seqlen;       /* len(fseq) -- number of items in sequence */
6753    PyObject **items;
6754    PyObject *item;
6755    Py_ssize_t sz, i;
6756
6757    fseq = PySequence_Fast(seq, "");
6758    if (fseq == NULL) {
6759        return NULL;
6760    }
6761
6762    /* NOTE: the following code can't call back into Python code,
6763     * so we are sure that fseq won't be mutated.
6764     */
6765
6766    seqlen = PySequence_Fast_GET_SIZE(fseq);
6767    /* If empty sequence, return u"". */
6768    if (seqlen == 0) {
6769        res = _PyUnicode_New(0);  /* empty sequence; return u"" */
6770        goto Done;
6771    }
6772    items = PySequence_Fast_ITEMS(fseq);
6773    /* If singleton sequence with an exact Unicode, return that. */
6774    if (seqlen == 1) {
6775        item = items[0];
6776        if (PyUnicode_CheckExact(item)) {
6777            Py_INCREF(item);
6778            res = (PyUnicodeObject *)item;
6779            goto Done;
6780        }
6781    }
6782    else {
6783        /* Set up sep and seplen */
6784        if (separator == NULL) {
6785            sep = &blank;
6786            seplen = 1;
6787        }
6788        else {
6789            if (!PyUnicode_Check(separator)) {
6790                PyErr_Format(PyExc_TypeError,
6791                             "separator: expected str instance,"
6792                             " %.80s found",
6793                             Py_TYPE(separator)->tp_name);
6794                goto onError;
6795            }
6796            sep = PyUnicode_AS_UNICODE(separator);
6797            seplen = PyUnicode_GET_SIZE(separator);
6798        }
6799    }
6800
6801    /* There are at least two things to join, or else we have a subclass
6802     * of str in the sequence.
6803     * Do a pre-pass to figure out the total amount of space we'll
6804     * need (sz), and see whether all argument are strings.
6805     */
6806    sz = 0;
6807    for (i = 0; i < seqlen; i++) {
6808        const Py_ssize_t old_sz = sz;
6809        item = items[i];
6810        if (!PyUnicode_Check(item)) {
6811            PyErr_Format(PyExc_TypeError,
6812                         "sequence item %zd: expected str instance,"
6813                         " %.80s found",
6814                         i, Py_TYPE(item)->tp_name);
6815            goto onError;
6816        }
6817        sz += PyUnicode_GET_SIZE(item);
6818        if (i != 0)
6819            sz += seplen;
6820        if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6821            PyErr_SetString(PyExc_OverflowError,
6822                            "join() result is too long for a Python string");
6823            goto onError;
6824        }
6825    }
6826
6827    res = _PyUnicode_New(sz);
6828    if (res == NULL)
6829        goto onError;
6830
6831    /* Catenate everything. */
6832    res_p = PyUnicode_AS_UNICODE(res);
6833    for (i = 0; i < seqlen; ++i) {
6834        Py_ssize_t itemlen;
6835        item = items[i];
6836        itemlen = PyUnicode_GET_SIZE(item);
6837        /* Copy item, and maybe the separator. */
6838        if (i) {
6839            Py_UNICODE_COPY(res_p, sep, seplen);
6840            res_p += seplen;
6841        }
6842        Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6843        res_p += itemlen;
6844    }
6845
6846  Done:
6847    Py_DECREF(fseq);
6848    return (PyObject *)res;
6849
6850  onError:
6851    Py_DECREF(fseq);
6852    Py_XDECREF(res);
6853    return NULL;
6854}
6855
6856static
6857PyUnicodeObject *pad(PyUnicodeObject *self,
6858                     Py_ssize_t left,
6859                     Py_ssize_t right,
6860                     Py_UNICODE fill)
6861{
6862    PyUnicodeObject *u;
6863
6864    if (left < 0)
6865        left = 0;
6866    if (right < 0)
6867        right = 0;
6868
6869    if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
6870        Py_INCREF(self);
6871        return self;
6872    }
6873
6874    if (left > PY_SSIZE_T_MAX - self->length ||
6875        right > PY_SSIZE_T_MAX - (left + self->length)) {
6876        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6877        return NULL;
6878    }
6879    u = _PyUnicode_New(left + self->length + right);
6880    if (u) {
6881        if (left)
6882            Py_UNICODE_FILL(u->str, fill, left);
6883        Py_UNICODE_COPY(u->str + left, self->str, self->length);
6884        if (right)
6885            Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6886    }
6887
6888    return u;
6889}
6890
6891PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
6892{
6893    PyObject *list;
6894
6895    string = PyUnicode_FromObject(string);
6896    if (string == NULL)
6897        return NULL;
6898
6899    list = stringlib_splitlines(
6900        (PyObject*) string, PyUnicode_AS_UNICODE(string),
6901        PyUnicode_GET_SIZE(string), keepends);
6902
6903    Py_DECREF(string);
6904    return list;
6905}
6906
6907static
6908PyObject *split(PyUnicodeObject *self,
6909                PyUnicodeObject *substring,
6910                Py_ssize_t maxcount)
6911{
6912    if (maxcount < 0)
6913        maxcount = PY_SSIZE_T_MAX;
6914
6915    if (substring == NULL)
6916        return stringlib_split_whitespace(
6917            (PyObject*) self,  self->str, self->length, maxcount
6918            );
6919
6920    return stringlib_split(
6921        (PyObject*) self,  self->str, self->length,
6922        substring->str, substring->length,
6923        maxcount
6924        );
6925}
6926
6927static
6928PyObject *rsplit(PyUnicodeObject *self,
6929                 PyUnicodeObject *substring,
6930                 Py_ssize_t maxcount)
6931{
6932    if (maxcount < 0)
6933        maxcount = PY_SSIZE_T_MAX;
6934
6935    if (substring == NULL)
6936        return stringlib_rsplit_whitespace(
6937            (PyObject*) self,  self->str, self->length, maxcount
6938            );
6939
6940    return stringlib_rsplit(
6941        (PyObject*) self,  self->str, self->length,
6942        substring->str, substring->length,
6943        maxcount
6944        );
6945}
6946
6947static
6948PyObject *replace(PyUnicodeObject *self,
6949                  PyUnicodeObject *str1,
6950                  PyUnicodeObject *str2,
6951                  Py_ssize_t maxcount)
6952{
6953    PyUnicodeObject *u;
6954
6955    if (maxcount < 0)
6956        maxcount = PY_SSIZE_T_MAX;
6957    else if (maxcount == 0 || self->length == 0)
6958        goto nothing;
6959
6960    if (str1->length == str2->length) {
6961        Py_ssize_t i;
6962        /* same length */
6963        if (str1->length == 0)
6964            goto nothing;
6965        if (str1->length == 1) {
6966            /* replace characters */
6967            Py_UNICODE u1, u2;
6968            if (!findchar(self->str, self->length, str1->str[0]))
6969                goto nothing;
6970            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6971            if (!u)
6972                return NULL;
6973            Py_UNICODE_COPY(u->str, self->str, self->length);
6974            u1 = str1->str[0];
6975            u2 = str2->str[0];
6976            for (i = 0; i < u->length; i++)
6977                if (u->str[i] == u1) {
6978                    if (--maxcount < 0)
6979                        break;
6980                    u->str[i] = u2;
6981                }
6982        } else {
6983            i = stringlib_find(
6984                self->str, self->length, str1->str, str1->length, 0
6985                );
6986            if (i < 0)
6987                goto nothing;
6988            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6989            if (!u)
6990                return NULL;
6991            Py_UNICODE_COPY(u->str, self->str, self->length);
6992
6993            /* change everything in-place, starting with this one */
6994            Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6995            i += str1->length;
6996
6997            while ( --maxcount > 0) {
6998                i = stringlib_find(self->str+i, self->length-i,
6999                                   str1->str, str1->length,
7000                                   i);
7001                if (i == -1)
7002                    break;
7003                Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
7004                i += str1->length;
7005            }
7006        }
7007    } else {
7008
7009        Py_ssize_t n, i, j;
7010        Py_ssize_t product, new_size, delta;
7011        Py_UNICODE *p;
7012
7013        /* replace strings */
7014        n = stringlib_count(self->str, self->length, str1->str, str1->length,
7015                            maxcount);
7016        if (n == 0)
7017            goto nothing;
7018        /* new_size = self->length + n * (str2->length - str1->length)); */
7019        delta = (str2->length - str1->length);
7020        if (delta == 0) {
7021            new_size = self->length;
7022        } else {
7023            product = n * (str2->length - str1->length);
7024            if ((product / (str2->length - str1->length)) != n) {
7025                PyErr_SetString(PyExc_OverflowError,
7026                                "replace string is too long");
7027                return NULL;
7028            }
7029            new_size = self->length + product;
7030            if (new_size < 0) {
7031                PyErr_SetString(PyExc_OverflowError,
7032                                "replace string is too long");
7033                return NULL;
7034            }
7035        }
7036        u = _PyUnicode_New(new_size);
7037        if (!u)
7038            return NULL;
7039        i = 0;
7040        p = u->str;
7041        if (str1->length > 0) {
7042            while (n-- > 0) {
7043                /* look for next match */
7044                j = stringlib_find(self->str+i, self->length-i,
7045                                   str1->str, str1->length,
7046                                   i);
7047                if (j == -1)
7048                    break;
7049                else if (j > i) {
7050                    /* copy unchanged part [i:j] */
7051                    Py_UNICODE_COPY(p, self->str+i, j-i);
7052                    p += j - i;
7053                }
7054                /* copy substitution string */
7055                if (str2->length > 0) {
7056                    Py_UNICODE_COPY(p, str2->str, str2->length);
7057                    p += str2->length;
7058                }
7059                i = j + str1->length;
7060            }
7061            if (i < self->length)
7062                /* copy tail [i:] */
7063                Py_UNICODE_COPY(p, self->str+i, self->length-i);
7064        } else {
7065            /* interleave */
7066            while (n > 0) {
7067                Py_UNICODE_COPY(p, str2->str, str2->length);
7068                p += str2->length;
7069                if (--n <= 0)
7070                    break;
7071                *p++ = self->str[i++];
7072            }
7073            Py_UNICODE_COPY(p, self->str+i, self->length-i);
7074        }
7075    }
7076    return (PyObject *) u;
7077
7078  nothing:
7079    /* nothing to replace; return original string (when possible) */
7080    if (PyUnicode_CheckExact(self)) {
7081        Py_INCREF(self);
7082        return (PyObject *) self;
7083    }
7084    return PyUnicode_FromUnicode(self->str, self->length);
7085}
7086
7087/* --- Unicode Object Methods --------------------------------------------- */
7088
7089PyDoc_STRVAR(title__doc__,
7090             "S.title() -> str\n\
7091\n\
7092Return a titlecased version of S, i.e. words start with title case\n\
7093characters, all remaining cased characters have lower case.");
7094
7095static PyObject*
7096unicode_title(PyUnicodeObject *self)
7097{
7098    return fixup(self, fixtitle);
7099}
7100
7101PyDoc_STRVAR(capitalize__doc__,
7102             "S.capitalize() -> str\n\
7103\n\
7104Return a capitalized version of S, i.e. make the first character\n\
7105have upper case and the rest lower case.");
7106
7107static PyObject*
7108unicode_capitalize(PyUnicodeObject *self)
7109{
7110    return fixup(self, fixcapitalize);
7111}
7112
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split `self` on whitespace,
 * capitalize each word with fixup()/fixcapitalize, and join the words
 * again with PyUnicode_Join() using its default separator.  Returns
 * NULL if any step fails.
 */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t i;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list slot in place */
    for (i = 0; i < PyList_GET_SIZE(words); i++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, i),
                            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, i));
        PyList_SET_ITEM(words, i, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return joined;
}
#endif
7150
7151/* Argument converter.  Coerces to a single unicode character */
7152
7153static int
7154convert_uc(PyObject *obj, void *addr)
7155{
7156    Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
7157    PyObject *uniobj;
7158    Py_UNICODE *unistr;
7159
7160    uniobj = PyUnicode_FromObject(obj);
7161    if (uniobj == NULL) {
7162        PyErr_SetString(PyExc_TypeError,
7163                        "The fill character cannot be converted to Unicode");
7164        return 0;
7165    }
7166    if (PyUnicode_GET_SIZE(uniobj) != 1) {
7167        PyErr_SetString(PyExc_TypeError,
7168                        "The fill character must be exactly one character long");
7169        Py_DECREF(uniobj);
7170        return 0;
7171    }
7172    unistr = PyUnicode_AS_UNICODE(uniobj);
7173    *fillcharloc = unistr[0];
7174    Py_DECREF(uniobj);
7175    return 1;
7176}
7177
7178PyDoc_STRVAR(center__doc__,
7179             "S.center(width[, fillchar]) -> str\n\
7180\n\
7181Return S centered in a string of length width. Padding is\n\
7182done using the specified fill character (default is a space)");
7183
7184static PyObject *
7185unicode_center(PyUnicodeObject *self, PyObject *args)
7186{
7187    Py_ssize_t marg, left;
7188    Py_ssize_t width;
7189    Py_UNICODE fillchar = ' ';
7190
7191    if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
7192        return NULL;
7193
7194    if (self->length >= width && PyUnicode_CheckExact(self)) {
7195        Py_INCREF(self);
7196        return (PyObject*) self;
7197    }
7198
7199    marg = width - self->length;
7200    left = marg / 2 + (marg & width & 1);
7201
7202    return (PyObject*) pad(self, left, marg - left, fillchar);
7203}
7204
7205#if 0
7206
7207/* This code should go into some future Unicode collation support
7208   module. The basic comparison should compare ordinals on a naive
7209   basis (this is what Java does and thus Jython too). */
7210
7211/* speedy UTF-16 code point order comparison */
7212/* gleaned from: */
7213/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
7214
7215static short utf16Fixup[32] =
7216{
7217    0, 0, 0, 0, 0, 0, 0, 0,
7218    0, 0, 0, 0, 0, 0, 0, 0,
7219    0, 0, 0, 0, 0, 0, 0, 0,
7220    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
7221};
7222
7223static int
7224unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7225{
7226    Py_ssize_t len1, len2;
7227
7228    Py_UNICODE *s1 = str1->str;
7229    Py_UNICODE *s2 = str2->str;
7230
7231    len1 = str1->length;
7232    len2 = str2->length;
7233
7234    while (len1 > 0 && len2 > 0) {
7235        Py_UNICODE c1, c2;
7236
7237        c1 = *s1++;
7238        c2 = *s2++;
7239
7240        if (c1 > (1<<11) * 26)
7241            c1 += utf16Fixup[c1>>11];
7242        if (c2 > (1<<11) * 26)
7243            c2 += utf16Fixup[c2>>11];
7244        /* now c1 and c2 are in UTF-32-compatible order */
7245
7246        if (c1 != c2)
7247            return (c1 < c2) ? -1 : 1;
7248
7249        len1--; len2--;
7250    }
7251
7252    return (len1 < len2) ? -1 : (len1 != len2);
7253}
7254
7255#else
7256
7257static int
7258unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7259{
7260    register Py_ssize_t len1, len2;
7261
7262    Py_UNICODE *s1 = str1->str;
7263    Py_UNICODE *s2 = str2->str;
7264
7265    len1 = str1->length;
7266    len2 = str2->length;
7267
7268    while (len1 > 0 && len2 > 0) {
7269        Py_UNICODE c1, c2;
7270
7271        c1 = *s1++;
7272        c2 = *s2++;
7273
7274        if (c1 != c2)
7275            return (c1 < c2) ? -1 : 1;
7276
7277        len1--; len2--;
7278    }
7279
7280    return (len1 < len2) ? -1 : (len1 != len2);
7281}
7282
7283#endif
7284
7285int PyUnicode_Compare(PyObject *left,
7286                      PyObject *right)
7287{
7288    if (PyUnicode_Check(left) && PyUnicode_Check(right))
7289        return unicode_compare((PyUnicodeObject *)left,
7290                               (PyUnicodeObject *)right);
7291    PyErr_Format(PyExc_TypeError,
7292                 "Can't compare %.100s and %.100s",
7293                 left->ob_type->tp_name,
7294                 right->ob_type->tp_name);
7295    return -1;
7296}
7297
7298int
7299PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
7300{
7301    int i;
7302    Py_UNICODE *id;
7303    assert(PyUnicode_Check(uni));
7304    id = PyUnicode_AS_UNICODE(uni);
7305    /* Compare Unicode string and source character set string */
7306    for (i = 0; id[i] && str[i]; i++)
7307        if (id[i] != str[i])
7308            return ((int)id[i] < (int)str[i]) ? -1 : 1;
7309    /* This check keeps Python strings that end in '\0' from comparing equal
7310     to C strings identical up to that point. */
7311    if (PyUnicode_GET_SIZE(uni) != i || id[i])
7312        return 1; /* uni is longer */
7313    if (str[i])
7314        return -1; /* str is longer */
7315    return 0;
7316}
7317
7318
7319#define TEST_COND(cond)                         \
7320    ((cond) ? Py_True : Py_False)
7321
7322PyObject *PyUnicode_RichCompare(PyObject *left,
7323                                PyObject *right,
7324                                int op)
7325{
7326    int result;
7327
7328    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
7329        PyObject *v;
7330        if (((PyUnicodeObject *) left)->length !=
7331            ((PyUnicodeObject *) right)->length) {
7332            if (op == Py_EQ) {
7333                Py_INCREF(Py_False);
7334                return Py_False;
7335            }
7336            if (op == Py_NE) {
7337                Py_INCREF(Py_True);
7338                return Py_True;
7339            }
7340        }
7341        if (left == right)
7342            result = 0;
7343        else
7344            result = unicode_compare((PyUnicodeObject *)left,
7345                                     (PyUnicodeObject *)right);
7346
7347        /* Convert the return value to a Boolean */
7348        switch (op) {
7349        case Py_EQ:
7350            v = TEST_COND(result == 0);
7351            break;
7352        case Py_NE:
7353            v = TEST_COND(result != 0);
7354            break;
7355        case Py_LE:
7356            v = TEST_COND(result <= 0);
7357            break;
7358        case Py_GE:
7359            v = TEST_COND(result >= 0);
7360            break;
7361        case Py_LT:
7362            v = TEST_COND(result == -1);
7363            break;
7364        case Py_GT:
7365            v = TEST_COND(result == 1);
7366            break;
7367        default:
7368            PyErr_BadArgument();
7369            return NULL;
7370        }
7371        Py_INCREF(v);
7372        return v;
7373    }
7374
7375    Py_INCREF(Py_NotImplemented);
7376    return Py_NotImplemented;
7377}
7378
7379int PyUnicode_Contains(PyObject *container,
7380                       PyObject *element)
7381{
7382    PyObject *str, *sub;
7383    int result;
7384
7385    /* Coerce the two arguments */
7386    sub = PyUnicode_FromObject(element);
7387    if (!sub) {
7388        PyErr_Format(PyExc_TypeError,
7389                     "'in <string>' requires string as left operand, not %s",
7390                     element->ob_type->tp_name);
7391        return -1;
7392    }
7393
7394    str = PyUnicode_FromObject(container);
7395    if (!str) {
7396        Py_DECREF(sub);
7397        return -1;
7398    }
7399
7400    result = stringlib_contains_obj(str, sub);
7401
7402    Py_DECREF(str);
7403    Py_DECREF(sub);
7404
7405    return result;
7406}
7407
7408/* Concat to string or Unicode object giving a new Unicode object. */
7409
7410PyObject *PyUnicode_Concat(PyObject *left,
7411                           PyObject *right)
7412{
7413    PyUnicodeObject *u = NULL, *v = NULL, *w;
7414
7415    /* Coerce the two arguments */
7416    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7417    if (u == NULL)
7418        goto onError;
7419    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7420    if (v == NULL)
7421        goto onError;
7422
7423    /* Shortcuts */
7424    if (v == unicode_empty) {
7425        Py_DECREF(v);
7426        return (PyObject *)u;
7427    }
7428    if (u == unicode_empty) {
7429        Py_DECREF(u);
7430        return (PyObject *)v;
7431    }
7432
7433    /* Concat the two Unicode strings */
7434    w = _PyUnicode_New(u->length + v->length);
7435    if (w == NULL)
7436        goto onError;
7437    Py_UNICODE_COPY(w->str, u->str, u->length);
7438    Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7439
7440    Py_DECREF(u);
7441    Py_DECREF(v);
7442    return (PyObject *)w;
7443
7444  onError:
7445    Py_XDECREF(u);
7446    Py_XDECREF(v);
7447    return NULL;
7448}
7449
7450void
7451PyUnicode_Append(PyObject **pleft, PyObject *right)
7452{
7453    PyObject *new;
7454    if (*pleft == NULL)
7455        return;
7456    if (right == NULL || !PyUnicode_Check(*pleft)) {
7457        Py_DECREF(*pleft);
7458        *pleft = NULL;
7459        return;
7460    }
7461    new = PyUnicode_Concat(*pleft, right);
7462    Py_DECREF(*pleft);
7463    *pleft = new;
7464}
7465
7466void
7467PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7468{
7469    PyUnicode_Append(pleft, right);
7470    Py_XDECREF(right);
7471}
7472
7473PyDoc_STRVAR(count__doc__,
7474             "S.count(sub[, start[, end]]) -> int\n\
7475\n\
7476Return the number of non-overlapping occurrences of substring sub in\n\
7477string S[start:end].  Optional arguments start and end are\n\
7478interpreted as in slice notation.");
7479
7480static PyObject *
7481unicode_count(PyUnicodeObject *self, PyObject *args)
7482{
7483    PyUnicodeObject *substring;
7484    Py_ssize_t start = 0;
7485    Py_ssize_t end = PY_SSIZE_T_MAX;
7486    PyObject *result;
7487
7488    if (!stringlib_parse_args_finds_unicode("count", args, &substring,
7489                                            &start, &end))
7490        return NULL;
7491
7492    ADJUST_INDICES(start, end, self->length);
7493    result = PyLong_FromSsize_t(
7494        stringlib_count(self->str + start, end - start,
7495                        substring->str, substring->length,
7496                        PY_SSIZE_T_MAX)
7497        );
7498
7499    Py_DECREF(substring);
7500
7501    return result;
7502}
7503
7504PyDoc_STRVAR(encode__doc__,
7505             "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
7506\n\
7507Encode S using the codec registered for encoding. Default encoding\n\
7508is 'utf-8'. errors may be given to set a different error\n\
7509handling scheme. Default is 'strict' meaning that encoding errors raise\n\
7510a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7511'xmlcharrefreplace' as well as any other name registered with\n\
7512codecs.register_error that can handle UnicodeEncodeErrors.");
7513
7514static PyObject *
7515unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
7516{
7517    static char *kwlist[] = {"encoding", "errors", 0};
7518    char *encoding = NULL;
7519    char *errors = NULL;
7520
7521    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7522                                     kwlist, &encoding, &errors))
7523        return NULL;
7524    return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
7525}
7526
7527PyDoc_STRVAR(expandtabs__doc__,
7528             "S.expandtabs([tabsize]) -> str\n\
7529\n\
7530Return a copy of S where all tab characters are expanded using spaces.\n\
7531If tabsize is not given, a tab size of 8 characters is assumed.");
7532
7533static PyObject*
7534unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7535{
7536    Py_UNICODE *e;
7537    Py_UNICODE *p;
7538    Py_UNICODE *q;
7539    Py_UNICODE *qe;
7540    Py_ssize_t i, j, incr;
7541    PyUnicodeObject *u;
7542    int tabsize = 8;
7543
7544    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
7545        return NULL;
7546
7547    /* First pass: determine size of output string */
7548    i = 0; /* chars up to and including most recent \n or \r */
7549    j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7550    e = self->str + self->length; /* end of input */
7551    for (p = self->str; p < e; p++)
7552        if (*p == '\t') {
7553            if (tabsize > 0) {
7554                incr = tabsize - (j % tabsize); /* cannot overflow */
7555                if (j > PY_SSIZE_T_MAX - incr)
7556                    goto overflow1;
7557                j += incr;
7558            }
7559        }
7560        else {
7561            if (j > PY_SSIZE_T_MAX - 1)
7562                goto overflow1;
7563            j++;
7564            if (*p == '\n' || *p == '\r') {
7565                if (i > PY_SSIZE_T_MAX - j)
7566                    goto overflow1;
7567                i += j;
7568                j = 0;
7569            }
7570        }
7571
7572    if (i > PY_SSIZE_T_MAX - j)
7573        goto overflow1;
7574
7575    /* Second pass: create output string and fill it */
7576    u = _PyUnicode_New(i + j);
7577    if (!u)
7578        return NULL;
7579
7580    j = 0; /* same as in first pass */
7581    q = u->str; /* next output char */
7582    qe = u->str + u->length; /* end of output */
7583
7584    for (p = self->str; p < e; p++)
7585        if (*p == '\t') {
7586            if (tabsize > 0) {
7587                i = tabsize - (j % tabsize);
7588                j += i;
7589                while (i--) {
7590                    if (q >= qe)
7591                        goto overflow2;
7592                    *q++ = ' ';
7593                }
7594            }
7595        }
7596        else {
7597            if (q >= qe)
7598                goto overflow2;
7599            *q++ = *p;
7600            j++;
7601            if (*p == '\n' || *p == '\r')
7602                j = 0;
7603        }
7604
7605    return (PyObject*) u;
7606
7607  overflow2:
7608    Py_DECREF(u);
7609  overflow1:
7610    PyErr_SetString(PyExc_OverflowError, "new string is too long");
7611    return NULL;
7612}
7613
7614PyDoc_STRVAR(find__doc__,
7615             "S.find(sub[, start[, end]]) -> int\n\
7616\n\
7617Return the lowest index in S where substring sub is found,\n\
7618such that sub is contained within S[start:end].  Optional\n\
7619arguments start and end are interpreted as in slice notation.\n\
7620\n\
7621Return -1 on failure.");
7622
7623static PyObject *
7624unicode_find(PyUnicodeObject *self, PyObject *args)
7625{
7626    PyUnicodeObject *substring;
7627    Py_ssize_t start;
7628    Py_ssize_t end;
7629    Py_ssize_t result;
7630
7631    if (!stringlib_parse_args_finds_unicode("find", args, &substring,
7632                                            &start, &end))
7633        return NULL;
7634
7635    result = stringlib_find_slice(
7636        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7637        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7638        start, end
7639        );
7640
7641    Py_DECREF(substring);
7642
7643    return PyLong_FromSsize_t(result);
7644}
7645
7646static PyObject *
7647unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
7648{
7649    if (index < 0 || index >= self->length) {
7650        PyErr_SetString(PyExc_IndexError, "string index out of range");
7651        return NULL;
7652    }
7653
7654    return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7655}
7656
7657/* Believe it or not, this produces the same value for ASCII strings
7658   as string_hash(). */
7659static Py_hash_t
7660unicode_hash(PyUnicodeObject *self)
7661{
7662    Py_ssize_t len;
7663    Py_UNICODE *p;
7664    Py_hash_t x;
7665
7666#ifdef Py_DEBUG
7667    assert(_Py_HashSecret_Initialized);
7668#endif
7669    if (self->hash != -1)
7670        return self->hash;
7671    len = Py_SIZE(self);
7672    /*
7673      We make the hash of the empty string be 0, rather than using
7674      (prefix ^ suffix), since this slightly obfuscates the hash secret
7675    */
7676    if (len == 0) {
7677        self->hash = 0;
7678        return 0;
7679    }
7680    p = self->str;
7681    x = _Py_HashSecret.prefix;
7682    x ^= *p << 7;
7683    while (--len >= 0)
7684        x = (_PyHASH_MULTIPLIER*x) ^ *p++;
7685    x ^= Py_SIZE(self);
7686    x ^= _Py_HashSecret.suffix;
7687    if (x == -1)
7688        x = -2;
7689    self->hash = x;
7690    return x;
7691}
7692
7693PyDoc_STRVAR(index__doc__,
7694             "S.index(sub[, start[, end]]) -> int\n\
7695\n\
7696Like S.find() but raise ValueError when the substring is not found.");
7697
7698static PyObject *
7699unicode_index(PyUnicodeObject *self, PyObject *args)
7700{
7701    Py_ssize_t result;
7702    PyUnicodeObject *substring;
7703    Py_ssize_t start;
7704    Py_ssize_t end;
7705
7706    if (!stringlib_parse_args_finds_unicode("index", args, &substring,
7707                                            &start, &end))
7708        return NULL;
7709
7710    result = stringlib_find_slice(
7711        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7712        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7713        start, end
7714        );
7715
7716    Py_DECREF(substring);
7717
7718    if (result < 0) {
7719        PyErr_SetString(PyExc_ValueError, "substring not found");
7720        return NULL;
7721    }
7722
7723    return PyLong_FromSsize_t(result);
7724}
7725
7726PyDoc_STRVAR(islower__doc__,
7727             "S.islower() -> bool\n\
7728\n\
7729Return True if all cased characters in S are lowercase and there is\n\
7730at least one cased character in S, False otherwise.");
7731
7732static PyObject*
7733unicode_islower(PyUnicodeObject *self)
7734{
7735    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7736    register const Py_UNICODE *e;
7737    int cased;
7738
7739    /* Shortcut for single character strings */
7740    if (PyUnicode_GET_SIZE(self) == 1)
7741        return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
7742
7743    /* Special case for empty strings */
7744    if (PyUnicode_GET_SIZE(self) == 0)
7745        return PyBool_FromLong(0);
7746
7747    e = p + PyUnicode_GET_SIZE(self);
7748    cased = 0;
7749    while (p < e) {
7750        const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
7751
7752        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7753            return PyBool_FromLong(0);
7754        else if (!cased && Py_UNICODE_ISLOWER(ch))
7755            cased = 1;
7756    }
7757    return PyBool_FromLong(cased);
7758}
7759
7760PyDoc_STRVAR(isupper__doc__,
7761             "S.isupper() -> bool\n\
7762\n\
7763Return True if all cased characters in S are uppercase and there is\n\
7764at least one cased character in S, False otherwise.");
7765
7766static PyObject*
7767unicode_isupper(PyUnicodeObject *self)
7768{
7769    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7770    register const Py_UNICODE *e;
7771    int cased;
7772
7773    /* Shortcut for single character strings */
7774    if (PyUnicode_GET_SIZE(self) == 1)
7775        return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
7776
7777    /* Special case for empty strings */
7778    if (PyUnicode_GET_SIZE(self) == 0)
7779        return PyBool_FromLong(0);
7780
7781    e = p + PyUnicode_GET_SIZE(self);
7782    cased = 0;
7783    while (p < e) {
7784        const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
7785
7786        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7787            return PyBool_FromLong(0);
7788        else if (!cased && Py_UNICODE_ISUPPER(ch))
7789            cased = 1;
7790    }
7791    return PyBool_FromLong(cased);
7792}
7793
7794PyDoc_STRVAR(istitle__doc__,
7795             "S.istitle() -> bool\n\
7796\n\
7797Return True if S is a titlecased string and there is at least one\n\
7798character in S, i.e. upper- and titlecase characters may only\n\
7799follow uncased characters and lowercase characters only cased ones.\n\
7800Return False otherwise.");
7801
7802static PyObject*
7803unicode_istitle(PyUnicodeObject *self)
7804{
7805    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7806    register const Py_UNICODE *e;
7807    int cased, previous_is_cased;
7808
7809    /* Shortcut for single character strings */
7810    if (PyUnicode_GET_SIZE(self) == 1)
7811        return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7812                               (Py_UNICODE_ISUPPER(*p) != 0));
7813
7814    /* Special case for empty strings */
7815    if (PyUnicode_GET_SIZE(self) == 0)
7816        return PyBool_FromLong(0);
7817
7818    e = p + PyUnicode_GET_SIZE(self);
7819    cased = 0;
7820    previous_is_cased = 0;
7821    while (p < e) {
7822        const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
7823
7824        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7825            if (previous_is_cased)
7826                return PyBool_FromLong(0);
7827            previous_is_cased = 1;
7828            cased = 1;
7829        }
7830        else if (Py_UNICODE_ISLOWER(ch)) {
7831            if (!previous_is_cased)
7832                return PyBool_FromLong(0);
7833            previous_is_cased = 1;
7834            cased = 1;
7835        }
7836        else
7837            previous_is_cased = 0;
7838    }
7839    return PyBool_FromLong(cased);
7840}
7841
7842PyDoc_STRVAR(isspace__doc__,
7843             "S.isspace() -> bool\n\
7844\n\
7845Return True if all characters in S are whitespace\n\
7846and there is at least one character in S, False otherwise.");
7847
7848static PyObject*
7849unicode_isspace(PyUnicodeObject *self)
7850{
7851    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7852    register const Py_UNICODE *e;
7853
7854    /* Shortcut for single character strings */
7855    if (PyUnicode_GET_SIZE(self) == 1 &&
7856        Py_UNICODE_ISSPACE(*p))
7857        return PyBool_FromLong(1);
7858
7859    /* Special case for empty strings */
7860    if (PyUnicode_GET_SIZE(self) == 0)
7861        return PyBool_FromLong(0);
7862
7863    e = p + PyUnicode_GET_SIZE(self);
7864    while (p < e) {
7865        const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
7866        if (!Py_UNICODE_ISSPACE(ch))
7867            return PyBool_FromLong(0);
7868    }
7869    return PyBool_FromLong(1);
7870}
7871
7872PyDoc_STRVAR(isalpha__doc__,
7873             "S.isalpha() -> bool\n\
7874\n\
7875Return True if all characters in S are alphabetic\n\
7876and there is at least one character in S, False otherwise.");
7877
7878static PyObject*
7879unicode_isalpha(PyUnicodeObject *self)
7880{
7881    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7882    register const Py_UNICODE *e;
7883
7884    /* Shortcut for single character strings */
7885    if (PyUnicode_GET_SIZE(self) == 1 &&
7886        Py_UNICODE_ISALPHA(*p))
7887        return PyBool_FromLong(1);
7888
7889    /* Special case for empty strings */
7890    if (PyUnicode_GET_SIZE(self) == 0)
7891        return PyBool_FromLong(0);
7892
7893    e = p + PyUnicode_GET_SIZE(self);
7894    while (p < e) {
7895        if (!Py_UNICODE_ISALPHA(_Py_UNICODE_NEXT(p, e)))
7896            return PyBool_FromLong(0);
7897    }
7898    return PyBool_FromLong(1);
7899}
7900
7901PyDoc_STRVAR(isalnum__doc__,
7902             "S.isalnum() -> bool\n\
7903\n\
7904Return True if all characters in S are alphanumeric\n\
7905and there is at least one character in S, False otherwise.");
7906
7907static PyObject*
7908unicode_isalnum(PyUnicodeObject *self)
7909{
7910    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7911    register const Py_UNICODE *e;
7912
7913    /* Shortcut for single character strings */
7914    if (PyUnicode_GET_SIZE(self) == 1 &&
7915        Py_UNICODE_ISALNUM(*p))
7916        return PyBool_FromLong(1);
7917
7918    /* Special case for empty strings */
7919    if (PyUnicode_GET_SIZE(self) == 0)
7920        return PyBool_FromLong(0);
7921
7922    e = p + PyUnicode_GET_SIZE(self);
7923    while (p < e) {
7924        const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
7925        if (!Py_UNICODE_ISALNUM(ch))
7926            return PyBool_FromLong(0);
7927    }
7928    return PyBool_FromLong(1);
7929}
7930
7931PyDoc_STRVAR(isdecimal__doc__,
7932             "S.isdecimal() -> bool\n\
7933\n\
7934Return True if there are only decimal characters in S,\n\
7935False otherwise.");
7936
7937static PyObject*
7938unicode_isdecimal(PyUnicodeObject *self)
7939{
7940    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7941    register const Py_UNICODE *e;
7942
7943    /* Shortcut for single character strings */
7944    if (PyUnicode_GET_SIZE(self) == 1 &&
7945        Py_UNICODE_ISDECIMAL(*p))
7946        return PyBool_FromLong(1);
7947
7948    /* Special case for empty strings */
7949    if (PyUnicode_GET_SIZE(self) == 0)
7950        return PyBool_FromLong(0);
7951
7952    e = p + PyUnicode_GET_SIZE(self);
7953    while (p < e) {
7954        if (!Py_UNICODE_ISDECIMAL(_Py_UNICODE_NEXT(p, e)))
7955            return PyBool_FromLong(0);
7956    }
7957    return PyBool_FromLong(1);
7958}
7959
7960PyDoc_STRVAR(isdigit__doc__,
7961             "S.isdigit() -> bool\n\
7962\n\
7963Return True if all characters in S are digits\n\
7964and there is at least one character in S, False otherwise.");
7965
7966static PyObject*
7967unicode_isdigit(PyUnicodeObject *self)
7968{
7969    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7970    register const Py_UNICODE *e;
7971
7972    /* Shortcut for single character strings */
7973    if (PyUnicode_GET_SIZE(self) == 1 &&
7974        Py_UNICODE_ISDIGIT(*p))
7975        return PyBool_FromLong(1);
7976
7977    /* Special case for empty strings */
7978    if (PyUnicode_GET_SIZE(self) == 0)
7979        return PyBool_FromLong(0);
7980
7981    e = p + PyUnicode_GET_SIZE(self);
7982    while (p < e) {
7983        if (!Py_UNICODE_ISDIGIT(_Py_UNICODE_NEXT(p, e)))
7984            return PyBool_FromLong(0);
7985    }
7986    return PyBool_FromLong(1);
7987}
7988
7989PyDoc_STRVAR(isnumeric__doc__,
7990             "S.isnumeric() -> bool\n\
7991\n\
7992Return True if there are only numeric characters in S,\n\
7993False otherwise.");
7994
7995static PyObject*
7996unicode_isnumeric(PyUnicodeObject *self)
7997{
7998    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7999    register const Py_UNICODE *e;
8000
8001    /* Shortcut for single character strings */
8002    if (PyUnicode_GET_SIZE(self) == 1 &&
8003        Py_UNICODE_ISNUMERIC(*p))
8004        return PyBool_FromLong(1);
8005
8006    /* Special case for empty strings */
8007    if (PyUnicode_GET_SIZE(self) == 0)
8008        return PyBool_FromLong(0);
8009
8010    e = p + PyUnicode_GET_SIZE(self);
8011    while (p < e) {
8012        if (!Py_UNICODE_ISNUMERIC(_Py_UNICODE_NEXT(p, e)))
8013            return PyBool_FromLong(0);
8014    }
8015    return PyBool_FromLong(1);
8016}
8017
8018int
8019PyUnicode_IsIdentifier(PyObject *self)
8020{
8021    const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
8022    const Py_UNICODE *e;
8023    Py_UCS4 first;
8024
8025    /* Special case for empty strings */
8026    if (PyUnicode_GET_SIZE(self) == 0)
8027        return 0;
8028
8029    /* PEP 3131 says that the first character must be in
8030       XID_Start and subsequent characters in XID_Continue,
8031       and for the ASCII range, the 2.x rules apply (i.e
8032       start with letters and underscore, continue with
8033       letters, digits, underscore). However, given the current
8034       definition of XID_Start and XID_Continue, it is sufficient
8035       to check just for these, except that _ must be allowed
8036       as starting an identifier.  */
8037    e = p + PyUnicode_GET_SIZE(self);
8038    first = _Py_UNICODE_NEXT(p, e);
8039    if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
8040        return 0;
8041
8042    while (p < e)
8043        if (!_PyUnicode_IsXidContinue(_Py_UNICODE_NEXT(p, e)))
8044            return 0;
8045    return 1;
8046}
8047
8048PyDoc_STRVAR(isidentifier__doc__,
8049             "S.isidentifier() -> bool\n\
8050\n\
8051Return True if S is a valid identifier according\n\
8052to the language definition.");
8053
8054static PyObject*
8055unicode_isidentifier(PyObject *self)
8056{
8057    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
8058}
8059
8060PyDoc_STRVAR(isprintable__doc__,
8061             "S.isprintable() -> bool\n\
8062\n\
8063Return True if all characters in S are considered\n\
8064printable in repr() or S is empty, False otherwise.");
8065
8066static PyObject*
8067unicode_isprintable(PyObject *self)
8068{
8069    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
8070    register const Py_UNICODE *e;
8071
8072    /* Shortcut for single character strings */
8073    if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
8074        Py_RETURN_TRUE;
8075    }
8076
8077    e = p + PyUnicode_GET_SIZE(self);
8078    while (p < e) {
8079        if (!Py_UNICODE_ISPRINTABLE(_Py_UNICODE_NEXT(p, e))) {
8080            Py_RETURN_FALSE;
8081        }
8082    }
8083    Py_RETURN_TRUE;
8084}
8085
8086PyDoc_STRVAR(join__doc__,
8087             "S.join(iterable) -> str\n\
8088\n\
8089Return a string which is the concatenation of the strings in the\n\
8090iterable.  The separator between elements is S.");
8091
8092static PyObject*
8093unicode_join(PyObject *self, PyObject *data)
8094{
8095    return PyUnicode_Join(self, data);
8096}
8097
8098static Py_ssize_t
8099unicode_length(PyUnicodeObject *self)
8100{
8101    return self->length;
8102}
8103
8104PyDoc_STRVAR(ljust__doc__,
8105             "S.ljust(width[, fillchar]) -> str\n\
8106\n\
8107Return S left-justified in a Unicode string of length width. Padding is\n\
8108done using the specified fill character (default is a space).");
8109
8110static PyObject *
8111unicode_ljust(PyUnicodeObject *self, PyObject *args)
8112{
8113    Py_ssize_t width;
8114    Py_UNICODE fillchar = ' ';
8115
8116    if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
8117        return NULL;
8118
8119    if (self->length >= width && PyUnicode_CheckExact(self)) {
8120        Py_INCREF(self);
8121        return (PyObject*) self;
8122    }
8123
8124    return (PyObject*) pad(self, 0, width - self->length, fillchar);
8125}
8126
8127PyDoc_STRVAR(lower__doc__,
8128             "S.lower() -> str\n\
8129\n\
8130Return a copy of the string S converted to lowercase.");
8131
8132static PyObject*
8133unicode_lower(PyUnicodeObject *self)
8134{
8135    return fixup(self, fixlower);
8136}
8137
/* Strip modes; they double as indices into stripformat below */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, one per strip mode above */
static const char *stripformat[] = {
    "|O:lstrip",                /* LEFTSTRIP */
    "|O:rstrip",                /* RIGHTSTRIP */
    "|O:strip"                  /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string minus its "|O:" prefix */
#define STRIPNAME(i) (stripformat[(i)] + 3)
8146
8147/* externally visible for str.strip(unicode) */
8148PyObject *
8149_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
8150{
8151    Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8152    Py_ssize_t len = PyUnicode_GET_SIZE(self);
8153    Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
8154    Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
8155    Py_ssize_t i, j;
8156
8157    BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
8158
8159    i = 0;
8160    if (striptype != RIGHTSTRIP) {
8161        while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
8162            i++;
8163        }
8164    }
8165
8166    j = len;
8167    if (striptype != LEFTSTRIP) {
8168        do {
8169            j--;
8170        } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
8171        j++;
8172    }
8173
8174    if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
8175        Py_INCREF(self);
8176        return (PyObject*)self;
8177    }
8178    else
8179        return PyUnicode_FromUnicode(s+i, j-i);
8180}
8181
8182
8183static PyObject *
8184do_strip(PyUnicodeObject *self, int striptype)
8185{
8186    Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8187    Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
8188
8189    i = 0;
8190    if (striptype != RIGHTSTRIP) {
8191        while (i < len && Py_UNICODE_ISSPACE(s[i])) {
8192            i++;
8193        }
8194    }
8195
8196    j = len;
8197    if (striptype != LEFTSTRIP) {
8198        do {
8199            j--;
8200        } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
8201        j++;
8202    }
8203
8204    if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
8205        Py_INCREF(self);
8206        return (PyObject*)self;
8207    }
8208    else
8209        return PyUnicode_FromUnicode(s+i, j-i);
8210}
8211
8212
8213static PyObject *
8214do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
8215{
8216    PyObject *sep = NULL;
8217
8218    if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
8219        return NULL;
8220
8221    if (sep != NULL && sep != Py_None) {
8222        if (PyUnicode_Check(sep))
8223            return _PyUnicode_XStrip(self, striptype, sep);
8224        else {
8225            PyErr_Format(PyExc_TypeError,
8226                         "%s arg must be None or str",
8227                         STRIPNAME(striptype));
8228            return NULL;
8229        }
8230    }
8231
8232    return do_strip(self, striptype);
8233}
8234
8235
8236PyDoc_STRVAR(strip__doc__,
8237             "S.strip([chars]) -> str\n\
8238\n\
8239Return a copy of the string S with leading and trailing\n\
8240whitespace removed.\n\
8241If chars is given and not None, remove characters in chars instead.");
8242
8243static PyObject *
8244unicode_strip(PyUnicodeObject *self, PyObject *args)
8245{
8246    if (PyTuple_GET_SIZE(args) == 0)
8247        return do_strip(self, BOTHSTRIP); /* Common case */
8248    else
8249        return do_argstrip(self, BOTHSTRIP, args);
8250}
8251
8252
8253PyDoc_STRVAR(lstrip__doc__,
8254             "S.lstrip([chars]) -> str\n\
8255\n\
8256Return a copy of the string S with leading whitespace removed.\n\
8257If chars is given and not None, remove characters in chars instead.");
8258
8259static PyObject *
8260unicode_lstrip(PyUnicodeObject *self, PyObject *args)
8261{
8262    if (PyTuple_GET_SIZE(args) == 0)
8263        return do_strip(self, LEFTSTRIP); /* Common case */
8264    else
8265        return do_argstrip(self, LEFTSTRIP, args);
8266}
8267
8268
8269PyDoc_STRVAR(rstrip__doc__,
8270             "S.rstrip([chars]) -> str\n\
8271\n\
8272Return a copy of the string S with trailing whitespace removed.\n\
8273If chars is given and not None, remove characters in chars instead.");
8274
8275static PyObject *
8276unicode_rstrip(PyUnicodeObject *self, PyObject *args)
8277{
8278    if (PyTuple_GET_SIZE(args) == 0)
8279        return do_strip(self, RIGHTSTRIP); /* Common case */
8280    else
8281        return do_argstrip(self, RIGHTSTRIP, args);
8282}
8283
8284
8285static PyObject*
8286unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
8287{
8288    PyUnicodeObject *u;
8289    Py_UNICODE *p;
8290    Py_ssize_t nchars;
8291    size_t nbytes;
8292
8293    if (len < 1) {
8294        Py_INCREF(unicode_empty);
8295        return (PyObject *)unicode_empty;
8296    }
8297
8298    if (len == 1 && PyUnicode_CheckExact(str)) {
8299        /* no repeat, return original string */
8300        Py_INCREF(str);
8301        return (PyObject*) str;
8302    }
8303
8304    /* ensure # of chars needed doesn't overflow int and # of bytes
8305     * needed doesn't overflow size_t
8306     */
8307    nchars = len * str->length;
8308    if (nchars / len != str->length) {
8309        PyErr_SetString(PyExc_OverflowError,
8310                        "repeated string is too long");
8311        return NULL;
8312    }
8313    nbytes = (nchars + 1) * sizeof(Py_UNICODE);
8314    if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
8315        PyErr_SetString(PyExc_OverflowError,
8316                        "repeated string is too long");
8317        return NULL;
8318    }
8319    u = _PyUnicode_New(nchars);
8320    if (!u)
8321        return NULL;
8322
8323    p = u->str;
8324
8325    if (str->length == 1) {
8326        Py_UNICODE_FILL(p, str->str[0], len);
8327    } else {
8328        Py_ssize_t done = str->length; /* number of characters copied this far */
8329        Py_UNICODE_COPY(p, str->str, str->length);
8330        while (done < nchars) {
8331            Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
8332            Py_UNICODE_COPY(p+done, p, n);
8333            done += n;
8334        }
8335    }
8336
8337    return (PyObject*) u;
8338}
8339
8340PyObject *PyUnicode_Replace(PyObject *obj,
8341                            PyObject *subobj,
8342                            PyObject *replobj,
8343                            Py_ssize_t maxcount)
8344{
8345    PyObject *self;
8346    PyObject *str1;
8347    PyObject *str2;
8348    PyObject *result;
8349
8350    self = PyUnicode_FromObject(obj);
8351    if (self == NULL)
8352        return NULL;
8353    str1 = PyUnicode_FromObject(subobj);
8354    if (str1 == NULL) {
8355        Py_DECREF(self);
8356        return NULL;
8357    }
8358    str2 = PyUnicode_FromObject(replobj);
8359    if (str2 == NULL) {
8360        Py_DECREF(self);
8361        Py_DECREF(str1);
8362        return NULL;
8363    }
8364    result = replace((PyUnicodeObject *)self,
8365                     (PyUnicodeObject *)str1,
8366                     (PyUnicodeObject *)str2,
8367                     maxcount);
8368    Py_DECREF(self);
8369    Py_DECREF(str1);
8370    Py_DECREF(str2);
8371    return result;
8372}
8373
8374PyDoc_STRVAR(replace__doc__,
8375             "S.replace(old, new[, count]) -> str\n\
8376\n\
8377Return a copy of S with all occurrences of substring\n\
8378old replaced by new.  If the optional argument count is\n\
8379given, only the first count occurrences are replaced.");
8380
8381static PyObject*
8382unicode_replace(PyUnicodeObject *self, PyObject *args)
8383{
8384    PyUnicodeObject *str1;
8385    PyUnicodeObject *str2;
8386    Py_ssize_t maxcount = -1;
8387    PyObject *result;
8388
8389    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
8390        return NULL;
8391    str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8392    if (str1 == NULL)
8393        return NULL;
8394    str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
8395    if (str2 == NULL) {
8396        Py_DECREF(str1);
8397        return NULL;
8398    }
8399
8400    result = replace(self, str1, str2, maxcount);
8401
8402    Py_DECREF(str1);
8403    Py_DECREF(str2);
8404    return result;
8405}
8406
8407static
8408PyObject *unicode_repr(PyObject *unicode)
8409{
8410    PyObject *repr;
8411    Py_UNICODE *p;
8412    Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8413    Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8414
8415    /* XXX(nnorwitz): rather than over-allocating, it would be
8416       better to choose a different scheme.  Perhaps scan the
8417       first N-chars of the string and allocate based on that size.
8418    */
8419    /* Initial allocation is based on the longest-possible unichr
8420       escape.
8421
8422       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8423       unichr, so in this case it's the longest unichr escape. In
8424       narrow (UTF-16) builds this is five chars per source unichr
8425       since there are two unichrs in the surrogate pair, so in narrow
8426       (UTF-16) builds it's not the longest unichr escape.
8427
8428       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8429       so in the narrow (UTF-16) build case it's the longest unichr
8430       escape.
8431    */
8432
8433    repr = PyUnicode_FromUnicode(NULL,
8434                                 2 /* quotes */
8435#ifdef Py_UNICODE_WIDE
8436                                 + 10*size
8437#else
8438                                 + 6*size
8439#endif
8440                                 + 1);
8441    if (repr == NULL)
8442        return NULL;
8443
8444    p = PyUnicode_AS_UNICODE(repr);
8445
8446    /* Add quote */
8447    *p++ = (findchar(s, size, '\'') &&
8448            !findchar(s, size, '"')) ? '"' : '\'';
8449    while (size-- > 0) {
8450        Py_UNICODE ch = *s++;
8451
8452        /* Escape quotes and backslashes */
8453        if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
8454            *p++ = '\\';
8455            *p++ = ch;
8456            continue;
8457        }
8458
8459        /* Map special whitespace to '\t', \n', '\r' */
8460        if (ch == '\t') {
8461            *p++ = '\\';
8462            *p++ = 't';
8463        }
8464        else if (ch == '\n') {
8465            *p++ = '\\';
8466            *p++ = 'n';
8467        }
8468        else if (ch == '\r') {
8469            *p++ = '\\';
8470            *p++ = 'r';
8471        }
8472
8473        /* Map non-printable US ASCII to '\xhh' */
8474        else if (ch < ' ' || ch == 0x7F) {
8475            *p++ = '\\';
8476            *p++ = 'x';
8477            *p++ = hexdigits[(ch >> 4) & 0x000F];
8478            *p++ = hexdigits[ch & 0x000F];
8479        }
8480
8481        /* Copy ASCII characters as-is */
8482        else if (ch < 0x7F) {
8483            *p++ = ch;
8484        }
8485
8486        /* Non-ASCII characters */
8487        else {
8488            Py_UCS4 ucs = ch;
8489
8490#ifndef Py_UNICODE_WIDE
8491            Py_UNICODE ch2 = 0;
8492            /* Get code point from surrogate pair */
8493            if (size > 0) {
8494                ch2 = *s;
8495                if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
8496                    && ch2 <= 0xDFFF) {
8497                    ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
8498                        + 0x00010000;
8499                    s++;
8500                    size--;
8501                }
8502            }
8503#endif
8504            /* Map Unicode whitespace and control characters
8505               (categories Z* and C* except ASCII space)
8506            */
8507            if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8508                /* Map 8-bit characters to '\xhh' */
8509                if (ucs <= 0xff) {
8510                    *p++ = '\\';
8511                    *p++ = 'x';
8512                    *p++ = hexdigits[(ch >> 4) & 0x000F];
8513                    *p++ = hexdigits[ch & 0x000F];
8514                }
8515                /* Map 21-bit characters to '\U00xxxxxx' */
8516                else if (ucs >= 0x10000) {
8517                    *p++ = '\\';
8518                    *p++ = 'U';
8519                    *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8520                    *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8521                    *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8522                    *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8523                    *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8524                    *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8525                    *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8526                    *p++ = hexdigits[ucs & 0x0000000F];
8527                }
8528                /* Map 16-bit characters to '\uxxxx' */
8529                else {
8530                    *p++ = '\\';
8531                    *p++ = 'u';
8532                    *p++ = hexdigits[(ucs >> 12) & 0x000F];
8533                    *p++ = hexdigits[(ucs >> 8) & 0x000F];
8534                    *p++ = hexdigits[(ucs >> 4) & 0x000F];
8535                    *p++ = hexdigits[ucs & 0x000F];
8536                }
8537            }
8538            /* Copy characters as-is */
8539            else {
8540                *p++ = ch;
8541#ifndef Py_UNICODE_WIDE
8542                if (ucs >= 0x10000)
8543                    *p++ = ch2;
8544#endif
8545            }
8546        }
8547    }
8548    /* Add quote */
8549    *p++ = PyUnicode_AS_UNICODE(repr)[0];
8550
8551    *p = '\0';
8552    PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
8553    return repr;
8554}
8555
8556PyDoc_STRVAR(rfind__doc__,
8557             "S.rfind(sub[, start[, end]]) -> int\n\
8558\n\
8559Return the highest index in S where substring sub is found,\n\
8560such that sub is contained within S[start:end].  Optional\n\
8561arguments start and end are interpreted as in slice notation.\n\
8562\n\
8563Return -1 on failure.");
8564
8565static PyObject *
8566unicode_rfind(PyUnicodeObject *self, PyObject *args)
8567{
8568    PyUnicodeObject *substring;
8569    Py_ssize_t start;
8570    Py_ssize_t end;
8571    Py_ssize_t result;
8572
8573    if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
8574                                            &start, &end))
8575        return NULL;
8576
8577    result = stringlib_rfind_slice(
8578        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8579        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8580        start, end
8581        );
8582
8583    Py_DECREF(substring);
8584
8585    return PyLong_FromSsize_t(result);
8586}
8587
8588PyDoc_STRVAR(rindex__doc__,
8589             "S.rindex(sub[, start[, end]]) -> int\n\
8590\n\
8591Like S.rfind() but raise ValueError when the substring is not found.");
8592
8593static PyObject *
8594unicode_rindex(PyUnicodeObject *self, PyObject *args)
8595{
8596    PyUnicodeObject *substring;
8597    Py_ssize_t start;
8598    Py_ssize_t end;
8599    Py_ssize_t result;
8600
8601    if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
8602                                            &start, &end))
8603        return NULL;
8604
8605    result = stringlib_rfind_slice(
8606        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8607        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8608        start, end
8609        );
8610
8611    Py_DECREF(substring);
8612
8613    if (result < 0) {
8614        PyErr_SetString(PyExc_ValueError, "substring not found");
8615        return NULL;
8616    }
8617    return PyLong_FromSsize_t(result);
8618}
8619
8620PyDoc_STRVAR(rjust__doc__,
8621             "S.rjust(width[, fillchar]) -> str\n\
8622\n\
8623Return S right-justified in a string of length width. Padding is\n\
8624done using the specified fill character (default is a space).");
8625
8626static PyObject *
8627unicode_rjust(PyUnicodeObject *self, PyObject *args)
8628{
8629    Py_ssize_t width;
8630    Py_UNICODE fillchar = ' ';
8631
8632    if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
8633        return NULL;
8634
8635    if (self->length >= width && PyUnicode_CheckExact(self)) {
8636        Py_INCREF(self);
8637        return (PyObject*) self;
8638    }
8639
8640    return (PyObject*) pad(self, width - self->length, 0, fillchar);
8641}
8642
8643PyObject *PyUnicode_Split(PyObject *s,
8644                          PyObject *sep,
8645                          Py_ssize_t maxsplit)
8646{
8647    PyObject *result;
8648
8649    s = PyUnicode_FromObject(s);
8650    if (s == NULL)
8651        return NULL;
8652    if (sep != NULL) {
8653        sep = PyUnicode_FromObject(sep);
8654        if (sep == NULL) {
8655            Py_DECREF(s);
8656            return NULL;
8657        }
8658    }
8659
8660    result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8661
8662    Py_DECREF(s);
8663    Py_XDECREF(sep);
8664    return result;
8665}
8666
8667PyDoc_STRVAR(split__doc__,
8668             "S.split([sep[, maxsplit]]) -> list of strings\n\
8669\n\
8670Return a list of the words in S, using sep as the\n\
8671delimiter string.  If maxsplit is given, at most maxsplit\n\
8672splits are done. If sep is not specified or is None, any\n\
8673whitespace string is a separator and empty strings are\n\
8674removed from the result.");
8675
8676static PyObject*
8677unicode_split(PyUnicodeObject *self, PyObject *args)
8678{
8679    PyObject *substring = Py_None;
8680    Py_ssize_t maxcount = -1;
8681
8682    if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
8683        return NULL;
8684
8685    if (substring == Py_None)
8686        return split(self, NULL, maxcount);
8687    else if (PyUnicode_Check(substring))
8688        return split(self, (PyUnicodeObject *)substring, maxcount);
8689    else
8690        return PyUnicode_Split((PyObject *)self, substring, maxcount);
8691}
8692
8693PyObject *
8694PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8695{
8696    PyObject* str_obj;
8697    PyObject* sep_obj;
8698    PyObject* out;
8699
8700    str_obj = PyUnicode_FromObject(str_in);
8701    if (!str_obj)
8702        return NULL;
8703    sep_obj = PyUnicode_FromObject(sep_in);
8704    if (!sep_obj) {
8705        Py_DECREF(str_obj);
8706        return NULL;
8707    }
8708
8709    out = stringlib_partition(
8710        str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8711        sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8712        );
8713
8714    Py_DECREF(sep_obj);
8715    Py_DECREF(str_obj);
8716
8717    return out;
8718}
8719
8720
8721PyObject *
8722PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8723{
8724    PyObject* str_obj;
8725    PyObject* sep_obj;
8726    PyObject* out;
8727
8728    str_obj = PyUnicode_FromObject(str_in);
8729    if (!str_obj)
8730        return NULL;
8731    sep_obj = PyUnicode_FromObject(sep_in);
8732    if (!sep_obj) {
8733        Py_DECREF(str_obj);
8734        return NULL;
8735    }
8736
8737    out = stringlib_rpartition(
8738        str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8739        sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8740        );
8741
8742    Py_DECREF(sep_obj);
8743    Py_DECREF(str_obj);
8744
8745    return out;
8746}
8747
8748PyDoc_STRVAR(partition__doc__,
8749             "S.partition(sep) -> (head, sep, tail)\n\
8750\n\
8751Search for the separator sep in S, and return the part before it,\n\
8752the separator itself, and the part after it.  If the separator is not\n\
8753found, return S and two empty strings.");
8754
8755static PyObject*
8756unicode_partition(PyUnicodeObject *self, PyObject *separator)
8757{
8758    return PyUnicode_Partition((PyObject *)self, separator);
8759}
8760
8761PyDoc_STRVAR(rpartition__doc__,
8762             "S.rpartition(sep) -> (head, sep, tail)\n\
8763\n\
8764Search for the separator sep in S, starting at the end of S, and return\n\
8765the part before it, the separator itself, and the part after it.  If the\n\
8766separator is not found, return two empty strings and S.");
8767
8768static PyObject*
8769unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8770{
8771    return PyUnicode_RPartition((PyObject *)self, separator);
8772}
8773
8774PyObject *PyUnicode_RSplit(PyObject *s,
8775                           PyObject *sep,
8776                           Py_ssize_t maxsplit)
8777{
8778    PyObject *result;
8779
8780    s = PyUnicode_FromObject(s);
8781    if (s == NULL)
8782        return NULL;
8783    if (sep != NULL) {
8784        sep = PyUnicode_FromObject(sep);
8785        if (sep == NULL) {
8786            Py_DECREF(s);
8787            return NULL;
8788        }
8789    }
8790
8791    result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8792
8793    Py_DECREF(s);
8794    Py_XDECREF(sep);
8795    return result;
8796}
8797
8798PyDoc_STRVAR(rsplit__doc__,
8799             "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
8800\n\
8801Return a list of the words in S, using sep as the\n\
8802delimiter string, starting at the end of the string and\n\
8803working to the front.  If maxsplit is given, at most maxsplit\n\
8804splits are done. If sep is not specified, any whitespace string\n\
8805is a separator.");
8806
8807static PyObject*
8808unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8809{
8810    PyObject *substring = Py_None;
8811    Py_ssize_t maxcount = -1;
8812
8813    if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
8814        return NULL;
8815
8816    if (substring == Py_None)
8817        return rsplit(self, NULL, maxcount);
8818    else if (PyUnicode_Check(substring))
8819        return rsplit(self, (PyUnicodeObject *)substring, maxcount);
8820    else
8821        return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
8822}
8823
8824PyDoc_STRVAR(splitlines__doc__,
8825             "S.splitlines([keepends]) -> list of strings\n\
8826\n\
8827Return a list of the lines in S, breaking at line boundaries.\n\
8828Line breaks are not included in the resulting list unless keepends\n\
8829is given and true.");
8830
8831static PyObject*
8832unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8833{
8834    int keepends = 0;
8835
8836    if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
8837        return NULL;
8838
8839    return PyUnicode_Splitlines((PyObject *)self, keepends);
8840}
8841
8842static
8843PyObject *unicode_str(PyObject *self)
8844{
8845    if (PyUnicode_CheckExact(self)) {
8846        Py_INCREF(self);
8847        return self;
8848    } else
8849        /* Subtype -- return genuine unicode string with the same value. */
8850        return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8851                                     PyUnicode_GET_SIZE(self));
8852}
8853
8854PyDoc_STRVAR(swapcase__doc__,
8855             "S.swapcase() -> str\n\
8856\n\
8857Return a copy of S with uppercase characters converted to lowercase\n\
8858and vice versa.");
8859
8860static PyObject*
8861unicode_swapcase(PyUnicodeObject *self)
8862{
8863    return fixup(self, fixswapcase);
8864}
8865
8866PyDoc_STRVAR(maketrans__doc__,
8867             "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
8868\n\
8869Return a translation table usable for str.translate().\n\
8870If there is only one argument, it must be a dictionary mapping Unicode\n\
8871ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
8872Character keys will be then converted to ordinals.\n\
8873If there are two arguments, they must be strings of equal length, and\n\
8874in the resulting dictionary, each character in x will be mapped to the\n\
8875character at the same position in y. If there is a third argument, it\n\
8876must be a string, whose characters will be mapped to None in the result.");
8877
8878static PyObject*
8879unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8880{
8881    PyObject *x, *y = NULL, *z = NULL;
8882    PyObject *new = NULL, *key, *value;
8883    Py_ssize_t i = 0;
8884    int res;
8885
8886    if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8887        return NULL;
8888    new = PyDict_New();
8889    if (!new)
8890        return NULL;
8891    if (y != NULL) {
8892        /* x must be a string too, of equal length */
8893        Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8894        if (!PyUnicode_Check(x)) {
8895            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8896                            "be a string if there is a second argument");
8897            goto err;
8898        }
8899        if (PyUnicode_GET_SIZE(x) != ylen) {
8900            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8901                            "arguments must have equal length");
8902            goto err;
8903        }
8904        /* create entries for translating chars in x to those in y */
8905        for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
8906            key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8907            if (!key)
8908                goto err;
8909            value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
8910            if (!value) {
8911                Py_DECREF(key);
8912                goto err;
8913            }
8914            res = PyDict_SetItem(new, key, value);
8915            Py_DECREF(key);
8916            Py_DECREF(value);
8917            if (res < 0)
8918                goto err;
8919        }
8920        /* create entries for deleting chars in z */
8921        if (z != NULL) {
8922            for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
8923                key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
8924                if (!key)
8925                    goto err;
8926                res = PyDict_SetItem(new, key, Py_None);
8927                Py_DECREF(key);
8928                if (res < 0)
8929                    goto err;
8930            }
8931        }
8932    } else {
8933        /* x must be a dict */
8934        if (!PyDict_CheckExact(x)) {
8935            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8936                            "to maketrans it must be a dict");
8937            goto err;
8938        }
8939        /* copy entries into the new dict, converting string keys to int keys */
8940        while (PyDict_Next(x, &i, &key, &value)) {
8941            if (PyUnicode_Check(key)) {
8942                /* convert string keys to integer keys */
8943                PyObject *newkey;
8944                if (PyUnicode_GET_SIZE(key) != 1) {
8945                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
8946                                    "table must be of length 1");
8947                    goto err;
8948                }
8949                newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
8950                if (!newkey)
8951                    goto err;
8952                res = PyDict_SetItem(new, newkey, value);
8953                Py_DECREF(newkey);
8954                if (res < 0)
8955                    goto err;
8956            } else if (PyLong_Check(key)) {
8957                /* just keep integer keys */
8958                if (PyDict_SetItem(new, key, value) < 0)
8959                    goto err;
8960            } else {
8961                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8962                                "be strings or integers");
8963                goto err;
8964            }
8965        }
8966    }
8967    return new;
8968  err:
8969    Py_DECREF(new);
8970    return NULL;
8971}
8972
8973PyDoc_STRVAR(translate__doc__,
8974             "S.translate(table) -> str\n\
8975\n\
8976Return a copy of the string S, where all characters have been mapped\n\
8977through the given translation table, which must be a mapping of\n\
8978Unicode ordinals to Unicode ordinals, strings, or None.\n\
8979Unmapped characters are left untouched. Characters mapped to None\n\
8980are deleted.");
8981
8982static PyObject*
8983unicode_translate(PyUnicodeObject *self, PyObject *table)
8984{
8985    return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
8986}
8987
8988PyDoc_STRVAR(upper__doc__,
8989             "S.upper() -> str\n\
8990\n\
8991Return a copy of S converted to uppercase.");
8992
8993static PyObject*
8994unicode_upper(PyUnicodeObject *self)
8995{
8996    return fixup(self, fixupper);
8997}
8998
8999PyDoc_STRVAR(zfill__doc__,
9000             "S.zfill(width) -> str\n\
9001\n\
9002Pad a numeric string S with zeros on the left, to fill a field\n\
9003of the specified width. The string S is never truncated.");
9004
9005static PyObject *
9006unicode_zfill(PyUnicodeObject *self, PyObject *args)
9007{
9008    Py_ssize_t fill;
9009    PyUnicodeObject *u;
9010
9011    Py_ssize_t width;
9012    if (!PyArg_ParseTuple(args, "n:zfill", &width))
9013        return NULL;
9014
9015    if (self->length >= width) {
9016        if (PyUnicode_CheckExact(self)) {
9017            Py_INCREF(self);
9018            return (PyObject*) self;
9019        }
9020        else
9021            return PyUnicode_FromUnicode(
9022                PyUnicode_AS_UNICODE(self),
9023                PyUnicode_GET_SIZE(self)
9024                );
9025    }
9026
9027    fill = width - self->length;
9028
9029    u = pad(self, fill, 0, '0');
9030
9031    if (u == NULL)
9032        return NULL;
9033
9034    if (u->str[fill] == '+' || u->str[fill] == '-') {
9035        /* move sign to beginning of string */
9036        u->str[0] = u->str[fill];
9037        u->str[fill] = '0';
9038    }
9039
9040    return (PyObject*) u;
9041}
9042
/* The two helpers below are compiled out by the `#if 0` guard. */
#if 0
/* Return the current value of `numfree` as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyLong_FromLong(numfree);
}

/* Return the result of PyUnicode_TransformDecimalToASCII() applied to the
   character buffer and size of `self`. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalToASCII(PyUnicode_AS_UNICODE(self),
                                             PyUnicode_GET_SIZE(self));
}
#endif
9057
9058PyDoc_STRVAR(startswith__doc__,
9059             "S.startswith(prefix[, start[, end]]) -> bool\n\
9060\n\
9061Return True if S starts with the specified prefix, False otherwise.\n\
9062With optional start, test S beginning at that position.\n\
9063With optional end, stop comparing S at that position.\n\
9064prefix can also be a tuple of strings to try.");
9065
9066static PyObject *
9067unicode_startswith(PyUnicodeObject *self,
9068                   PyObject *args)
9069{
9070    PyObject *subobj;
9071    PyUnicodeObject *substring;
9072    Py_ssize_t start = 0;
9073    Py_ssize_t end = PY_SSIZE_T_MAX;
9074    int result;
9075
9076    if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
9077        return NULL;
9078    if (PyTuple_Check(subobj)) {
9079        Py_ssize_t i;
9080        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9081            substring = (PyUnicodeObject *)PyUnicode_FromObject(
9082                PyTuple_GET_ITEM(subobj, i));
9083            if (substring == NULL)
9084                return NULL;
9085            result = tailmatch(self, substring, start, end, -1);
9086            Py_DECREF(substring);
9087            if (result) {
9088                Py_RETURN_TRUE;
9089            }
9090        }
9091        /* nothing matched */
9092        Py_RETURN_FALSE;
9093    }
9094    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
9095    if (substring == NULL) {
9096        if (PyErr_ExceptionMatches(PyExc_TypeError))
9097            PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
9098                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
9099        return NULL;
9100    }
9101    result = tailmatch(self, substring, start, end, -1);
9102    Py_DECREF(substring);
9103    return PyBool_FromLong(result);
9104}
9105
9106
9107PyDoc_STRVAR(endswith__doc__,
9108             "S.endswith(suffix[, start[, end]]) -> bool\n\
9109\n\
9110Return True if S ends with the specified suffix, False otherwise.\n\
9111With optional start, test S beginning at that position.\n\
9112With optional end, stop comparing S at that position.\n\
9113suffix can also be a tuple of strings to try.");
9114
9115static PyObject *
9116unicode_endswith(PyUnicodeObject *self,
9117                 PyObject *args)
9118{
9119    PyObject *subobj;
9120    PyUnicodeObject *substring;
9121    Py_ssize_t start = 0;
9122    Py_ssize_t end = PY_SSIZE_T_MAX;
9123    int result;
9124
9125    if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
9126        return NULL;
9127    if (PyTuple_Check(subobj)) {
9128        Py_ssize_t i;
9129        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9130            substring = (PyUnicodeObject *)PyUnicode_FromObject(
9131                PyTuple_GET_ITEM(subobj, i));
9132            if (substring == NULL)
9133                return NULL;
9134            result = tailmatch(self, substring, start, end, +1);
9135            Py_DECREF(substring);
9136            if (result) {
9137                Py_RETURN_TRUE;
9138            }
9139        }
9140        Py_RETURN_FALSE;
9141    }
9142    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
9143    if (substring == NULL) {
9144        if (PyErr_ExceptionMatches(PyExc_TypeError))
9145            PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
9146                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
9147        return NULL;
9148    }
9149    result = tailmatch(self, substring, start, end, +1);
9150    Py_DECREF(substring);
9151    return PyBool_FromLong(result);
9152}
9153
9154#include "stringlib/string_format.h"
9155
9156PyDoc_STRVAR(format__doc__,
9157             "S.format(*args, **kwargs) -> str\n\
9158\n\
9159Return a formatted version of S, using substitutions from args and kwargs.\n\
9160The substitutions are identified by braces ('{' and '}').");
9161
9162PyDoc_STRVAR(format_map__doc__,
9163             "S.format_map(mapping) -> str\n\
9164\n\
9165Return a formatted version of S, using substitutions from mapping.\n\
9166The substitutions are identified by braces ('{' and '}').");
9167
9168static PyObject *
9169unicode__format__(PyObject* self, PyObject* args)
9170{
9171    PyObject *format_spec;
9172
9173    if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
9174        return NULL;
9175
9176    return _PyUnicode_FormatAdvanced(self,
9177                                     PyUnicode_AS_UNICODE(format_spec),
9178                                     PyUnicode_GET_SIZE(format_spec));
9179}
9180
9181PyDoc_STRVAR(p_format__doc__,
9182             "S.__format__(format_spec) -> str\n\
9183\n\
9184Return a formatted version of S as described by format_spec.");
9185
9186static PyObject *
9187unicode__sizeof__(PyUnicodeObject *v)
9188{
9189    return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
9190                              sizeof(Py_UNICODE) * (v->length + 1));
9191}
9192
9193PyDoc_STRVAR(sizeof__doc__,
9194             "S.__sizeof__() -> size of S in memory, in bytes");
9195
9196static PyObject *
9197unicode_getnewargs(PyUnicodeObject *v)
9198{
9199    return Py_BuildValue("(u#)", v->str, v->length);
9200}
9201
9202static PyMethodDef unicode_methods[] = {
9203    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
9204    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
9205    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
9206    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
9207    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
9208    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
9209    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
9210    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
9211    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
9212    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
9213    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
9214    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
9215    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
9216    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
9217    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
9218    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
9219    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
9220    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
9221    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
9222    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
9223    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
9224    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
9225    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
9226    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
9227    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
9228    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
9229    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
9230    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
9231    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
9232    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
9233    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
9234    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
9235    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
9236    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
9237    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
9238    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
9239    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
9240    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
9241    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
9242    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
9243    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
9244    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
9245    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
9246    {"maketrans", (PyCFunction) unicode_maketrans,
9247     METH_VARARGS | METH_STATIC, maketrans__doc__},
9248    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
9249#if 0
9250    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
9251#endif
9252
9253#if 0
9254    /* These methods are just used for debugging the implementation. */
9255    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
9256    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
9257#endif
9258
9259    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
9260    {NULL, NULL}
9261};
9262
9263static PyObject *
9264unicode_mod(PyObject *v, PyObject *w)
9265{
9266    if (!PyUnicode_Check(v)) {
9267        Py_INCREF(Py_NotImplemented);
9268        return Py_NotImplemented;
9269    }
9270    return PyUnicode_Format(v, w);
9271}
9272
9273static PyNumberMethods unicode_as_number = {
9274    0,              /*nb_add*/
9275    0,              /*nb_subtract*/
9276    0,              /*nb_multiply*/
9277    unicode_mod,            /*nb_remainder*/
9278};
9279
9280static PySequenceMethods unicode_as_sequence = {
9281    (lenfunc) unicode_length,       /* sq_length */
9282    PyUnicode_Concat,           /* sq_concat */
9283    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
9284    (ssizeargfunc) unicode_getitem,     /* sq_item */
9285    0,                  /* sq_slice */
9286    0,                  /* sq_ass_item */
9287    0,                  /* sq_ass_slice */
9288    PyUnicode_Contains,         /* sq_contains */
9289};
9290
9291static PyObject*
9292unicode_subscript(PyUnicodeObject* self, PyObject* item)
9293{
9294    if (PyIndex_Check(item)) {
9295        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
9296        if (i == -1 && PyErr_Occurred())
9297            return NULL;
9298        if (i < 0)
9299            i += PyUnicode_GET_SIZE(self);
9300        return unicode_getitem(self, i);
9301    } else if (PySlice_Check(item)) {
9302        Py_ssize_t start, stop, step, slicelength, cur, i;
9303        Py_UNICODE* source_buf;
9304        Py_UNICODE* result_buf;
9305        PyObject* result;
9306
9307        if (PySlice_GetIndicesEx(item, PyUnicode_GET_SIZE(self),
9308                                 &start, &stop, &step, &slicelength) < 0) {
9309            return NULL;
9310        }
9311
9312        if (slicelength <= 0) {
9313            return PyUnicode_FromUnicode(NULL, 0);
9314        } else if (start == 0 && step == 1 && slicelength == self->length &&
9315                   PyUnicode_CheckExact(self)) {
9316            Py_INCREF(self);
9317            return (PyObject *)self;
9318        } else if (step == 1) {
9319            return PyUnicode_FromUnicode(self->str + start, slicelength);
9320        } else {
9321            source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
9322            result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
9323                                                       sizeof(Py_UNICODE));
9324
9325            if (result_buf == NULL)
9326                return PyErr_NoMemory();
9327
9328            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
9329                result_buf[i] = source_buf[cur];
9330            }
9331
9332            result = PyUnicode_FromUnicode(result_buf, slicelength);
9333            PyObject_FREE(result_buf);
9334            return result;
9335        }
9336    } else {
9337        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
9338        return NULL;
9339    }
9340}
9341
9342static PyMappingMethods unicode_as_mapping = {
9343    (lenfunc)unicode_length,        /* mp_length */
9344    (binaryfunc)unicode_subscript,  /* mp_subscript */
9345    (objobjargproc)0,           /* mp_ass_subscript */
9346};
9347
9348
9349/* Helpers for PyUnicode_Format() */
9350
9351static PyObject *
9352getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
9353{
9354    Py_ssize_t argidx = *p_argidx;
9355    if (argidx < arglen) {
9356        (*p_argidx)++;
9357        if (arglen < 0)
9358            return args;
9359        else
9360            return PyTuple_GetItem(args, argidx);
9361    }
9362    PyErr_SetString(PyExc_TypeError,
9363                    "not enough arguments for format string");
9364    return NULL;
9365}
9366
9367/* Returns a new reference to a PyUnicode object, or NULL on failure. */
9368
9369static PyObject *
9370formatfloat(PyObject *v, int flags, int prec, int type)
9371{
9372    char *p;
9373    PyObject *result;
9374    double x;
9375
9376    x = PyFloat_AsDouble(v);
9377    if (x == -1.0 && PyErr_Occurred())
9378        return NULL;
9379
9380    if (prec < 0)
9381        prec = 6;
9382
9383    p = PyOS_double_to_string(x, type, prec,
9384                              (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
9385    if (p == NULL)
9386        return NULL;
9387    result = PyUnicode_FromStringAndSize(p, strlen(p));
9388    PyMem_Free(p);
9389    return result;
9390}
9391
9392static PyObject*
9393formatlong(PyObject *val, int flags, int prec, int type)
9394{
9395    char *buf;
9396    int len;
9397    PyObject *str; /* temporary string object. */
9398    PyObject *result;
9399
9400    str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9401    if (!str)
9402        return NULL;
9403    result = PyUnicode_FromStringAndSize(buf, len);
9404    Py_DECREF(str);
9405    return result;
9406}
9407
9408static int
9409formatchar(Py_UNICODE *buf,
9410           size_t buflen,
9411           PyObject *v)
9412{
9413    /* presume that the buffer is at least 3 characters long */
9414    if (PyUnicode_Check(v)) {
9415        if (PyUnicode_GET_SIZE(v) == 1) {
9416            buf[0] = PyUnicode_AS_UNICODE(v)[0];
9417            buf[1] = '\0';
9418            return 1;
9419        }
9420#ifndef Py_UNICODE_WIDE
9421        if (PyUnicode_GET_SIZE(v) == 2) {
9422            /* Decode a valid surrogate pair */
9423            int c0 = PyUnicode_AS_UNICODE(v)[0];
9424            int c1 = PyUnicode_AS_UNICODE(v)[1];
9425            if (0xD800 <= c0 && c0 <= 0xDBFF &&
9426                0xDC00 <= c1 && c1 <= 0xDFFF) {
9427                buf[0] = c0;
9428                buf[1] = c1;
9429                buf[2] = '\0';
9430                return 2;
9431            }
9432        }
9433#endif
9434        goto onError;
9435    }
9436    else {
9437        /* Integer input truncated to a character */
9438        long x;
9439        x = PyLong_AsLong(v);
9440        if (x == -1 && PyErr_Occurred())
9441            goto onError;
9442
9443        if (x < 0 || x > 0x10ffff) {
9444            PyErr_SetString(PyExc_OverflowError,
9445                            "%c arg not in range(0x110000)");
9446            return -1;
9447        }
9448
9449#ifndef Py_UNICODE_WIDE
9450        if (x > 0xffff) {
9451            x -= 0x10000;
9452            buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9453            buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9454            return 2;
9455        }
9456#endif
9457        buf[0] = (Py_UNICODE) x;
9458        buf[1] = '\0';
9459        return 1;
9460    }
9461
9462  onError:
9463    PyErr_SetString(PyExc_TypeError,
9464                    "%c requires int or char");
9465    return -1;
9466}
9467
9468/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
9469   FORMATBUFLEN is the length of the buffer in which chars are formatted.
9470*/
9471#define FORMATBUFLEN (size_t)10
9472
9473PyObject *PyUnicode_Format(PyObject *format,
9474                           PyObject *args)
9475{
9476    Py_UNICODE *fmt, *res;
9477    Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
9478    int args_owned = 0;
9479    PyUnicodeObject *result = NULL;
9480    PyObject *dict = NULL;
9481    PyObject *uformat;
9482
9483    if (format == NULL || args == NULL) {
9484        PyErr_BadInternalCall();
9485        return NULL;
9486    }
9487    uformat = PyUnicode_FromObject(format);
9488    if (uformat == NULL)
9489        return NULL;
9490    fmt = PyUnicode_AS_UNICODE(uformat);
9491    fmtcnt = PyUnicode_GET_SIZE(uformat);
9492
9493    reslen = rescnt = fmtcnt + 100;
9494    result = _PyUnicode_New(reslen);
9495    if (result == NULL)
9496        goto onError;
9497    res = PyUnicode_AS_UNICODE(result);
9498
9499    if (PyTuple_Check(args)) {
9500        arglen = PyTuple_Size(args);
9501        argidx = 0;
9502    }
9503    else {
9504        arglen = -1;
9505        argidx = -2;
9506    }
9507    if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
9508        dict = args;
9509
9510    while (--fmtcnt >= 0) {
9511        if (*fmt != '%') {
9512            if (--rescnt < 0) {
9513                rescnt = fmtcnt + 100;
9514                reslen += rescnt;
9515                if (_PyUnicode_Resize(&result, reslen) < 0)
9516                    goto onError;
9517                res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9518                --rescnt;
9519            }
9520            *res++ = *fmt++;
9521        }
9522        else {
9523            /* Got a format specifier */
9524            int flags = 0;
9525            Py_ssize_t width = -1;
9526            int prec = -1;
9527            Py_UNICODE c = '\0';
9528            Py_UNICODE fill;
9529            int isnumok;
9530            PyObject *v = NULL;
9531            PyObject *temp = NULL;
9532            Py_UNICODE *pbuf;
9533            Py_UNICODE sign;
9534            Py_ssize_t len;
9535            Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
9536
9537            fmt++;
9538            if (*fmt == '(') {
9539                Py_UNICODE *keystart;
9540                Py_ssize_t keylen;
9541                PyObject *key;
9542                int pcount = 1;
9543
9544                if (dict == NULL) {
9545                    PyErr_SetString(PyExc_TypeError,
9546                                    "format requires a mapping");
9547                    goto onError;
9548                }
9549                ++fmt;
9550                --fmtcnt;
9551                keystart = fmt;
9552                /* Skip over balanced parentheses */
9553                while (pcount > 0 && --fmtcnt >= 0) {
9554                    if (*fmt == ')')
9555                        --pcount;
9556                    else if (*fmt == '(')
9557                        ++pcount;
9558                    fmt++;
9559                }
9560                keylen = fmt - keystart - 1;
9561                if (fmtcnt < 0 || pcount > 0) {
9562                    PyErr_SetString(PyExc_ValueError,
9563                                    "incomplete format key");
9564                    goto onError;
9565                }
9566#if 0
9567                /* keys are converted to strings using UTF-8 and
9568                   then looked up since Python uses strings to hold
9569                   variables names etc. in its namespaces and we
9570                   wouldn't want to break common idioms. */
9571                key = PyUnicode_EncodeUTF8(keystart,
9572                                           keylen,
9573                                           NULL);
9574#else
9575                key = PyUnicode_FromUnicode(keystart, keylen);
9576#endif
9577                if (key == NULL)
9578                    goto onError;
9579                if (args_owned) {
9580                    Py_DECREF(args);
9581                    args_owned = 0;
9582                }
9583                args = PyObject_GetItem(dict, key);
9584                Py_DECREF(key);
9585                if (args == NULL) {
9586                    goto onError;
9587                }
9588                args_owned = 1;
9589                arglen = -1;
9590                argidx = -2;
9591            }
9592            while (--fmtcnt >= 0) {
9593                switch (c = *fmt++) {
9594                case '-': flags |= F_LJUST; continue;
9595                case '+': flags |= F_SIGN; continue;
9596                case ' ': flags |= F_BLANK; continue;
9597                case '#': flags |= F_ALT; continue;
9598                case '0': flags |= F_ZERO; continue;
9599                }
9600                break;
9601            }
9602            if (c == '*') {
9603                v = getnextarg(args, arglen, &argidx);
9604                if (v == NULL)
9605                    goto onError;
9606                if (!PyLong_Check(v)) {
9607                    PyErr_SetString(PyExc_TypeError,
9608                                    "* wants int");
9609                    goto onError;
9610                }
9611                width = PyLong_AsLong(v);
9612                if (width == -1 && PyErr_Occurred())
9613                    goto onError;
9614                if (width < 0) {
9615                    flags |= F_LJUST;
9616                    width = -width;
9617                }
9618                if (--fmtcnt >= 0)
9619                    c = *fmt++;
9620            }
9621            else if (c >= '0' && c <= '9') {
9622                width = c - '0';
9623                while (--fmtcnt >= 0) {
9624                    c = *fmt++;
9625                    if (c < '0' || c > '9')
9626                        break;
9627                    if ((width*10) / 10 != width) {
9628                        PyErr_SetString(PyExc_ValueError,
9629                                        "width too big");
9630                        goto onError;
9631                    }
9632                    width = width*10 + (c - '0');
9633                }
9634            }
9635            if (c == '.') {
9636                prec = 0;
9637                if (--fmtcnt >= 0)
9638                    c = *fmt++;
9639                if (c == '*') {
9640                    v = getnextarg(args, arglen, &argidx);
9641                    if (v == NULL)
9642                        goto onError;
9643                    if (!PyLong_Check(v)) {
9644                        PyErr_SetString(PyExc_TypeError,
9645                                        "* wants int");
9646                        goto onError;
9647                    }
9648                    prec = PyLong_AsLong(v);
9649                    if (prec == -1 && PyErr_Occurred())
9650                        goto onError;
9651                    if (prec < 0)
9652                        prec = 0;
9653                    if (--fmtcnt >= 0)
9654                        c = *fmt++;
9655                }
9656                else if (c >= '0' && c <= '9') {
9657                    prec = c - '0';
9658                    while (--fmtcnt >= 0) {
9659                        c = *fmt++;
9660                        if (c < '0' || c > '9')
9661                            break;
9662                        if ((prec*10) / 10 != prec) {
9663                            PyErr_SetString(PyExc_ValueError,
9664                                            "prec too big");
9665                            goto onError;
9666                        }
9667                        prec = prec*10 + (c - '0');
9668                    }
9669                }
9670            } /* prec */
9671            if (fmtcnt >= 0) {
9672                if (c == 'h' || c == 'l' || c == 'L') {
9673                    if (--fmtcnt >= 0)
9674                        c = *fmt++;
9675                }
9676            }
9677            if (fmtcnt < 0) {
9678                PyErr_SetString(PyExc_ValueError,
9679                                "incomplete format");
9680                goto onError;
9681            }
9682            if (c != '%') {
9683                v = getnextarg(args, arglen, &argidx);
9684                if (v == NULL)
9685                    goto onError;
9686            }
9687            sign = 0;
9688            fill = ' ';
9689            switch (c) {
9690
9691            case '%':
9692                pbuf = formatbuf;
9693                /* presume that buffer length is at least 1 */
9694                pbuf[0] = '%';
9695                len = 1;
9696                break;
9697
9698            case 's':
9699            case 'r':
9700            case 'a':
9701                if (PyUnicode_CheckExact(v) && c == 's') {
9702                    temp = v;
9703                    Py_INCREF(temp);
9704                }
9705                else {
9706                    if (c == 's')
9707                        temp = PyObject_Str(v);
9708                    else if (c == 'r')
9709                        temp = PyObject_Repr(v);
9710                    else
9711                        temp = PyObject_ASCII(v);
9712                    if (temp == NULL)
9713                        goto onError;
9714                    if (PyUnicode_Check(temp))
9715                        /* nothing to do */;
9716                    else {
9717                        Py_DECREF(temp);
9718                        PyErr_SetString(PyExc_TypeError,
9719                                        "%s argument has non-string str()");
9720                        goto onError;
9721                    }
9722                }
9723                pbuf = PyUnicode_AS_UNICODE(temp);
9724                len = PyUnicode_GET_SIZE(temp);
9725                if (prec >= 0 && len > prec)
9726                    len = prec;
9727                break;
9728
9729            case 'i':
9730            case 'd':
9731            case 'u':
9732            case 'o':
9733            case 'x':
9734            case 'X':
9735                isnumok = 0;
9736                if (PyNumber_Check(v)) {
9737                    PyObject *iobj=NULL;
9738
9739                    if (PyLong_Check(v)) {
9740                        iobj = v;
9741                        Py_INCREF(iobj);
9742                    }
9743                    else {
9744                        iobj = PyNumber_Long(v);
9745                    }
9746                    if (iobj!=NULL) {
9747                        if (PyLong_Check(iobj)) {
9748                            isnumok = 1;
9749                            temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
9750                            Py_DECREF(iobj);
9751                            if (!temp)
9752                                goto onError;
9753                            pbuf = PyUnicode_AS_UNICODE(temp);
9754                            len = PyUnicode_GET_SIZE(temp);
9755                            sign = 1;
9756                        }
9757                        else {
9758                            Py_DECREF(iobj);
9759                        }
9760                    }
9761                }
9762                if (!isnumok) {
9763                    PyErr_Format(PyExc_TypeError,
9764                                 "%%%c format: a number is required, "
9765                                 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9766                    goto onError;
9767                }
9768                if (flags & F_ZERO)
9769                    fill = '0';
9770                break;
9771
9772            case 'e':
9773            case 'E':
9774            case 'f':
9775            case 'F':
9776            case 'g':
9777            case 'G':
9778                temp = formatfloat(v, flags, prec, c);
9779                if (!temp)
9780                    goto onError;
9781                pbuf = PyUnicode_AS_UNICODE(temp);
9782                len = PyUnicode_GET_SIZE(temp);
9783                sign = 1;
9784                if (flags & F_ZERO)
9785                    fill = '0';
9786                break;
9787
9788            case 'c':
9789                pbuf = formatbuf;
9790                len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9791                if (len < 0)
9792                    goto onError;
9793                break;
9794
9795            default:
9796                PyErr_Format(PyExc_ValueError,
9797                             "unsupported format character '%c' (0x%x) "
9798                             "at index %zd",
9799                             (31<=c && c<=126) ? (char)c : '?',
9800                             (int)c,
9801                             (Py_ssize_t)(fmt - 1 -
9802                                          PyUnicode_AS_UNICODE(uformat)));
9803                goto onError;
9804            }
9805            if (sign) {
9806                if (*pbuf == '-' || *pbuf == '+') {
9807                    sign = *pbuf++;
9808                    len--;
9809                }
9810                else if (flags & F_SIGN)
9811                    sign = '+';
9812                else if (flags & F_BLANK)
9813                    sign = ' ';
9814                else
9815                    sign = 0;
9816            }
9817            if (width < len)
9818                width = len;
9819            if (rescnt - (sign != 0) < width) {
9820                reslen -= rescnt;
9821                rescnt = width + fmtcnt + 100;
9822                reslen += rescnt;
9823                if (reslen < 0) {
9824                    Py_XDECREF(temp);
9825                    PyErr_NoMemory();
9826                    goto onError;
9827                }
9828                if (_PyUnicode_Resize(&result, reslen) < 0) {
9829                    Py_XDECREF(temp);
9830                    goto onError;
9831                }
9832                res = PyUnicode_AS_UNICODE(result)
9833                    + reslen - rescnt;
9834            }
9835            if (sign) {
9836                if (fill != ' ')
9837                    *res++ = sign;
9838                rescnt--;
9839                if (width > len)
9840                    width--;
9841            }
9842            if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9843                assert(pbuf[0] == '0');
9844                assert(pbuf[1] == c);
9845                if (fill != ' ') {
9846                    *res++ = *pbuf++;
9847                    *res++ = *pbuf++;
9848                }
9849                rescnt -= 2;
9850                width -= 2;
9851                if (width < 0)
9852                    width = 0;
9853                len -= 2;
9854            }
9855            if (width > len && !(flags & F_LJUST)) {
9856                do {
9857                    --rescnt;
9858                    *res++ = fill;
9859                } while (--width > len);
9860            }
9861            if (fill == ' ') {
9862                if (sign)
9863                    *res++ = sign;
9864                if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9865                    assert(pbuf[0] == '0');
9866                    assert(pbuf[1] == c);
9867                    *res++ = *pbuf++;
9868                    *res++ = *pbuf++;
9869                }
9870            }
9871            Py_UNICODE_COPY(res, pbuf, len);
9872            res += len;
9873            rescnt -= len;
9874            while (--width >= len) {
9875                --rescnt;
9876                *res++ = ' ';
9877            }
9878            if (dict && (argidx < arglen) && c != '%') {
9879                PyErr_SetString(PyExc_TypeError,
9880                                "not all arguments converted during string formatting");
9881                Py_XDECREF(temp);
9882                goto onError;
9883            }
9884            Py_XDECREF(temp);
9885        } /* '%' */
9886    } /* until end */
9887    if (argidx < arglen && !dict) {
9888        PyErr_SetString(PyExc_TypeError,
9889                        "not all arguments converted during string formatting");
9890        goto onError;
9891    }
9892
9893    if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
9894        goto onError;
9895    if (args_owned) {
9896        Py_DECREF(args);
9897    }
9898    Py_DECREF(uformat);
9899    return (PyObject *)result;
9900
9901  onError:
9902    Py_XDECREF(result);
9903    Py_DECREF(uformat);
9904    if (args_owned) {
9905        Py_DECREF(args);
9906    }
9907    return NULL;
9908}
9909
9910static PyObject *
9911unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9912
9913static PyObject *
9914unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9915{
9916    PyObject *x = NULL;
9917    static char *kwlist[] = {"object", "encoding", "errors", 0};
9918    char *encoding = NULL;
9919    char *errors = NULL;
9920
9921    if (type != &PyUnicode_Type)
9922        return unicode_subtype_new(type, args, kwds);
9923    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
9924                                     kwlist, &x, &encoding, &errors))
9925        return NULL;
9926    if (x == NULL)
9927        return (PyObject *)_PyUnicode_New(0);
9928    if (encoding == NULL && errors == NULL)
9929        return PyObject_Str(x);
9930    else
9931        return PyUnicode_FromEncodedObject(x, encoding, errors);
9932}
9933
9934static PyObject *
9935unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9936{
9937    PyUnicodeObject *tmp, *pnew;
9938    Py_ssize_t n;
9939
9940    assert(PyType_IsSubtype(type, &PyUnicode_Type));
9941    tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9942    if (tmp == NULL)
9943        return NULL;
9944    assert(PyUnicode_Check(tmp));
9945    pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9946    if (pnew == NULL) {
9947        Py_DECREF(tmp);
9948        return NULL;
9949    }
9950    pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9951    if (pnew->str == NULL) {
9952        _Py_ForgetReference((PyObject *)pnew);
9953        PyObject_Del(pnew);
9954        Py_DECREF(tmp);
9955        return PyErr_NoMemory();
9956    }
9957    Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9958    pnew->length = n;
9959    pnew->hash = tmp->hash;
9960    Py_DECREF(tmp);
9961    return (PyObject *)pnew;
9962}
9963
9964PyDoc_STRVAR(unicode_doc,
9965             "str(object[, encoding[, errors]]) -> str\n\
9966\n\
9967Create a new string object from the given object. If encoding or\n\
9968errors is specified, then the object must expose a data buffer\n\
9969that will be decoded using the given encoding and error handler.\n\
9970Otherwise, returns the result of object.__str__() (if defined)\n\
9971or repr(object).\n\
9972encoding defaults to sys.getdefaultencoding().\n\
9973errors defaults to 'strict'.");
9974
9975static PyObject *unicode_iter(PyObject *seq);
9976
9977PyTypeObject PyUnicode_Type = {
9978    PyVarObject_HEAD_INIT(&PyType_Type, 0)
9979    "str",              /* tp_name */
9980    sizeof(PyUnicodeObject),        /* tp_size */
9981    0,                  /* tp_itemsize */
9982    /* Slots */
9983    (destructor)unicode_dealloc,    /* tp_dealloc */
9984    0,                  /* tp_print */
9985    0,                  /* tp_getattr */
9986    0,                  /* tp_setattr */
9987    0,                  /* tp_reserved */
9988    unicode_repr,           /* tp_repr */
9989    &unicode_as_number,         /* tp_as_number */
9990    &unicode_as_sequence,       /* tp_as_sequence */
9991    &unicode_as_mapping,        /* tp_as_mapping */
9992    (hashfunc) unicode_hash,        /* tp_hash*/
9993    0,                  /* tp_call*/
9994    (reprfunc) unicode_str,     /* tp_str */
9995    PyObject_GenericGetAttr,        /* tp_getattro */
9996    0,                  /* tp_setattro */
9997    0,                  /* tp_as_buffer */
9998    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
9999    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
10000    unicode_doc,            /* tp_doc */
10001    0,                  /* tp_traverse */
10002    0,                  /* tp_clear */
10003    PyUnicode_RichCompare,      /* tp_richcompare */
10004    0,                  /* tp_weaklistoffset */
10005    unicode_iter,           /* tp_iter */
10006    0,                  /* tp_iternext */
10007    unicode_methods,            /* tp_methods */
10008    0,                  /* tp_members */
10009    0,                  /* tp_getset */
10010    &PyBaseObject_Type,         /* tp_base */
10011    0,                  /* tp_dict */
10012    0,                  /* tp_descr_get */
10013    0,                  /* tp_descr_set */
10014    0,                  /* tp_dictoffset */
10015    0,                  /* tp_init */
10016    0,                  /* tp_alloc */
10017    unicode_new,            /* tp_new */
10018    PyObject_Del,           /* tp_free */
10019};
10020
10021/* Initialize the Unicode implementation */
10022
10023void _PyUnicode_Init(void)
10024{
10025    int i;
10026
10027    /* XXX - move this array to unicodectype.c ? */
10028    Py_UNICODE linebreak[] = {
10029        0x000A, /* LINE FEED */
10030        0x000D, /* CARRIAGE RETURN */
10031        0x001C, /* FILE SEPARATOR */
10032        0x001D, /* GROUP SEPARATOR */
10033        0x001E, /* RECORD SEPARATOR */
10034        0x0085, /* NEXT LINE */
10035        0x2028, /* LINE SEPARATOR */
10036        0x2029, /* PARAGRAPH SEPARATOR */
10037    };
10038
10039    /* Init the implementation */
10040    free_list = NULL;
10041    numfree = 0;
10042    unicode_empty = _PyUnicode_New(0);
10043    if (!unicode_empty)
10044        return;
10045
10046    for (i = 0; i < 256; i++)
10047        unicode_latin1[i] = NULL;
10048    if (PyType_Ready(&PyUnicode_Type) < 0)
10049        Py_FatalError("Can't initialize 'unicode'");
10050
10051    /* initialize the linebreak bloom filter */
10052    bloom_linebreak = make_bloom_mask(
10053        linebreak, sizeof(linebreak) / sizeof(linebreak[0])
10054        );
10055
10056    PyType_Ready(&EncodingMapType);
10057}
10058
10059/* Finalize the Unicode implementation */
10060
10061int
10062PyUnicode_ClearFreeList(void)
10063{
10064    int freelist_size = numfree;
10065    PyUnicodeObject *u;
10066
10067    for (u = free_list; u != NULL;) {
10068        PyUnicodeObject *v = u;
10069        u = *(PyUnicodeObject **)u;
10070        if (v->str)
10071            PyObject_DEL(v->str);
10072        Py_XDECREF(v->defenc);
10073        PyObject_Del(v);
10074        numfree--;
10075    }
10076    free_list = NULL;
10077    assert(numfree == 0);
10078    return freelist_size;
10079}
10080
10081void
10082_PyUnicode_Fini(void)
10083{
10084    int i;
10085
10086    Py_XDECREF(unicode_empty);
10087    unicode_empty = NULL;
10088
10089    for (i = 0; i < 256; i++) {
10090        if (unicode_latin1[i]) {
10091            Py_DECREF(unicode_latin1[i]);
10092            unicode_latin1[i] = NULL;
10093        }
10094    }
10095    (void)PyUnicode_ClearFreeList();
10096}
10097
10098void
10099PyUnicode_InternInPlace(PyObject **p)
10100{
10101    register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
10102    PyObject *t;
10103    if (s == NULL || !PyUnicode_Check(s))
10104        Py_FatalError(
10105            "PyUnicode_InternInPlace: unicode strings only please!");
10106    /* If it's a subclass, we don't really know what putting
10107       it in the interned dict might do. */
10108    if (!PyUnicode_CheckExact(s))
10109        return;
10110    if (PyUnicode_CHECK_INTERNED(s))
10111        return;
10112    if (interned == NULL) {
10113        interned = PyDict_New();
10114        if (interned == NULL) {
10115            PyErr_Clear(); /* Don't leave an exception */
10116            return;
10117        }
10118    }
10119    /* It might be that the GetItem call fails even
10120       though the key is present in the dictionary,
10121       namely when this happens during a stack overflow. */
10122    Py_ALLOW_RECURSION
10123        t = PyDict_GetItem(interned, (PyObject *)s);
10124    Py_END_ALLOW_RECURSION
10125
10126        if (t) {
10127            Py_INCREF(t);
10128            Py_DECREF(*p);
10129            *p = t;
10130            return;
10131        }
10132
10133    PyThreadState_GET()->recursion_critical = 1;
10134    if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
10135        PyErr_Clear();
10136        PyThreadState_GET()->recursion_critical = 0;
10137        return;
10138    }
10139    PyThreadState_GET()->recursion_critical = 0;
10140    /* The two references in interned are not counted by refcnt.
10141       The deallocator will take care of this */
10142    Py_REFCNT(s) -= 2;
10143    PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
10144}
10145
10146void
10147PyUnicode_InternImmortal(PyObject **p)
10148{
10149    PyUnicode_InternInPlace(p);
10150    if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
10151        PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
10152        Py_INCREF(*p);
10153    }
10154}
10155
10156PyObject *
10157PyUnicode_InternFromString(const char *cp)
10158{
10159    PyObject *s = PyUnicode_FromString(cp);
10160    if (s == NULL)
10161        return NULL;
10162    PyUnicode_InternInPlace(&s);
10163    return s;
10164}
10165
10166void _Py_ReleaseInternedUnicodeStrings(void)
10167{
10168    PyObject *keys;
10169    PyUnicodeObject *s;
10170    Py_ssize_t i, n;
10171    Py_ssize_t immortal_size = 0, mortal_size = 0;
10172
10173    if (interned == NULL || !PyDict_Check(interned))
10174        return;
10175    keys = PyDict_Keys(interned);
10176    if (keys == NULL || !PyList_Check(keys)) {
10177        PyErr_Clear();
10178        return;
10179    }
10180
10181    /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
10182       detector, interned unicode strings are not forcibly deallocated;
10183       rather, we give them their stolen references back, and then clear
10184       and DECREF the interned dict. */
10185
10186    n = PyList_GET_SIZE(keys);
10187    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
10188            n);
10189    for (i = 0; i < n; i++) {
10190        s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
10191        switch (s->state) {
10192        case SSTATE_NOT_INTERNED:
10193            /* XXX Shouldn't happen */
10194            break;
10195        case SSTATE_INTERNED_IMMORTAL:
10196            Py_REFCNT(s) += 1;
10197            immortal_size += s->length;
10198            break;
10199        case SSTATE_INTERNED_MORTAL:
10200            Py_REFCNT(s) += 2;
10201            mortal_size += s->length;
10202            break;
10203        default:
10204            Py_FatalError("Inconsistent interned string state.");
10205        }
10206        s->state = SSTATE_NOT_INTERNED;
10207    }
10208    fprintf(stderr, "total size of all interned strings: "
10209            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
10210            "mortal/immortal\n", mortal_size, immortal_size);
10211    Py_DECREF(keys);
10212    PyDict_Clear(interned);
10213    Py_DECREF(interned);
10214    interned = NULL;
10215}
10216
10217
10218/********************* Unicode Iterator **************************/
10219
10220typedef struct {
10221    PyObject_HEAD
10222    Py_ssize_t it_index;
10223    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
10224} unicodeiterobject;
10225
10226static void
10227unicodeiter_dealloc(unicodeiterobject *it)
10228{
10229    _PyObject_GC_UNTRACK(it);
10230    Py_XDECREF(it->it_seq);
10231    PyObject_GC_Del(it);
10232}
10233
10234static int
10235unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
10236{
10237    Py_VISIT(it->it_seq);
10238    return 0;
10239}
10240
10241static PyObject *
10242unicodeiter_next(unicodeiterobject *it)
10243{
10244    PyUnicodeObject *seq;
10245    PyObject *item;
10246
10247    assert(it != NULL);
10248    seq = it->it_seq;
10249    if (seq == NULL)
10250        return NULL;
10251    assert(PyUnicode_Check(seq));
10252
10253    if (it->it_index < PyUnicode_GET_SIZE(seq)) {
10254        item = PyUnicode_FromUnicode(
10255            PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
10256        if (item != NULL)
10257            ++it->it_index;
10258        return item;
10259    }
10260
10261    Py_DECREF(seq);
10262    it->it_seq = NULL;
10263    return NULL;
10264}
10265
10266static PyObject *
10267unicodeiter_len(unicodeiterobject *it)
10268{
10269    Py_ssize_t len = 0;
10270    if (it->it_seq)
10271        len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
10272    return PyLong_FromSsize_t(len);
10273}
10274
10275PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
10276
10277static PyMethodDef unicodeiter_methods[] = {
10278    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
10279     length_hint_doc},
10280    {NULL,      NULL}       /* sentinel */
10281};
10282
10283PyTypeObject PyUnicodeIter_Type = {
10284    PyVarObject_HEAD_INIT(&PyType_Type, 0)
10285    "str_iterator",         /* tp_name */
10286    sizeof(unicodeiterobject),      /* tp_basicsize */
10287    0,                  /* tp_itemsize */
10288    /* methods */
10289    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
10290    0,                  /* tp_print */
10291    0,                  /* tp_getattr */
10292    0,                  /* tp_setattr */
10293    0,                  /* tp_reserved */
10294    0,                  /* tp_repr */
10295    0,                  /* tp_as_number */
10296    0,                  /* tp_as_sequence */
10297    0,                  /* tp_as_mapping */
10298    0,                  /* tp_hash */
10299    0,                  /* tp_call */
10300    0,                  /* tp_str */
10301    PyObject_GenericGetAttr,        /* tp_getattro */
10302    0,                  /* tp_setattro */
10303    0,                  /* tp_as_buffer */
10304    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
10305    0,                  /* tp_doc */
10306    (traverseproc)unicodeiter_traverse, /* tp_traverse */
10307    0,                  /* tp_clear */
10308    0,                  /* tp_richcompare */
10309    0,                  /* tp_weaklistoffset */
10310    PyObject_SelfIter,          /* tp_iter */
10311    (iternextfunc)unicodeiter_next,     /* tp_iternext */
10312    unicodeiter_methods,            /* tp_methods */
10313    0,
10314};
10315
10316static PyObject *
10317unicode_iter(PyObject *seq)
10318{
10319    unicodeiterobject *it;
10320
10321    if (!PyUnicode_Check(seq)) {
10322        PyErr_BadInternalCall();
10323        return NULL;
10324    }
10325    it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
10326    if (it == NULL)
10327        return NULL;
10328    it->it_index = 0;
10329    Py_INCREF(seq);
10330    it->it_seq = (PyUnicodeObject *)seq;
10331    _PyObject_GC_TRACK(it);
10332    return (PyObject *)it;
10333}
10334
10335size_t
10336Py_UNICODE_strlen(const Py_UNICODE *u)
10337{
10338    int res = 0;
10339    while(*u++)
10340        res++;
10341    return res;
10342}
10343
10344Py_UNICODE*
10345Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
10346{
10347    Py_UNICODE *u = s1;
10348    while ((*u++ = *s2++));
10349    return s1;
10350}
10351
10352Py_UNICODE*
10353Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10354{
10355    Py_UNICODE *u = s1;
10356    while ((*u++ = *s2++))
10357        if (n-- == 0)
10358            break;
10359    return s1;
10360}
10361
10362Py_UNICODE*
10363Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
10364{
10365    Py_UNICODE *u1 = s1;
10366    u1 += Py_UNICODE_strlen(u1);
10367    Py_UNICODE_strcpy(u1, s2);
10368    return s1;
10369}
10370
10371int
10372Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
10373{
10374    while (*s1 && *s2 && *s1 == *s2)
10375        s1++, s2++;
10376    if (*s1 && *s2)
10377        return (*s1 < *s2) ? -1 : +1;
10378    if (*s1)
10379        return 1;
10380    if (*s2)
10381        return -1;
10382    return 0;
10383}
10384
10385int
10386Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10387{
10388    register Py_UNICODE u1, u2;
10389    for (; n != 0; n--) {
10390        u1 = *s1;
10391        u2 = *s2;
10392        if (u1 != u2)
10393            return (u1 < u2) ? -1 : +1;
10394        if (u1 == '\0')
10395            return 0;
10396        s1++;
10397        s2++;
10398    }
10399    return 0;
10400}
10401
10402Py_UNICODE*
10403Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
10404{
10405    const Py_UNICODE *p;
10406    for (p = s; *p; p++)
10407        if (*p == c)
10408            return (Py_UNICODE*)p;
10409    return NULL;
10410}
10411
10412Py_UNICODE*
10413Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
10414{
10415    const Py_UNICODE *p;
10416    p = s + Py_UNICODE_strlen(s);
10417    while (p != s) {
10418        p--;
10419        if (*p == c)
10420            return (Py_UNICODE*)p;
10421    }
10422    return NULL;
10423}
10424
10425Py_UNICODE*
10426PyUnicode_AsUnicodeCopy(PyObject *object)
10427{
10428    PyUnicodeObject *unicode = (PyUnicodeObject *)object;
10429    Py_UNICODE *copy;
10430    Py_ssize_t size;
10431
10432    /* Ensure we won't overflow the size. */
10433    if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
10434        PyErr_NoMemory();
10435        return NULL;
10436    }
10437    size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
10438    size *= sizeof(Py_UNICODE);
10439    copy = PyMem_Malloc(size);
10440    if (copy == NULL) {
10441        PyErr_NoMemory();
10442        return NULL;
10443    }
10444    memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
10445    return copy;
10446}
10447
10448/* A _string module, to export formatter_parser and formatter_field_name_split
10449   to the string.Formatter class implemented in Python. */
10450
10451static PyMethodDef _string_methods[] = {
10452    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
10453     METH_O, PyDoc_STR("split the argument as a field name")},
10454    {"formatter_parser", (PyCFunction) formatter_parser,
10455     METH_O, PyDoc_STR("parse the argument as a format string")},
10456    {NULL, NULL}
10457};
10458
10459static struct PyModuleDef _string_module = {
10460    PyModuleDef_HEAD_INIT,
10461    "_string",
10462    PyDoc_STR("string helper module"),
10463    0,
10464    _string_methods,
10465    NULL,
10466    NULL,
10467    NULL,
10468    NULL
10469};
10470
10471PyMODINIT_FUNC
10472PyInit__string(void)
10473{
10474    return PyModule_Create(&_string_module);
10475}
10476
10477
10478#ifdef __cplusplus
10479}
10480#endif
10481