unicodeobject.c revision 6d5ad227a50c6c5a78e48a98095788953ab49512
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9Copyright (c) Corporation for National Research Initiatives.
10
11--------------------------------------------------------------------
12The original string type implementation is:
13
14  Copyright (c) 1999 by Secret Labs AB
15  Copyright (c) 1999 by Fredrik Lundh
16
17By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
40
41#define PY_SSIZE_T_CLEAN
42#include "Python.h"
43#include "ucnhash.h"
44#include "bytes_methods.h"
45
46#ifdef MS_WINDOWS
47#include <windows.h>
48#endif
49
/* Endianness switches; defaults to little endian.
   Exactly one of BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN gets
   defined, depending on whether WORDS_BIGENDIAN is defined. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
57
58/* --- Globals ------------------------------------------------------------
59
60   The globals are initialized by the _PyUnicode_Init() API and should
61   not be used before calling that API.
62
63*/
64
65
66#ifdef __cplusplus
67extern "C" {
68#endif
69
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/* Sanity check used inside the assert()s of the macros below.
   In debug builds (Py_DEBUG) it runs _PyUnicode_CheckConsistency() with
   check_content == 0; otherwise it is a plain PyUnicode_Check(). */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
78
/* Raw access to the `utf8` field of a PyCompactUnicodeObject (no checks,
   usable as an lvalue). */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* Checked UTF-8 pointer of a ready string: for a compact ASCII string this
   is the memory located directly after the PyASCIIObject header, otherwise
   the stored `utf8` field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Raw access to the `utf8_length` field (no checks, usable as an lvalue). */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked UTF-8 length of a ready string: the `length` field for a compact
   ASCII string, the stored `utf8_length` otherwise. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Unchecked field accessors; each one expands to an lvalue.  Note that
   `wstr` lives in PyASCIIObject whereas `wstr_length` lives in
   PyCompactUnicodeObject. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* Kind and length accessors that only assert _PyUnicode_CHECK(); unlike
   the accessors above that assert PyUnicode_IS_READY(), they do not
   require the string to be ready. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Raw access to the `data.any` pointer of a PyUnicodeObject (no checks). */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
113
/* Optimized version of Py_MAX() to compute the maximum character:
   use it when you are computing the second argument of PyUnicode_New().
   The result is the bitwise OR of both operands, so it is never smaller
   than either of them but is not necessarily equal to the larger one. */
#define MAX_MAXCHAR(maxchar1, maxchar2)                 \
    ((maxchar1) | (maxchar2))
118
/* File-local replacement for PyUnicode_READY(): any earlier definition is
   dropped first.  Evaluates to 0 when the string is already ready,
   otherwise to the result of _PyUnicode_Ready(op).  The sanity check goes
   through _PyUnicode_CHECK(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
125
/* True if the UTF-8 pointer of op aliases its character data, i.e. no
   separate UTF-8 buffer is owned.  Must not be used on compact ASCII
   strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of op aliases its character data.
   The macro only refers to its own parameter, so it works whatever the
   caller's variable is called. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
133
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the string is not compact ASCII, its
   utf8 pointer is set, and that pointer differs from the character data */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): the wstr pointer is set and either the
   string is not ready yet or the pointer differs from the character data */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
149
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each source element is converted with a plain cast.  The copy runs in
   groups of four elements followed by a tail loop for the remainder.
   Every argument is evaluated exactly once; `to` is parenthesized before
   the cast so that an expression such as `buf + offset` is converted as a
   whole. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
173
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned;

/* The empty Unicode object is shared to improve performance. */
static PyObject *unicode_empty;

/* List of static strings. */
static _Py_Identifier *static_strings;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  The array is indexed by code point (0..255). */
static PyObject *unicode_latin1[256];
193
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by code point 0x00..0x7F; a non-zero entry marks
   the character as whitespace:

     0x09 CHARACTER TABULATION   0x0A LINE FEED
     0x0B LINE TABULATION        0x0C FORM FEED
     0x0D CARRIAGE RETURN        0x1C FILE SEPARATOR
     0x1D GROUP SEPARATOR        0x1E RECORD SEPARATOR
     0x1F UNIT SEPARATOR         0x20 SPACE
*/
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
224
/* forward declarations of helpers that are used before their definition */
static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
static PyObject* get_latin1_char(unsigned char ch);
static int unicode_modifiable(PyObject *unicode);


/* Constructors taking a buffer of `size` characters of a fixed width. */
static PyObject *
_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);

/* Encoder error handling helpers (defined elsewhere in this file). */
static PyObject *
unicode_encode_call_errorhandler(const char *errors,
       PyObject **errorHandler,const char *encoding, const char *reason,
       PyObject *unicode, PyObject **exceptionObject,
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);

static void
raise_encode_exception(PyObject **exceptionObject,
                       const char *encoding,
                       PyObject *unicode,
                       Py_ssize_t startpos, Py_ssize_t endpos,
                       const char *reason);
250
/* Fast detection of ASCII line break characters.

   Lookup table indexed by code point 0x00..0x7F; a non-zero entry marks
   the character as a line break:

     0x0A LINE FEED              0x0B LINE TABULATION
     0x0C FORM FEED              0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR
*/
static unsigned char ascii_linebreak[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
278
279/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
280   This function is kept for backward compatibility with the old API. */
281Py_UNICODE
282PyUnicode_GetMax(void)
283{
284#ifdef Py_UNICODE_WIDE
285    return 0x10FFFF;
286#else
287    /* This is actually an illegal character, so it should
288       not be passed to unichr. */
289    return 0xFFFF;
290#endif
291}
292
#ifdef Py_DEBUG
/* Debug-only validation of the internal layout of a unicode object.

   op:            object to check; must satisfy PyUnicode_Check().
   check_content: if non-zero (and the string is not of wchar kind), also
                  scan the characters to verify that the narrowest possible
                  kind is used and that the buffer is null terminated.

   Violations are reported through assert(); the function itself always
   returns 1 so that a call can be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII: always 1 byte per character and ready */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: character data directly follows the
               PyCompactUnicodeObject header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            /* non-compact (legacy) object: data lives in data.any */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr buffer is populated */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                /* a legacy ASCII string shares its data as UTF-8;
                   any other legacy string must not */
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must alias the data exactly when the kind has the same
               width as wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* an absent cache must have a zero length */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        /* find the largest character actually stored */
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* the character after the last one must be the null terminator */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
408
409static PyObject*
410unicode_result_wchar(PyObject *unicode)
411{
412#ifndef Py_DEBUG
413    Py_ssize_t len;
414
415    assert(Py_REFCNT(unicode) == 1);
416
417    len = _PyUnicode_WSTR_LENGTH(unicode);
418    if (len == 0) {
419        Py_INCREF(unicode_empty);
420        Py_DECREF(unicode);
421        return unicode_empty;
422    }
423
424    if (len == 1) {
425        wchar_t ch = _PyUnicode_WSTR(unicode)[0];
426        if (ch < 256) {
427            PyObject *latin1_char = get_latin1_char((unsigned char)ch);
428            Py_DECREF(unicode);
429            return latin1_char;
430        }
431    }
432
433    if (_PyUnicode_Ready(unicode) < 0) {
434        Py_XDECREF(unicode);
435        return NULL;
436    }
437#else
438    /* don't make the result ready in debug mode to ensure that the caller
439       makes the string ready before using it */
440    assert(_PyUnicode_CheckConsistency(unicode, 1));
441#endif
442    return unicode;
443}
444
445static PyObject*
446unicode_result_ready(PyObject *unicode)
447{
448    Py_ssize_t length;
449
450    length = PyUnicode_GET_LENGTH(unicode);
451    if (length == 0) {
452        if (unicode != unicode_empty) {
453            Py_INCREF(unicode_empty);
454            Py_DECREF(unicode);
455        }
456        return unicode_empty;
457    }
458
459    if (length == 1) {
460        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
461        if (ch < 256) {
462            PyObject *latin1_char = unicode_latin1[ch];
463            if (latin1_char != NULL) {
464                if (unicode != latin1_char) {
465                    Py_INCREF(latin1_char);
466                    Py_DECREF(unicode);
467                }
468                return latin1_char;
469            }
470            else {
471                assert(_PyUnicode_CheckConsistency(unicode, 1));
472                Py_INCREF(unicode);
473                unicode_latin1[ch] = unicode;
474                return unicode;
475            }
476        }
477    }
478
479    assert(_PyUnicode_CheckConsistency(unicode, 1));
480    return unicode;
481}
482
483static PyObject*
484unicode_result(PyObject *unicode)
485{
486    assert(_PyUnicode_CHECK(unicode));
487    if (PyUnicode_IS_READY(unicode))
488        return unicode_result_ready(unicode);
489    else
490        return unicode_result_wchar(unicode);
491}
492
493static PyObject*
494unicode_result_unchanged(PyObject *unicode)
495{
496    if (PyUnicode_CheckExact(unicode)) {
497        if (PyUnicode_READY(unicode) == -1)
498            return NULL;
499        Py_INCREF(unicode);
500        return unicode;
501    }
502    else
503        /* Subtype -- return genuine unicode string with the same value. */
504        return _PyUnicode_Copy(unicode);
505}
506
#ifdef HAVE_MBCS
/* Windows version information, only present when HAVE_MBCS is defined.
   NOTE(review): presumably filled in during initialization and consulted
   by the MBCS codec -- neither is visible in this part of the file. */
static OSVERSIONINFOEX winver;
#endif
510
/* --- Bloom Filters ----------------------------------------------------- */

/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low bits of
   each unicode character (the character modulo BLOOM_WIDTH) as the bit
   index. */

/* the linebreak mask is set up by Unicode_Init below */

/* BLOOM_WIDTH is the number of mask bits that are used: the largest of
   128, 64 and 32 that does not exceed LONG_BIT. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

/* integer type that holds a bloom mask */
#define BLOOM_MASK unsigned long

/* bloom mask of the line break characters */
static BLOOM_MASK bloom_linebreak;
532
/* BLOOM_ADD(mask, ch): set the bit that belongs to character ch in mask
   (mask must be a modifiable lvalue).
   BLOOM(mask, ch): non-zero if the bit that belongs to ch is set in mask;
   a zero result means ch is definitely not in the set, a non-zero result
   means it may be.
   The mask argument is parenthesized so that compound expressions such as
   `a | b` are handled as a whole. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Non-zero if ch is a line break: table lookup for characters below 128,
   otherwise the bloom mask is consulted first and Py_UNICODE_ISLINEBREAK()
   only runs when the mask reports a possible match.
   Note: ch is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
539
540Py_LOCAL_INLINE(BLOOM_MASK)
541make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
542{
543    /* calculate simple bloom-style bitmask for a given unicode string */
544
545    BLOOM_MASK mask;
546    Py_ssize_t i;
547
548    mask = 0;
549    for (i = 0; i < len; i++)
550        BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
551
552    return mask;
553}
554
/* Non-zero if character chr occurs in the unicode object str.  mask is
   the bloom mask of str: the exact search with PyUnicode_FindChar() only
   runs when the mask reports a possible match.
   Note: chr and str are evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, str) \
    (BLOOM(mask, chr) \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
558
559/* Compilation of templated routines */
560
561#include "stringlib/asciilib.h"
562#include "stringlib/fastsearch.h"
563#include "stringlib/partition.h"
564#include "stringlib/split.h"
565#include "stringlib/count.h"
566#include "stringlib/find.h"
567#include "stringlib/find_max_char.h"
568#include "stringlib/localeutil.h"
569#include "stringlib/undef.h"
570
571#include "stringlib/ucs1lib.h"
572#include "stringlib/fastsearch.h"
573#include "stringlib/partition.h"
574#include "stringlib/split.h"
575#include "stringlib/count.h"
576#include "stringlib/find.h"
577#include "stringlib/find_max_char.h"
578#include "stringlib/localeutil.h"
579#include "stringlib/undef.h"
580
581#include "stringlib/ucs2lib.h"
582#include "stringlib/fastsearch.h"
583#include "stringlib/partition.h"
584#include "stringlib/split.h"
585#include "stringlib/count.h"
586#include "stringlib/find.h"
587#include "stringlib/find_max_char.h"
588#include "stringlib/localeutil.h"
589#include "stringlib/undef.h"
590
591#include "stringlib/ucs4lib.h"
592#include "stringlib/fastsearch.h"
593#include "stringlib/partition.h"
594#include "stringlib/split.h"
595#include "stringlib/count.h"
596#include "stringlib/find.h"
597#include "stringlib/find_max_char.h"
598#include "stringlib/localeutil.h"
599#include "stringlib/undef.h"
600
601#include "stringlib/unicodedefs.h"
602#include "stringlib/fastsearch.h"
603#include "stringlib/count.h"
604#include "stringlib/find.h"
605#include "stringlib/undef.h"
606
607/* --- Unicode Object ----------------------------------------------------- */
608
/* Forward declaration.  fixfct is a callback that receives a unicode
   object and returns a Py_UCS4.
   NOTE(review): the exact contract of fixup() and of the callback's return
   value is defined with the implementation, which is not visible here. */
static PyObject *
fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
611
612Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
613                                     Py_ssize_t size, Py_UCS4 ch,
614                                     int direction)
615{
616    int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
617
618    switch (kind) {
619    case PyUnicode_1BYTE_KIND:
620        {
621            Py_UCS1 ch1 = (Py_UCS1) ch;
622            if (ch1 == ch)
623                return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
624            else
625                return -1;
626        }
627    case PyUnicode_2BYTE_KIND:
628        {
629            Py_UCS2 ch2 = (Py_UCS2) ch;
630            if (ch2 == ch)
631                return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
632            else
633                return -1;
634        }
635    case PyUnicode_4BYTE_KIND:
636        return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
637    default:
638        assert(0);
639        return -1;
640    }
641}
642
643static PyObject*
644resize_compact(PyObject *unicode, Py_ssize_t length)
645{
646    Py_ssize_t char_size;
647    Py_ssize_t struct_size;
648    Py_ssize_t new_size;
649    int share_wstr;
650    PyObject *new_unicode;
651    assert(unicode_modifiable(unicode));
652    assert(PyUnicode_IS_READY(unicode));
653    assert(PyUnicode_IS_COMPACT(unicode));
654
655    char_size = PyUnicode_KIND(unicode);
656    if (PyUnicode_IS_ASCII(unicode))
657        struct_size = sizeof(PyASCIIObject);
658    else
659        struct_size = sizeof(PyCompactUnicodeObject);
660    share_wstr = _PyUnicode_SHARE_WSTR(unicode);
661
662    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
663        PyErr_NoMemory();
664        return NULL;
665    }
666    new_size = (struct_size + (length + 1) * char_size);
667
668    _Py_DEC_REFTOTAL;
669    _Py_ForgetReference(unicode);
670
671    new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
672    if (new_unicode == NULL) {
673        _Py_NewReference(unicode);
674        PyErr_NoMemory();
675        return NULL;
676    }
677    unicode = new_unicode;
678    _Py_NewReference(unicode);
679
680    _PyUnicode_LENGTH(unicode) = length;
681    if (share_wstr) {
682        _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
683        if (!PyUnicode_IS_ASCII(unicode))
684            _PyUnicode_WSTR_LENGTH(unicode) = length;
685    }
686    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
687                    length, 0);
688    assert(_PyUnicode_CheckConsistency(unicode, 0));
689    return unicode;
690}
691
/* Resize a non-compact unicode object with a reference count of 1 to
   `length` characters without changing the address of the object.

   For a ready string the character data buffer is reallocated and the
   aliases of that buffer (wstr, utf8) are updated; a separately allocated
   wstr buffer, or the wstr buffer of a string that is not ready, is
   reallocated as well.

   Returns 0 on success, -1 with a MemoryError set on failure. */
static int
resize_inplace(PyObject *unicode, Py_ssize_t length)
{
    wchar_t *wstr;
    Py_ssize_t new_size;
    assert(!PyUnicode_IS_COMPACT(unicode));
    assert(Py_REFCNT(unicode) == 1);

    if (PyUnicode_IS_READY(unicode)) {
        Py_ssize_t char_size;
        int share_wstr, share_utf8;
        void *data;

        data = _PyUnicode_DATA_ANY(unicode);
        char_size = PyUnicode_KIND(unicode);
        /* remember which pointers alias the data buffer: they have to
           follow it when the reallocation moves the buffer */
        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);

        /* (length + 1 terminator) characters must fit in Py_ssize_t */
        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
            PyErr_NoMemory();
            return -1;
        }
        new_size = (length + 1) * char_size;

        /* a separately allocated UTF-8 cache is dropped */
        if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
        {
            PyObject_DEL(_PyUnicode_UTF8(unicode));
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }

        /* on failure the object still refers to the old, valid buffer
           because only the local copy of the pointer is overwritten */
        data = (PyObject *)PyObject_REALLOC(data, new_size);
        if (data == NULL) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = data;
        if (share_wstr) {
            _PyUnicode_WSTR(unicode) = data;
            _PyUnicode_WSTR_LENGTH(unicode) = length;
        }
        if (share_utf8) {
            _PyUnicode_UTF8(unicode) = data;
            _PyUnicode_UTF8_LENGTH(unicode) = length;
        }
        _PyUnicode_LENGTH(unicode) = length;
        /* null terminate at the new length */
        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
        /* done unless a separate wstr buffer has to be resized below */
        if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
            assert(_PyUnicode_CheckConsistency(unicode, 0));
            return 0;
        }
    }
    assert(_PyUnicode_WSTR(unicode) != NULL);

    /* check for integer overflow */
    if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
        PyErr_NoMemory();
        return -1;
    }
    new_size = sizeof(wchar_t) * (length + 1);
    wstr =  _PyUnicode_WSTR(unicode);
    /* on failure the object keeps its old wstr pointer */
    wstr = PyObject_REALLOC(wstr, new_size);
    if (!wstr) {
        PyErr_NoMemory();
        return -1;
    }
    _PyUnicode_WSTR(unicode) = wstr;
    _PyUnicode_WSTR(unicode)[length] = 0;
    _PyUnicode_WSTR_LENGTH(unicode) = length;
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return 0;
}
764
765static PyObject*
766resize_copy(PyObject *unicode, Py_ssize_t length)
767{
768    Py_ssize_t copy_length;
769    if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
770        PyObject *copy;
771
772        if (PyUnicode_READY(unicode) == -1)
773            return NULL;
774
775        copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
776        if (copy == NULL)
777            return NULL;
778
779        copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
780        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
781        return copy;
782    }
783    else {
784        PyObject *w;
785
786        w = (PyObject*)_PyUnicode_New(length);
787        if (w == NULL)
788            return NULL;
789        copy_length = _PyUnicode_WSTR_LENGTH(unicode);
790        copy_length = Py_MIN(copy_length, length);
791        Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
792                        copy_length);
793        return w;
794    }
795}
796
797/* We allocate one more byte to make sure the string is
798   Ux0000 terminated; some code (e.g. new_identifier)
799   relies on that.
800
801   XXX This allocator could further be enhanced by assuring that the
802   free list never reduces its size below 1.
803
804*/
805
806static PyUnicodeObject *
807_PyUnicode_New(Py_ssize_t length)
808{
809    register PyUnicodeObject *unicode;
810    size_t new_size;
811
812    /* Optimization for empty strings */
813    if (length == 0 && unicode_empty != NULL) {
814        Py_INCREF(unicode_empty);
815        return (PyUnicodeObject*)unicode_empty;
816    }
817
818    /* Ensure we won't overflow the size. */
819    if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
820        return (PyUnicodeObject *)PyErr_NoMemory();
821    }
822    if (length < 0) {
823        PyErr_SetString(PyExc_SystemError,
824                        "Negative size passed to _PyUnicode_New");
825        return NULL;
826    }
827
828    unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
829    if (unicode == NULL)
830        return NULL;
831    new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
832    _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
833    if (!_PyUnicode_WSTR(unicode)) {
834        Py_DECREF(unicode);
835        PyErr_NoMemory();
836        return NULL;
837    }
838
839    /* Initialize the first element to guard against cases where
840     * the caller fails before initializing str -- unicode_resize()
841     * reads str[0], and the Keep-Alive optimization can keep memory
842     * allocated for str alive across a call to unicode_dealloc(unicode).
843     * We don't want unicode_resize to read uninitialized memory in
844     * that case.
845     */
846    _PyUnicode_WSTR(unicode)[0] = 0;
847    _PyUnicode_WSTR(unicode)[length] = 0;
848    _PyUnicode_WSTR_LENGTH(unicode) = length;
849    _PyUnicode_HASH(unicode) = -1;
850    _PyUnicode_STATE(unicode).interned = 0;
851    _PyUnicode_STATE(unicode).kind = 0;
852    _PyUnicode_STATE(unicode).compact = 0;
853    _PyUnicode_STATE(unicode).ready = 0;
854    _PyUnicode_STATE(unicode).ascii = 0;
855    _PyUnicode_DATA_ANY(unicode) = NULL;
856    _PyUnicode_LENGTH(unicode) = 0;
857    _PyUnicode_UTF8(unicode) = NULL;
858    _PyUnicode_UTF8_LENGTH(unicode) = 0;
859    assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
860    return unicode;
861}
862
863static const char*
864unicode_kind_name(PyObject *unicode)
865{
866    /* don't check consistency: unicode_kind_name() is called from
867       _PyUnicode_Dump() */
868    if (!PyUnicode_IS_COMPACT(unicode))
869    {
870        if (!PyUnicode_IS_READY(unicode))
871            return "wstr";
872        switch (PyUnicode_KIND(unicode))
873        {
874        case PyUnicode_1BYTE_KIND:
875            if (PyUnicode_IS_ASCII(unicode))
876                return "legacy ascii";
877            else
878                return "legacy latin1";
879        case PyUnicode_2BYTE_KIND:
880            return "legacy UCS2";
881        case PyUnicode_4BYTE_KIND:
882            return "legacy UCS4";
883        default:
884            return "<legacy invalid kind>";
885        }
886    }
887    assert(PyUnicode_IS_READY(unicode));
888    switch (PyUnicode_KIND(unicode)) {
889    case PyUnicode_1BYTE_KIND:
890        if (PyUnicode_IS_ASCII(unicode))
891            return "ascii";
892        else
893            return "latin1";
894    case PyUnicode_2BYTE_KIND:
895        return "UCS2";
896    case PyUnicode_4BYTE_KIND:
897        return "UCS4";
898    default:
899        return "<invalid compact kind>";
900    }
901}
902
903#ifdef Py_DEBUG
904/* Functions wrapping macros for use in debugger */
905char *_PyUnicode_utf8(void *unicode){
906    return PyUnicode_UTF8(unicode);
907}
908
909void *_PyUnicode_compact_data(void *unicode) {
910    return _PyUnicode_COMPACT_DATA(unicode);
911}
912void *_PyUnicode_data(void *unicode){
913    printf("obj %p\n", unicode);
914    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
915    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
916    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
917    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
918    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
919    return PyUnicode_DATA(unicode);
920}
921
922void
923_PyUnicode_Dump(PyObject *op)
924{
925    PyASCIIObject *ascii = (PyASCIIObject *)op;
926    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
927    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
928    void *data;
929
930    if (ascii->state.compact)
931    {
932        if (ascii->state.ascii)
933            data = (ascii + 1);
934        else
935            data = (compact + 1);
936    }
937    else
938        data = unicode->data.any;
939    printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
940
941    if (ascii->wstr == data)
942        printf("shared ");
943    printf("wstr=%p", ascii->wstr);
944
945    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
946        printf(" (%zu), ", compact->wstr_length);
947        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
948            printf("shared ");
949        printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
950    }
951    printf(", data=%p\n", data);
952}
953#endif
954
955PyObject *
956PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
957{
958    PyObject *obj;
959    PyCompactUnicodeObject *unicode;
960    void *data;
961    enum PyUnicode_Kind kind;
962    int is_sharing, is_ascii;
963    Py_ssize_t char_size;
964    Py_ssize_t struct_size;
965
966    /* Optimization for empty strings */
967    if (size == 0 && unicode_empty != NULL) {
968        Py_INCREF(unicode_empty);
969        return unicode_empty;
970    }
971
972    is_ascii = 0;
973    is_sharing = 0;
974    struct_size = sizeof(PyCompactUnicodeObject);
975    if (maxchar < 128) {
976        kind = PyUnicode_1BYTE_KIND;
977        char_size = 1;
978        is_ascii = 1;
979        struct_size = sizeof(PyASCIIObject);
980    }
981    else if (maxchar < 256) {
982        kind = PyUnicode_1BYTE_KIND;
983        char_size = 1;
984    }
985    else if (maxchar < 65536) {
986        kind = PyUnicode_2BYTE_KIND;
987        char_size = 2;
988        if (sizeof(wchar_t) == 2)
989            is_sharing = 1;
990    }
991    else {
992        if (maxchar > MAX_UNICODE) {
993            PyErr_SetString(PyExc_SystemError,
994                            "invalid maximum character passed to PyUnicode_New");
995            return NULL;
996        }
997        kind = PyUnicode_4BYTE_KIND;
998        char_size = 4;
999        if (sizeof(wchar_t) == 4)
1000            is_sharing = 1;
1001    }
1002
1003    /* Ensure we won't overflow the size. */
1004    if (size < 0) {
1005        PyErr_SetString(PyExc_SystemError,
1006                        "Negative size passed to PyUnicode_New");
1007        return NULL;
1008    }
1009    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1010        return PyErr_NoMemory();
1011
1012    /* Duplicated allocation code from _PyObject_New() instead of a call to
1013     * PyObject_New() so we are able to allocate space for the object and
1014     * it's data buffer.
1015     */
1016    obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1017    if (obj == NULL)
1018        return PyErr_NoMemory();
1019    obj = PyObject_INIT(obj, &PyUnicode_Type);
1020    if (obj == NULL)
1021        return NULL;
1022
1023    unicode = (PyCompactUnicodeObject *)obj;
1024    if (is_ascii)
1025        data = ((PyASCIIObject*)obj) + 1;
1026    else
1027        data = unicode + 1;
1028    _PyUnicode_LENGTH(unicode) = size;
1029    _PyUnicode_HASH(unicode) = -1;
1030    _PyUnicode_STATE(unicode).interned = 0;
1031    _PyUnicode_STATE(unicode).kind = kind;
1032    _PyUnicode_STATE(unicode).compact = 1;
1033    _PyUnicode_STATE(unicode).ready = 1;
1034    _PyUnicode_STATE(unicode).ascii = is_ascii;
1035    if (is_ascii) {
1036        ((char*)data)[size] = 0;
1037        _PyUnicode_WSTR(unicode) = NULL;
1038    }
1039    else if (kind == PyUnicode_1BYTE_KIND) {
1040        ((char*)data)[size] = 0;
1041        _PyUnicode_WSTR(unicode) = NULL;
1042        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1043        unicode->utf8 = NULL;
1044        unicode->utf8_length = 0;
1045    }
1046    else {
1047        unicode->utf8 = NULL;
1048        unicode->utf8_length = 0;
1049        if (kind == PyUnicode_2BYTE_KIND)
1050            ((Py_UCS2*)data)[size] = 0;
1051        else /* kind == PyUnicode_4BYTE_KIND */
1052            ((Py_UCS4*)data)[size] = 0;
1053        if (is_sharing) {
1054            _PyUnicode_WSTR_LENGTH(unicode) = size;
1055            _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1056        }
1057        else {
1058            _PyUnicode_WSTR_LENGTH(unicode) = 0;
1059            _PyUnicode_WSTR(unicode) = NULL;
1060        }
1061    }
1062#ifdef Py_DEBUG
1063    /* Fill the data with invalid characters to detect bugs earlier.
1064       _PyUnicode_CheckConsistency(str, 1) detects invalid characters,
1065       at least for ASCII and UCS-4 strings. U+00FF is invalid in ASCII
1066       and U+FFFFFFFF is an invalid character in Unicode 6.0. */
1067    memset(data, 0xff, size * kind);
1068#endif
1069    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1070    return obj;
1071}
1072
1073#if SIZEOF_WCHAR_T == 2
1074/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1075   will decode surrogate pairs, the other conversions are implemented as macros
1076   for efficiency.
1077
1078   This function assumes that unicode can hold one more code point than wstr
1079   characters for a terminating null character. */
1080static void
1081unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
1082                              PyObject *unicode)
1083{
1084    const wchar_t *iter;
1085    Py_UCS4 *ucs4_out;
1086
1087    assert(unicode != NULL);
1088    assert(_PyUnicode_CHECK(unicode));
1089    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1090    ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1091
1092    for (iter = begin; iter < end; ) {
1093        assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1094                           _PyUnicode_GET_LENGTH(unicode)));
1095        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1096            && (iter+1) < end
1097            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1098        {
1099            *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1100            iter += 2;
1101        }
1102        else {
1103            *ucs4_out++ = *iter;
1104            iter++;
1105        }
1106    }
1107    assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1108                        _PyUnicode_GET_LENGTH(unicode)));
1109
1110}
1111#endif
1112
1113static int
1114unicode_check_modifiable(PyObject *unicode)
1115{
1116    if (!unicode_modifiable(unicode)) {
1117        PyErr_SetString(PyExc_SystemError,
1118                        "Cannot modify a string currently used");
1119        return -1;
1120    }
1121    return 0;
1122}
1123
1124static int
1125_copy_characters(PyObject *to, Py_ssize_t to_start,
1126                 PyObject *from, Py_ssize_t from_start,
1127                 Py_ssize_t how_many, int check_maxchar)
1128{
1129    unsigned int from_kind, to_kind;
1130    void *from_data, *to_data;
1131
1132    assert(0 <= how_many);
1133    assert(0 <= from_start);
1134    assert(0 <= to_start);
1135    assert(PyUnicode_Check(from));
1136    assert(PyUnicode_IS_READY(from));
1137    assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1138
1139    assert(PyUnicode_Check(to));
1140    assert(PyUnicode_IS_READY(to));
1141    assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1142
1143    if (how_many == 0)
1144        return 0;
1145
1146    from_kind = PyUnicode_KIND(from);
1147    from_data = PyUnicode_DATA(from);
1148    to_kind = PyUnicode_KIND(to);
1149    to_data = PyUnicode_DATA(to);
1150
1151#ifdef Py_DEBUG
1152    if (!check_maxchar
1153        && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1154    {
1155        const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1156        Py_UCS4 ch;
1157        Py_ssize_t i;
1158        for (i=0; i < how_many; i++) {
1159            ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1160            assert(ch <= to_maxchar);
1161        }
1162    }
1163#endif
1164
1165    if (from_kind == to_kind) {
1166        if (check_maxchar
1167            && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1168        {
1169            /* Writing Latin-1 characters into an ASCII string requires to
1170               check that all written characters are pure ASCII */
1171            Py_UCS4 max_char;
1172            max_char = ucs1lib_find_max_char(from_data,
1173                                             (Py_UCS1*)from_data + how_many);
1174            if (max_char >= 128)
1175                return -1;
1176        }
1177        Py_MEMCPY((char*)to_data + to_kind * to_start,
1178                  (char*)from_data + from_kind * from_start,
1179                  to_kind * how_many);
1180    }
1181    else if (from_kind == PyUnicode_1BYTE_KIND
1182             && to_kind == PyUnicode_2BYTE_KIND)
1183    {
1184        _PyUnicode_CONVERT_BYTES(
1185            Py_UCS1, Py_UCS2,
1186            PyUnicode_1BYTE_DATA(from) + from_start,
1187            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1188            PyUnicode_2BYTE_DATA(to) + to_start
1189            );
1190    }
1191    else if (from_kind == PyUnicode_1BYTE_KIND
1192             && to_kind == PyUnicode_4BYTE_KIND)
1193    {
1194        _PyUnicode_CONVERT_BYTES(
1195            Py_UCS1, Py_UCS4,
1196            PyUnicode_1BYTE_DATA(from) + from_start,
1197            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1198            PyUnicode_4BYTE_DATA(to) + to_start
1199            );
1200    }
1201    else if (from_kind == PyUnicode_2BYTE_KIND
1202             && to_kind == PyUnicode_4BYTE_KIND)
1203    {
1204        _PyUnicode_CONVERT_BYTES(
1205            Py_UCS2, Py_UCS4,
1206            PyUnicode_2BYTE_DATA(from) + from_start,
1207            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1208            PyUnicode_4BYTE_DATA(to) + to_start
1209            );
1210    }
1211    else {
1212        assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1213
1214        if (!check_maxchar) {
1215            if (from_kind == PyUnicode_2BYTE_KIND
1216                && to_kind == PyUnicode_1BYTE_KIND)
1217            {
1218                _PyUnicode_CONVERT_BYTES(
1219                    Py_UCS2, Py_UCS1,
1220                    PyUnicode_2BYTE_DATA(from) + from_start,
1221                    PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1222                    PyUnicode_1BYTE_DATA(to) + to_start
1223                    );
1224            }
1225            else if (from_kind == PyUnicode_4BYTE_KIND
1226                     && to_kind == PyUnicode_1BYTE_KIND)
1227            {
1228                _PyUnicode_CONVERT_BYTES(
1229                    Py_UCS4, Py_UCS1,
1230                    PyUnicode_4BYTE_DATA(from) + from_start,
1231                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1232                    PyUnicode_1BYTE_DATA(to) + to_start
1233                    );
1234            }
1235            else if (from_kind == PyUnicode_4BYTE_KIND
1236                     && to_kind == PyUnicode_2BYTE_KIND)
1237            {
1238                _PyUnicode_CONVERT_BYTES(
1239                    Py_UCS4, Py_UCS2,
1240                    PyUnicode_4BYTE_DATA(from) + from_start,
1241                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1242                    PyUnicode_2BYTE_DATA(to) + to_start
1243                    );
1244            }
1245            else {
1246                assert(0);
1247                return -1;
1248            }
1249        }
1250        else {
1251            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1252            Py_UCS4 ch;
1253            Py_ssize_t i;
1254
1255            for (i=0; i < how_many; i++) {
1256                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1257                if (ch > to_maxchar)
1258                    return -1;
1259                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1260            }
1261        }
1262    }
1263    return 0;
1264}
1265
1266void
1267_PyUnicode_FastCopyCharacters(
1268    PyObject *to, Py_ssize_t to_start,
1269    PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1270{
1271    (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1272}
1273
1274Py_ssize_t
1275PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1276                         PyObject *from, Py_ssize_t from_start,
1277                         Py_ssize_t how_many)
1278{
1279    int err;
1280
1281    if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1282        PyErr_BadInternalCall();
1283        return -1;
1284    }
1285
1286    if (PyUnicode_READY(from) == -1)
1287        return -1;
1288    if (PyUnicode_READY(to) == -1)
1289        return -1;
1290
1291    if (from_start < 0) {
1292        PyErr_SetString(PyExc_IndexError, "string index out of range");
1293        return -1;
1294    }
1295    if (to_start < 0) {
1296        PyErr_SetString(PyExc_IndexError, "string index out of range");
1297        return -1;
1298    }
1299    how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1300    if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1301        PyErr_Format(PyExc_SystemError,
1302                     "Cannot write %zi characters at %zi "
1303                     "in a string of %zi characters",
1304                     how_many, to_start, PyUnicode_GET_LENGTH(to));
1305        return -1;
1306    }
1307
1308    if (how_many == 0)
1309        return 0;
1310
1311    if (unicode_check_modifiable(to))
1312        return -1;
1313
1314    err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1315    if (err) {
1316        PyErr_Format(PyExc_SystemError,
1317                     "Cannot copy %s characters "
1318                     "into a string of %s characters",
1319                     unicode_kind_name(from),
1320                     unicode_kind_name(to));
1321        return -1;
1322    }
1323    return how_many;
1324}
1325
1326/* Find the maximum code point and count the number of surrogate pairs so a
1327   correct string length can be computed before converting a string to UCS4.
1328   This function counts single surrogates as a character and not as a pair.
1329
1330   Return 0 on success, or -1 on error. */
1331static int
1332find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1333                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1334{
1335    const wchar_t *iter;
1336    Py_UCS4 ch;
1337
1338    assert(num_surrogates != NULL && maxchar != NULL);
1339    *num_surrogates = 0;
1340    *maxchar = 0;
1341
1342    for (iter = begin; iter < end; ) {
1343#if SIZEOF_WCHAR_T == 2
1344        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1345            && (iter+1) < end
1346            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1347        {
1348            ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1349            ++(*num_surrogates);
1350            iter += 2;
1351        }
1352        else
1353#endif
1354        {
1355            ch = *iter;
1356            iter++;
1357        }
1358        if (ch > *maxchar) {
1359            *maxchar = ch;
1360            if (*maxchar > MAX_UNICODE) {
1361                PyErr_Format(PyExc_ValueError,
1362                             "character U+%x is not in range [U+0000; U+10ffff]",
1363                             ch);
1364                return -1;
1365            }
1366        }
1367    }
1368    return 0;
1369}
1370
1371int
1372_PyUnicode_Ready(PyObject *unicode)
1373{
1374    wchar_t *end;
1375    Py_UCS4 maxchar = 0;
1376    Py_ssize_t num_surrogates;
1377#if SIZEOF_WCHAR_T == 2
1378    Py_ssize_t length_wo_surrogates;
1379#endif
1380
1381    /* _PyUnicode_Ready() is only intended for old-style API usage where
1382       strings were created using _PyObject_New() and where no canonical
1383       representation (the str field) has been set yet aka strings
1384       which are not yet ready. */
1385    assert(_PyUnicode_CHECK(unicode));
1386    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
1387    assert(_PyUnicode_WSTR(unicode) != NULL);
1388    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
1389    assert(_PyUnicode_UTF8(unicode) == NULL);
1390    /* Actually, it should neither be interned nor be anything else: */
1391    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
1392
1393    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
1394    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
1395                                &maxchar, &num_surrogates) == -1)
1396        return -1;
1397
1398    if (maxchar < 256) {
1399        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1400        if (!_PyUnicode_DATA_ANY(unicode)) {
1401            PyErr_NoMemory();
1402            return -1;
1403        }
1404        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
1405                                _PyUnicode_WSTR(unicode), end,
1406                                PyUnicode_1BYTE_DATA(unicode));
1407        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1408        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1409        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1410        if (maxchar < 128) {
1411            _PyUnicode_STATE(unicode).ascii = 1;
1412            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
1413            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1414        }
1415        else {
1416            _PyUnicode_STATE(unicode).ascii = 0;
1417            _PyUnicode_UTF8(unicode) = NULL;
1418            _PyUnicode_UTF8_LENGTH(unicode) = 0;
1419        }
1420        PyObject_FREE(_PyUnicode_WSTR(unicode));
1421        _PyUnicode_WSTR(unicode) = NULL;
1422        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1423    }
1424    /* In this case we might have to convert down from 4-byte native
1425       wchar_t to 2-byte unicode. */
1426    else if (maxchar < 65536) {
1427        assert(num_surrogates == 0 &&
1428               "FindMaxCharAndNumSurrogatePairs() messed up");
1429
1430#if SIZEOF_WCHAR_T == 2
1431        /* We can share representations and are done. */
1432        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1433        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1434        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1435        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1436        _PyUnicode_UTF8(unicode) = NULL;
1437        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1438#else
1439        /* sizeof(wchar_t) == 4 */
1440        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
1441            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
1442        if (!_PyUnicode_DATA_ANY(unicode)) {
1443            PyErr_NoMemory();
1444            return -1;
1445        }
1446        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1447                                _PyUnicode_WSTR(unicode), end,
1448                                PyUnicode_2BYTE_DATA(unicode));
1449        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1450        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1451        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1452        _PyUnicode_UTF8(unicode) = NULL;
1453        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1454        PyObject_FREE(_PyUnicode_WSTR(unicode));
1455        _PyUnicode_WSTR(unicode) = NULL;
1456        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1457#endif
1458    }
1459    /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1460    else {
1461#if SIZEOF_WCHAR_T == 2
1462        /* in case the native representation is 2-bytes, we need to allocate a
1463           new normalized 4-byte version. */
1464        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
1465        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1466        if (!_PyUnicode_DATA_ANY(unicode)) {
1467            PyErr_NoMemory();
1468            return -1;
1469        }
1470        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1471        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1472        _PyUnicode_UTF8(unicode) = NULL;
1473        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1474        /* unicode_convert_wchar_to_ucs4() requires a ready string */
1475        _PyUnicode_STATE(unicode).ready = 1;
1476        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
1477        PyObject_FREE(_PyUnicode_WSTR(unicode));
1478        _PyUnicode_WSTR(unicode) = NULL;
1479        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1480#else
1481        assert(num_surrogates == 0);
1482
1483        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1484        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1485        _PyUnicode_UTF8(unicode) = NULL;
1486        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1487        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1488#endif
1489        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1490    }
1491    _PyUnicode_STATE(unicode).ready = 1;
1492    assert(_PyUnicode_CheckConsistency(unicode, 1));
1493    return 0;
1494}
1495
1496static void
1497unicode_dealloc(register PyObject *unicode)
1498{
1499    switch (PyUnicode_CHECK_INTERNED(unicode)) {
1500    case SSTATE_NOT_INTERNED:
1501        break;
1502
1503    case SSTATE_INTERNED_MORTAL:
1504        /* revive dead object temporarily for DelItem */
1505        Py_REFCNT(unicode) = 3;
1506        if (PyDict_DelItem(interned, unicode) != 0)
1507            Py_FatalError(
1508                "deletion of interned string failed");
1509        break;
1510
1511    case SSTATE_INTERNED_IMMORTAL:
1512        Py_FatalError("Immortal interned string died.");
1513
1514    default:
1515        Py_FatalError("Inconsistent interned string state.");
1516    }
1517
1518    if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
1519        PyObject_DEL(_PyUnicode_WSTR(unicode));
1520    if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
1521        PyObject_DEL(_PyUnicode_UTF8(unicode));
1522    if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1523        PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
1524
1525    Py_TYPE(unicode)->tp_free(unicode);
1526}
1527
1528#ifdef Py_DEBUG
1529static int
1530unicode_is_singleton(PyObject *unicode)
1531{
1532    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1533    if (unicode == unicode_empty)
1534        return 1;
1535    if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1536    {
1537        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1538        if (ch < 256 && unicode_latin1[ch] == unicode)
1539            return 1;
1540    }
1541    return 0;
1542}
1543#endif
1544
1545static int
1546unicode_modifiable(PyObject *unicode)
1547{
1548    assert(_PyUnicode_CHECK(unicode));
1549    if (Py_REFCNT(unicode) != 1)
1550        return 0;
1551    if (_PyUnicode_HASH(unicode) != -1)
1552        return 0;
1553    if (PyUnicode_CHECK_INTERNED(unicode))
1554        return 0;
1555    if (!PyUnicode_CheckExact(unicode))
1556        return 0;
1557#ifdef Py_DEBUG
1558    /* singleton refcount is greater than 1 */
1559    assert(!unicode_is_singleton(unicode));
1560#endif
1561    return 1;
1562}
1563
1564static int
1565unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1566{
1567    PyObject *unicode;
1568    Py_ssize_t old_length;
1569
1570    assert(p_unicode != NULL);
1571    unicode = *p_unicode;
1572
1573    assert(unicode != NULL);
1574    assert(PyUnicode_Check(unicode));
1575    assert(0 <= length);
1576
1577    if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1578        old_length = PyUnicode_WSTR_LENGTH(unicode);
1579    else
1580        old_length = PyUnicode_GET_LENGTH(unicode);
1581    if (old_length == length)
1582        return 0;
1583
1584    if (length == 0) {
1585        Py_DECREF(*p_unicode);
1586        *p_unicode = unicode_empty;
1587        Py_INCREF(*p_unicode);
1588        return 0;
1589    }
1590
1591    if (!unicode_modifiable(unicode)) {
1592        PyObject *copy = resize_copy(unicode, length);
1593        if (copy == NULL)
1594            return -1;
1595        Py_DECREF(*p_unicode);
1596        *p_unicode = copy;
1597        return 0;
1598    }
1599
1600    if (PyUnicode_IS_COMPACT(unicode)) {
1601        PyObject *new_unicode = resize_compact(unicode, length);
1602        if (new_unicode == NULL)
1603            return -1;
1604        *p_unicode = new_unicode;
1605        return 0;
1606    }
1607    return resize_inplace(unicode, length);
1608}
1609
1610int
1611PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1612{
1613    PyObject *unicode;
1614    if (p_unicode == NULL) {
1615        PyErr_BadInternalCall();
1616        return -1;
1617    }
1618    unicode = *p_unicode;
1619    if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1620    {
1621        PyErr_BadInternalCall();
1622        return -1;
1623    }
1624    return unicode_resize(p_unicode, length);
1625}
1626
1627static int
1628unicode_widen(PyObject **p_unicode, Py_ssize_t length,
1629              unsigned int maxchar)
1630{
1631    PyObject *result;
1632    assert(PyUnicode_IS_READY(*p_unicode));
1633    assert(length <= PyUnicode_GET_LENGTH(*p_unicode));
1634    if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1635        return 0;
1636    result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1637                           maxchar);
1638    if (result == NULL)
1639        return -1;
1640    _PyUnicode_FastCopyCharacters(result, 0, *p_unicode, 0, length);
1641    Py_DECREF(*p_unicode);
1642    *p_unicode = result;
1643    return 0;
1644}
1645
1646static int
1647unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1648                Py_UCS4 ch)
1649{
1650    assert(ch <= MAX_UNICODE);
1651    if (unicode_widen(p_unicode, *pos, ch) < 0)
1652        return -1;
1653    PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1654                    PyUnicode_DATA(*p_unicode),
1655                    (*pos)++, ch);
1656    return 0;
1657}
1658
1659/* Copy a ASCII or latin1 char* string into a Python Unicode string.
1660
1661   WARNING: The function doesn't copy the terminating null character and
1662   doesn't check the maximum character (may write a latin1 character in an
1663   ASCII string). */
1664static void
1665unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1666                   const char *str, Py_ssize_t len)
1667{
1668    enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1669    void *data = PyUnicode_DATA(unicode);
1670    const char *end = str + len;
1671
1672    switch (kind) {
1673    case PyUnicode_1BYTE_KIND: {
1674        assert(index + len <= PyUnicode_GET_LENGTH(unicode));
1675        memcpy((char *) data + index, str, len);
1676        break;
1677    }
1678    case PyUnicode_2BYTE_KIND: {
1679        Py_UCS2 *start = (Py_UCS2 *)data + index;
1680        Py_UCS2 *ucs2 = start;
1681        assert(index <= PyUnicode_GET_LENGTH(unicode));
1682
1683        for (; str < end; ++ucs2, ++str)
1684            *ucs2 = (Py_UCS2)*str;
1685
1686        assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1687        break;
1688    }
1689    default: {
1690        Py_UCS4 *start = (Py_UCS4 *)data + index;
1691        Py_UCS4 *ucs4 = start;
1692        assert(kind == PyUnicode_4BYTE_KIND);
1693        assert(index <= PyUnicode_GET_LENGTH(unicode));
1694
1695        for (; str < end; ++ucs4, ++str)
1696            *ucs4 = (Py_UCS4)*str;
1697
1698        assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1699    }
1700    }
1701}
1702
1703
1704static PyObject*
1705get_latin1_char(unsigned char ch)
1706{
1707    PyObject *unicode = unicode_latin1[ch];
1708    if (!unicode) {
1709        unicode = PyUnicode_New(1, ch);
1710        if (!unicode)
1711            return NULL;
1712        PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1713        assert(_PyUnicode_CheckConsistency(unicode, 1));
1714        unicode_latin1[ch] = unicode;
1715    }
1716    Py_INCREF(unicode);
1717    return unicode;
1718}
1719
1720PyObject *
1721PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
1722{
1723    PyObject *unicode;
1724    Py_UCS4 maxchar = 0;
1725    Py_ssize_t num_surrogates;
1726
1727    if (u == NULL)
1728        return (PyObject*)_PyUnicode_New(size);
1729
1730    /* If the Unicode data is known at construction time, we can apply
1731       some optimizations which share commonly used objects. */
1732
1733    /* Optimization for empty strings */
1734    if (size == 0 && unicode_empty != NULL) {
1735        Py_INCREF(unicode_empty);
1736        return unicode_empty;
1737    }
1738
1739    /* Single character Unicode objects in the Latin-1 range are
1740       shared when using this constructor */
1741    if (size == 1 && *u < 256)
1742        return get_latin1_char((unsigned char)*u);
1743
1744    /* If not empty and not single character, copy the Unicode data
1745       into the new object */
1746    if (find_maxchar_surrogates(u, u + size,
1747                                &maxchar, &num_surrogates) == -1)
1748        return NULL;
1749
1750    unicode = PyUnicode_New(size - num_surrogates, maxchar);
1751    if (!unicode)
1752        return NULL;
1753
1754    switch (PyUnicode_KIND(unicode)) {
1755    case PyUnicode_1BYTE_KIND:
1756        _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
1757                                u, u + size, PyUnicode_1BYTE_DATA(unicode));
1758        break;
1759    case PyUnicode_2BYTE_KIND:
1760#if Py_UNICODE_SIZE == 2
1761        Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1762#else
1763        _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
1764                                u, u + size, PyUnicode_2BYTE_DATA(unicode));
1765#endif
1766        break;
1767    case PyUnicode_4BYTE_KIND:
1768#if SIZEOF_WCHAR_T == 2
1769        /* This is the only case which has to process surrogates, thus
1770           a simple copy loop is not enough and we need a function. */
1771        unicode_convert_wchar_to_ucs4(u, u + size, unicode);
1772#else
1773        assert(num_surrogates == 0);
1774        Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1775#endif
1776        break;
1777    default:
1778        assert(0 && "Impossible state");
1779    }
1780
1781    return unicode_result(unicode);
1782}
1783
1784PyObject *
1785PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
1786{
1787    if (size < 0) {
1788        PyErr_SetString(PyExc_SystemError,
1789                        "Negative size passed to PyUnicode_FromStringAndSize");
1790        return NULL;
1791    }
1792    if (u != NULL)
1793        return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1794    else
1795        return (PyObject *)_PyUnicode_New(size);
1796}
1797
1798PyObject *
1799PyUnicode_FromString(const char *u)
1800{
1801    size_t size = strlen(u);
1802    if (size > PY_SSIZE_T_MAX) {
1803        PyErr_SetString(PyExc_OverflowError, "input too long");
1804        return NULL;
1805    }
1806    return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
1807}
1808
1809PyObject *
1810_PyUnicode_FromId(_Py_Identifier *id)
1811{
1812    if (!id->object) {
1813        id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1814                                                  strlen(id->string),
1815                                                  NULL, NULL);
1816        if (!id->object)
1817            return NULL;
1818        PyUnicode_InternInPlace(&id->object);
1819        assert(!id->next);
1820        id->next = static_strings;
1821        static_strings = id;
1822    }
1823    return id->object;
1824}
1825
1826void
1827_PyUnicode_ClearStaticStrings()
1828{
1829    _Py_Identifier *i;
1830    for (i = static_strings; i; i = i->next) {
1831        Py_DECREF(i->object);
1832        i->object = NULL;
1833        i->next = NULL;
1834    }
1835}
1836
1837/* Internal function, doesn't check maximum character */
1838
1839PyObject*
1840_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
1841{
1842    const unsigned char *s = (const unsigned char *)buffer;
1843    PyObject *unicode;
1844    if (size == 1) {
1845#ifdef Py_DEBUG
1846        assert(s[0] < 128);
1847#endif
1848        return get_latin1_char(s[0]);
1849    }
1850    unicode = PyUnicode_New(size, 127);
1851    if (!unicode)
1852        return NULL;
1853    memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1854    assert(_PyUnicode_CheckConsistency(unicode, 1));
1855    return unicode;
1856}
1857
1858static Py_UCS4
1859kind_maxchar_limit(unsigned int kind)
1860{
1861    switch (kind) {
1862    case PyUnicode_1BYTE_KIND:
1863        return 0x80;
1864    case PyUnicode_2BYTE_KIND:
1865        return 0x100;
1866    case PyUnicode_4BYTE_KIND:
1867        return 0x10000;
1868    default:
1869        assert(0 && "invalid kind");
1870        return MAX_UNICODE;
1871    }
1872}
1873
1874Py_LOCAL_INLINE(Py_UCS4)
1875align_maxchar(Py_UCS4 maxchar)
1876{
1877    if (maxchar <= 127)
1878        return 127;
1879    else if (maxchar <= 255)
1880        return 255;
1881    else if (maxchar <= 65535)
1882        return 65535;
1883    else
1884        return MAX_UNICODE;
1885}
1886
1887static PyObject*
1888_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
1889{
1890    PyObject *res;
1891    unsigned char max_char;
1892
1893    if (size == 0) {
1894        Py_INCREF(unicode_empty);
1895        return unicode_empty;
1896    }
1897    assert(size > 0);
1898    if (size == 1)
1899        return get_latin1_char(u[0]);
1900
1901    max_char = ucs1lib_find_max_char(u, u + size);
1902    res = PyUnicode_New(size, max_char);
1903    if (!res)
1904        return NULL;
1905    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1906    assert(_PyUnicode_CheckConsistency(res, 1));
1907    return res;
1908}
1909
1910static PyObject*
1911_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
1912{
1913    PyObject *res;
1914    Py_UCS2 max_char;
1915
1916    if (size == 0) {
1917        Py_INCREF(unicode_empty);
1918        return unicode_empty;
1919    }
1920    assert(size > 0);
1921    if (size == 1) {
1922        Py_UCS4 ch = u[0];
1923        if (ch < 256)
1924            return get_latin1_char((unsigned char)ch);
1925
1926        res = PyUnicode_New(1, ch);
1927        if (res == NULL)
1928            return NULL;
1929        PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1930        assert(_PyUnicode_CheckConsistency(res, 1));
1931        return res;
1932    }
1933
1934    max_char = ucs2lib_find_max_char(u, u + size);
1935    res = PyUnicode_New(size, max_char);
1936    if (!res)
1937        return NULL;
1938    if (max_char >= 256)
1939        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1940    else {
1941        _PyUnicode_CONVERT_BYTES(
1942            Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1943    }
1944    assert(_PyUnicode_CheckConsistency(res, 1));
1945    return res;
1946}
1947
1948static PyObject*
1949_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
1950{
1951    PyObject *res;
1952    Py_UCS4 max_char;
1953
1954    if (size == 0) {
1955        Py_INCREF(unicode_empty);
1956        return unicode_empty;
1957    }
1958    assert(size > 0);
1959    if (size == 1) {
1960        Py_UCS4 ch = u[0];
1961        if (ch < 256)
1962            return get_latin1_char((unsigned char)ch);
1963
1964        res = PyUnicode_New(1, ch);
1965        if (res == NULL)
1966            return NULL;
1967        PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1968        assert(_PyUnicode_CheckConsistency(res, 1));
1969        return res;
1970    }
1971
1972    max_char = ucs4lib_find_max_char(u, u + size);
1973    res = PyUnicode_New(size, max_char);
1974    if (!res)
1975        return NULL;
1976    if (max_char < 256)
1977        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1978                                 PyUnicode_1BYTE_DATA(res));
1979    else if (max_char < 0x10000)
1980        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1981                                 PyUnicode_2BYTE_DATA(res));
1982    else
1983        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1984    assert(_PyUnicode_CheckConsistency(res, 1));
1985    return res;
1986}
1987
1988PyObject*
1989PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1990{
1991    if (size < 0) {
1992        PyErr_SetString(PyExc_ValueError, "size must be positive");
1993        return NULL;
1994    }
1995    switch (kind) {
1996    case PyUnicode_1BYTE_KIND:
1997        return _PyUnicode_FromUCS1(buffer, size);
1998    case PyUnicode_2BYTE_KIND:
1999        return _PyUnicode_FromUCS2(buffer, size);
2000    case PyUnicode_4BYTE_KIND:
2001        return _PyUnicode_FromUCS4(buffer, size);
2002    default:
2003        PyErr_SetString(PyExc_SystemError, "invalid kind");
2004        return NULL;
2005    }
2006}
2007
2008Py_UCS4
2009_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2010{
2011    enum PyUnicode_Kind kind;
2012    void *startptr, *endptr;
2013
2014    assert(PyUnicode_IS_READY(unicode));
2015    assert(0 <= start);
2016    assert(end <= PyUnicode_GET_LENGTH(unicode));
2017    assert(start <= end);
2018
2019    if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2020        return PyUnicode_MAX_CHAR_VALUE(unicode);
2021
2022    if (start == end)
2023        return 127;
2024
2025    if (PyUnicode_IS_ASCII(unicode))
2026        return 127;
2027
2028    kind = PyUnicode_KIND(unicode);
2029    startptr = PyUnicode_DATA(unicode);
2030    endptr = (char *)startptr + end * kind;
2031    startptr = (char *)startptr + start * kind;
2032    switch(kind) {
2033    case PyUnicode_1BYTE_KIND:
2034        return ucs1lib_find_max_char(startptr, endptr);
2035    case PyUnicode_2BYTE_KIND:
2036        return ucs2lib_find_max_char(startptr, endptr);
2037    case PyUnicode_4BYTE_KIND:
2038        return ucs4lib_find_max_char(startptr, endptr);
2039    default:
2040        assert(0);
2041        return 0;
2042    }
2043}
2044
2045/* Ensure that a string uses the most efficient storage, if it is not the
2046   case: create a new string with of the right kind. Write NULL into *p_unicode
2047   on error. */
2048static void
2049unicode_adjust_maxchar(PyObject **p_unicode)
2050{
2051    PyObject *unicode, *copy;
2052    Py_UCS4 max_char;
2053    Py_ssize_t len;
2054    unsigned int kind;
2055
2056    assert(p_unicode != NULL);
2057    unicode = *p_unicode;
2058    assert(PyUnicode_IS_READY(unicode));
2059    if (PyUnicode_IS_ASCII(unicode))
2060        return;
2061
2062    len = PyUnicode_GET_LENGTH(unicode);
2063    kind = PyUnicode_KIND(unicode);
2064    if (kind == PyUnicode_1BYTE_KIND) {
2065        const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2066        max_char = ucs1lib_find_max_char(u, u + len);
2067        if (max_char >= 128)
2068            return;
2069    }
2070    else if (kind == PyUnicode_2BYTE_KIND) {
2071        const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2072        max_char = ucs2lib_find_max_char(u, u + len);
2073        if (max_char >= 256)
2074            return;
2075    }
2076    else {
2077        const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2078        assert(kind == PyUnicode_4BYTE_KIND);
2079        max_char = ucs4lib_find_max_char(u, u + len);
2080        if (max_char >= 0x10000)
2081            return;
2082    }
2083    copy = PyUnicode_New(len, max_char);
2084    if (copy != NULL)
2085        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2086    Py_DECREF(unicode);
2087    *p_unicode = copy;
2088}
2089
2090PyObject*
2091_PyUnicode_Copy(PyObject *unicode)
2092{
2093    Py_ssize_t length;
2094    PyObject *copy;
2095
2096    if (!PyUnicode_Check(unicode)) {
2097        PyErr_BadInternalCall();
2098        return NULL;
2099    }
2100    if (PyUnicode_READY(unicode) == -1)
2101        return NULL;
2102
2103    length = PyUnicode_GET_LENGTH(unicode);
2104    copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
2105    if (!copy)
2106        return NULL;
2107    assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2108
2109    Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2110              length * PyUnicode_KIND(unicode));
2111    assert(_PyUnicode_CheckConsistency(copy, 1));
2112    return copy;
2113}
2114
2115
2116/* Widen Unicode objects to larger buffers. Don't write terminating null
2117   character. Return NULL on error. */
2118
2119void*
2120_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2121{
2122    Py_ssize_t len;
2123    void *result;
2124    unsigned int skind;
2125
2126    if (PyUnicode_READY(s) == -1)
2127        return NULL;
2128
2129    len = PyUnicode_GET_LENGTH(s);
2130    skind = PyUnicode_KIND(s);
2131    if (skind >= kind) {
2132        PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
2133        return NULL;
2134    }
2135    switch (kind) {
2136    case PyUnicode_2BYTE_KIND:
2137        result = PyMem_Malloc(len * sizeof(Py_UCS2));
2138        if (!result)
2139            return PyErr_NoMemory();
2140        assert(skind == PyUnicode_1BYTE_KIND);
2141        _PyUnicode_CONVERT_BYTES(
2142            Py_UCS1, Py_UCS2,
2143            PyUnicode_1BYTE_DATA(s),
2144            PyUnicode_1BYTE_DATA(s) + len,
2145            result);
2146        return result;
2147    case PyUnicode_4BYTE_KIND:
2148        result = PyMem_Malloc(len * sizeof(Py_UCS4));
2149        if (!result)
2150            return PyErr_NoMemory();
2151        if (skind == PyUnicode_2BYTE_KIND) {
2152            _PyUnicode_CONVERT_BYTES(
2153                Py_UCS2, Py_UCS4,
2154                PyUnicode_2BYTE_DATA(s),
2155                PyUnicode_2BYTE_DATA(s) + len,
2156                result);
2157        }
2158        else {
2159            assert(skind == PyUnicode_1BYTE_KIND);
2160            _PyUnicode_CONVERT_BYTES(
2161                Py_UCS1, Py_UCS4,
2162                PyUnicode_1BYTE_DATA(s),
2163                PyUnicode_1BYTE_DATA(s) + len,
2164                result);
2165        }
2166        return result;
2167    default:
2168        break;
2169    }
2170    PyErr_SetString(PyExc_SystemError, "invalid kind");
2171    return NULL;
2172}
2173
2174static Py_UCS4*
2175as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2176        int copy_null)
2177{
2178    int kind;
2179    void *data;
2180    Py_ssize_t len, targetlen;
2181    if (PyUnicode_READY(string) == -1)
2182        return NULL;
2183    kind = PyUnicode_KIND(string);
2184    data = PyUnicode_DATA(string);
2185    len = PyUnicode_GET_LENGTH(string);
2186    targetlen = len;
2187    if (copy_null)
2188        targetlen++;
2189    if (!target) {
2190        if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2191            PyErr_NoMemory();
2192            return NULL;
2193        }
2194        target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2195        if (!target) {
2196            PyErr_NoMemory();
2197            return NULL;
2198        }
2199    }
2200    else {
2201        if (targetsize < targetlen) {
2202            PyErr_Format(PyExc_SystemError,
2203                         "string is longer than the buffer");
2204            if (copy_null && 0 < targetsize)
2205                target[0] = 0;
2206            return NULL;
2207        }
2208    }
2209    if (kind == PyUnicode_1BYTE_KIND) {
2210        Py_UCS1 *start = (Py_UCS1 *) data;
2211        _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2212    }
2213    else if (kind == PyUnicode_2BYTE_KIND) {
2214        Py_UCS2 *start = (Py_UCS2 *) data;
2215        _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2216    }
2217    else {
2218        assert(kind == PyUnicode_4BYTE_KIND);
2219        Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
2220    }
2221    if (copy_null)
2222        target[len] = 0;
2223    return target;
2224}
2225
2226Py_UCS4*
2227PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2228                 int copy_null)
2229{
2230    if (target == NULL || targetsize < 0) {
2231        PyErr_BadInternalCall();
2232        return NULL;
2233    }
2234    return as_ucs4(string, target, targetsize, copy_null);
2235}
2236
2237Py_UCS4*
2238PyUnicode_AsUCS4Copy(PyObject *string)
2239{
2240    return as_ucs4(string, NULL, 0, 1);
2241}
2242
2243#ifdef HAVE_WCHAR_H
2244
2245PyObject *
2246PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
2247{
2248    if (w == NULL) {
2249        if (size == 0) {
2250            Py_INCREF(unicode_empty);
2251            return unicode_empty;
2252        }
2253        PyErr_BadInternalCall();
2254        return NULL;
2255    }
2256
2257    if (size == -1) {
2258        size = wcslen(w);
2259    }
2260
2261    return PyUnicode_FromUnicode(w, size);
2262}
2263
2264#endif /* HAVE_WCHAR_H */
2265
2266static void
2267makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2268        int zeropad, int width, int precision, char c)
2269{
2270    *fmt++ = '%';
2271    if (width) {
2272        if (zeropad)
2273            *fmt++ = '0';
2274        fmt += sprintf(fmt, "%d", width);
2275    }
2276    if (precision)
2277        fmt += sprintf(fmt, ".%d", precision);
2278    if (longflag)
2279        *fmt++ = 'l';
2280    else if (longlongflag) {
2281        /* longlongflag should only ever be nonzero on machines with
2282           HAVE_LONG_LONG defined */
2283#ifdef HAVE_LONG_LONG
2284        char *f = PY_FORMAT_LONG_LONG;
2285        while (*f)
2286            *fmt++ = *f++;
2287#else
2288        /* we shouldn't ever get here */
2289        assert(0);
2290        *fmt++ = 'l';
2291#endif
2292    }
2293    else if (size_tflag) {
2294        char *f = PY_FORMAT_SIZE_T;
2295        while (*f)
2296            *fmt++ = *f++;
2297    }
2298    *fmt++ = c;
2299    *fmt = '\0';
2300}
2301
/* Helper for PyUnicode_FromFormatV(): parse the part of a conversion
   specification that follows '%'.

   `f` must point at the '%'.  The optional width, ".precision" and length
   modifier ("l", "ll" when HAVE_LONG_LONG is defined, or "z", each only when
   directly followed by d, u or i) are consumed.  Each result is stored
   through the matching out-pointer unless that pointer is NULL.  The return
   value points at the conversion character, or one position earlier for the
   malformed inputs noted below. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';

    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;

    /* Handle %ld, %lu, %lld, %llu and the size_t flag (%zd, %zu, %zi). */
    switch (*f) {
    case 'l':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            size_tflag = 1;
            ++f;
        }
        break;
    default:
        break;
    }

    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
2366
/* Maximum number of characters required for output of %ld.  21 characters
   allows for 64-bit integers (in decimal) and an optional sign. */
#define MAX_LONG_CHARS 21
/* Maximum number of characters required for output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256).
   In integer arithmetic ceil(a/b) is (a-1)/b + 1, so the expression below
   is 1 (sign) + ceil(SIZEOF_LONG_LONG*53/22). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2374
2375PyObject *
2376PyUnicode_FromFormatV(const char *format, va_list vargs)
2377{
2378    va_list count;
2379    Py_ssize_t callcount = 0;
2380    PyObject **callresults = NULL;
2381    PyObject **callresult = NULL;
2382    Py_ssize_t n = 0;
2383    int width = 0;
2384    int precision = 0;
2385    int zeropad;
2386    const char* f;
2387    PyObject *string;
2388    /* used by sprintf */
2389    char fmt[61]; /* should be enough for %0width.precisionlld */
2390    Py_UCS4 maxchar = 127; /* result is ASCII by default */
2391    Py_UCS4 argmaxchar;
2392    Py_ssize_t numbersize = 0;
2393    char *numberresults = NULL;
2394    char *numberresult = NULL;
2395    Py_ssize_t i;
2396    int kind;
2397    void *data;
2398
2399    Py_VA_COPY(count, vargs);
2400    /* step 1: count the number of %S/%R/%A/%s format specifications
2401     * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2402     * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
2403     * result in an array)
2404     * also estimate a upper bound for all the number formats in the string,
2405     * numbers will be formatted in step 3 and be kept in a '\0'-separated
2406     * buffer before putting everything together. */
2407    for (f = format; *f; f++) {
2408        if (*f == '%') {
2409            int longlongflag;
2410            /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2411            f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2412            if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2413                ++callcount;
2414
2415            else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
2416#ifdef HAVE_LONG_LONG
2417                if (longlongflag) {
2418                    if (width < MAX_LONG_LONG_CHARS)
2419                        width = MAX_LONG_LONG_CHARS;
2420                }
2421                else
2422#endif
2423                    /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2424                       including sign.  Decimal takes the most space.  This
2425                       isn't enough for octal.  If a width is specified we
2426                       need more (which we allocate later). */
2427                    if (width < MAX_LONG_CHARS)
2428                        width = MAX_LONG_CHARS;
2429
2430                /* account for the size + '\0' to separate numbers
2431                   inside of the numberresults buffer */
2432                numbersize += (width + 1);
2433            }
2434        }
2435        else if ((unsigned char)*f > 127) {
2436            PyErr_Format(PyExc_ValueError,
2437                "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2438                "string, got a non-ASCII byte: 0x%02x",
2439                (unsigned char)*f);
2440            return NULL;
2441        }
2442    }
2443    /* step 2: allocate memory for the results of
2444     * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2445    if (callcount) {
2446        callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2447        if (!callresults) {
2448            PyErr_NoMemory();
2449            return NULL;
2450        }
2451        callresult = callresults;
2452    }
2453    /* step 2.5: allocate memory for the results of formating numbers */
2454    if (numbersize) {
2455        numberresults = PyObject_Malloc(numbersize);
2456        if (!numberresults) {
2457            PyErr_NoMemory();
2458            goto fail;
2459        }
2460        numberresult = numberresults;
2461    }
2462
2463    /* step 3: format numbers and figure out how large a buffer we need */
2464    for (f = format; *f; f++) {
2465        if (*f == '%') {
2466            const char* p;
2467            int longflag;
2468            int longlongflag;
2469            int size_tflag;
2470            int numprinted;
2471
2472            p = f;
2473            zeropad = (f[1] == '0');
2474            f = parse_format_flags(f, &width, &precision,
2475                                   &longflag, &longlongflag, &size_tflag);
2476            switch (*f) {
2477            case 'c':
2478            {
2479                Py_UCS4 ordinal = va_arg(count, int);
2480                maxchar = MAX_MAXCHAR(maxchar, ordinal);
2481                n++;
2482                break;
2483            }
2484            case '%':
2485                n++;
2486                break;
2487            case 'i':
2488            case 'd':
2489                makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2490                        width, precision, *f);
2491                if (longflag)
2492                    numprinted = sprintf(numberresult, fmt,
2493                                         va_arg(count, long));
2494#ifdef HAVE_LONG_LONG
2495                else if (longlongflag)
2496                    numprinted = sprintf(numberresult, fmt,
2497                                         va_arg(count, PY_LONG_LONG));
2498#endif
2499                else if (size_tflag)
2500                    numprinted = sprintf(numberresult, fmt,
2501                                         va_arg(count, Py_ssize_t));
2502                else
2503                    numprinted = sprintf(numberresult, fmt,
2504                                         va_arg(count, int));
2505                n += numprinted;
2506                /* advance by +1 to skip over the '\0' */
2507                numberresult += (numprinted + 1);
2508                assert(*(numberresult - 1) == '\0');
2509                assert(*(numberresult - 2) != '\0');
2510                assert(numprinted >= 0);
2511                assert(numberresult <= numberresults + numbersize);
2512                break;
2513            case 'u':
2514                makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2515                        width, precision, 'u');
2516                if (longflag)
2517                    numprinted = sprintf(numberresult, fmt,
2518                                         va_arg(count, unsigned long));
2519#ifdef HAVE_LONG_LONG
2520                else if (longlongflag)
2521                    numprinted = sprintf(numberresult, fmt,
2522                                         va_arg(count, unsigned PY_LONG_LONG));
2523#endif
2524                else if (size_tflag)
2525                    numprinted = sprintf(numberresult, fmt,
2526                                         va_arg(count, size_t));
2527                else
2528                    numprinted = sprintf(numberresult, fmt,
2529                                         va_arg(count, unsigned int));
2530                n += numprinted;
2531                numberresult += (numprinted + 1);
2532                assert(*(numberresult - 1) == '\0');
2533                assert(*(numberresult - 2) != '\0');
2534                assert(numprinted >= 0);
2535                assert(numberresult <= numberresults + numbersize);
2536                break;
2537            case 'x':
2538                makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2539                numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2540                n += numprinted;
2541                numberresult += (numprinted + 1);
2542                assert(*(numberresult - 1) == '\0');
2543                assert(*(numberresult - 2) != '\0');
2544                assert(numprinted >= 0);
2545                assert(numberresult <= numberresults + numbersize);
2546                break;
2547            case 'p':
2548                numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2549                /* %p is ill-defined:  ensure leading 0x. */
2550                if (numberresult[1] == 'X')
2551                    numberresult[1] = 'x';
2552                else if (numberresult[1] != 'x') {
2553                    memmove(numberresult + 2, numberresult,
2554                            strlen(numberresult) + 1);
2555                    numberresult[0] = '0';
2556                    numberresult[1] = 'x';
2557                    numprinted += 2;
2558                }
2559                n += numprinted;
2560                numberresult += (numprinted + 1);
2561                assert(*(numberresult - 1) == '\0');
2562                assert(*(numberresult - 2) != '\0');
2563                assert(numprinted >= 0);
2564                assert(numberresult <= numberresults + numbersize);
2565                break;
2566            case 's':
2567            {
2568                /* UTF-8 */
2569                const char *s = va_arg(count, const char*);
2570                PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
2571                if (!str)
2572                    goto fail;
2573                /* since PyUnicode_DecodeUTF8 returns already flexible
2574                   unicode objects, there is no need to call ready on them */
2575                argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
2576                maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
2577                n += PyUnicode_GET_LENGTH(str);
2578                /* Remember the str and switch to the next slot */
2579                *callresult++ = str;
2580                break;
2581            }
2582            case 'U':
2583            {
2584                PyObject *obj = va_arg(count, PyObject *);
2585                assert(obj && _PyUnicode_CHECK(obj));
2586                if (PyUnicode_READY(obj) == -1)
2587                    goto fail;
2588                argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
2589                maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
2590                n += PyUnicode_GET_LENGTH(obj);
2591                break;
2592            }
2593            case 'V':
2594            {
2595                PyObject *obj = va_arg(count, PyObject *);
2596                const char *str = va_arg(count, const char *);
2597                PyObject *str_obj;
2598                assert(obj || str);
2599                assert(!obj || _PyUnicode_CHECK(obj));
2600                if (obj) {
2601                    if (PyUnicode_READY(obj) == -1)
2602                        goto fail;
2603                    argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
2604                    maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
2605                    n += PyUnicode_GET_LENGTH(obj);
2606                    *callresult++ = NULL;
2607                }
2608                else {
2609                    str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
2610                    if (!str_obj)
2611                        goto fail;
2612                    if (PyUnicode_READY(str_obj) == -1) {
2613                        Py_DECREF(str_obj);
2614                        goto fail;
2615                    }
2616                    argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
2617                    maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
2618                    n += PyUnicode_GET_LENGTH(str_obj);
2619                    *callresult++ = str_obj;
2620                }
2621                break;
2622            }
2623            case 'S':
2624            {
2625                PyObject *obj = va_arg(count, PyObject *);
2626                PyObject *str;
2627                assert(obj);
2628                str = PyObject_Str(obj);
2629                if (!str)
2630                    goto fail;
2631                if (PyUnicode_READY(str) == -1) {
2632                    Py_DECREF(str);
2633                    goto fail;
2634                }
2635                argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
2636                maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
2637                n += PyUnicode_GET_LENGTH(str);
2638                /* Remember the str and switch to the next slot */
2639                *callresult++ = str;
2640                break;
2641            }
2642            case 'R':
2643            {
2644                PyObject *obj = va_arg(count, PyObject *);
2645                PyObject *repr;
2646                assert(obj);
2647                repr = PyObject_Repr(obj);
2648                if (!repr)
2649                    goto fail;
2650                if (PyUnicode_READY(repr) == -1) {
2651                    Py_DECREF(repr);
2652                    goto fail;
2653                }
2654                argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
2655                maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
2656                n += PyUnicode_GET_LENGTH(repr);
2657                /* Remember the repr and switch to the next slot */
2658                *callresult++ = repr;
2659                break;
2660            }
2661            case 'A':
2662            {
2663                PyObject *obj = va_arg(count, PyObject *);
2664                PyObject *ascii;
2665                assert(obj);
2666                ascii = PyObject_ASCII(obj);
2667                if (!ascii)
2668                    goto fail;
2669                if (PyUnicode_READY(ascii) == -1) {
2670                    Py_DECREF(ascii);
2671                    goto fail;
2672                }
2673                argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
2674                maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
2675                n += PyUnicode_GET_LENGTH(ascii);
2676                /* Remember the repr and switch to the next slot */
2677                *callresult++ = ascii;
2678                break;
2679            }
2680            default:
2681                /* if we stumble upon an unknown
2682                   formatting code, copy the rest of
2683                   the format string to the output
2684                   string. (we cannot just skip the
2685                   code, since there's no way to know
2686                   what's in the argument list) */
2687                n += strlen(p);
2688                goto expand;
2689            }
2690        } else
2691            n++;
2692    }
2693  expand:
2694    /* step 4: fill the buffer */
2695    /* Since we've analyzed how much space we need,
2696       we don't have to resize the string.
2697       There can be no errors beyond this point. */
2698    string = PyUnicode_New(n, maxchar);
2699    if (!string)
2700        goto fail;
2701    kind = PyUnicode_KIND(string);
2702    data = PyUnicode_DATA(string);
2703    callresult = callresults;
2704    numberresult = numberresults;
2705
2706    for (i = 0, f = format; *f; f++) {
2707        if (*f == '%') {
2708            const char* p;
2709
2710            p = f;
2711            f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2712            /* checking for == because the last argument could be a empty
2713               string, which causes i to point to end, the assert at the end of
2714               the loop */
2715            assert(i <= PyUnicode_GET_LENGTH(string));
2716
2717            switch (*f) {
2718            case 'c':
2719            {
2720                const int ordinal = va_arg(vargs, int);
2721                PyUnicode_WRITE(kind, data, i++, ordinal);
2722                break;
2723            }
2724            case 'i':
2725            case 'd':
2726            case 'u':
2727            case 'x':
2728            case 'p':
2729            {
2730                Py_ssize_t len;
2731                /* unused, since we already have the result */
2732                if (*f == 'p')
2733                    (void) va_arg(vargs, void *);
2734                else
2735                    (void) va_arg(vargs, int);
2736                /* extract the result from numberresults and append. */
2737                len = strlen(numberresult);
2738                unicode_write_cstr(string, i, numberresult, len);
2739                /* skip over the separating '\0' */
2740                i += len;
2741                numberresult += len;
2742                assert(*numberresult == '\0');
2743                numberresult++;
2744                assert(numberresult <= numberresults + numbersize);
2745                break;
2746            }
2747            case 's':
2748            {
2749                /* unused, since we already have the result */
2750                Py_ssize_t size;
2751                (void) va_arg(vargs, char *);
2752                size = PyUnicode_GET_LENGTH(*callresult);
2753                assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
2754                _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
2755                i += size;
2756                /* We're done with the unicode()/repr() => forget it */
2757                Py_DECREF(*callresult);
2758                /* switch to next unicode()/repr() result */
2759                ++callresult;
2760                break;
2761            }
2762            case 'U':
2763            {
2764                PyObject *obj = va_arg(vargs, PyObject *);
2765                Py_ssize_t size;
2766                assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2767                size = PyUnicode_GET_LENGTH(obj);
2768                _PyUnicode_FastCopyCharacters(string, i, obj, 0, size);
2769                i += size;
2770                break;
2771            }
2772            case 'V':
2773            {
2774                Py_ssize_t size;
2775                PyObject *obj = va_arg(vargs, PyObject *);
2776                va_arg(vargs, const char *);
2777                if (obj) {
2778                    size = PyUnicode_GET_LENGTH(obj);
2779                    assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2780                    _PyUnicode_FastCopyCharacters(string, i, obj, 0, size);
2781                    i += size;
2782                } else {
2783                    size = PyUnicode_GET_LENGTH(*callresult);
2784                    assert(PyUnicode_KIND(*callresult) <=
2785                           PyUnicode_KIND(string));
2786                    _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
2787                    i += size;
2788                    Py_DECREF(*callresult);
2789                }
2790                ++callresult;
2791                break;
2792            }
2793            case 'S':
2794            case 'R':
2795            case 'A':
2796            {
2797                Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
2798                /* unused, since we already have the result */
2799                (void) va_arg(vargs, PyObject *);
2800                assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
2801                _PyUnicode_FastCopyCharacters(string, i, *callresult, 0,  size);
2802                i += size;
2803                /* We're done with the unicode()/repr() => forget it */
2804                Py_DECREF(*callresult);
2805                /* switch to next unicode()/repr() result */
2806                ++callresult;
2807                break;
2808            }
2809            case '%':
2810                PyUnicode_WRITE(kind, data, i++, '%');
2811                break;
2812            default:
2813            {
2814                Py_ssize_t len = strlen(p);
2815                unicode_write_cstr(string, i, p, len);
2816                i += len;
2817                assert(i == PyUnicode_GET_LENGTH(string));
2818                goto end;
2819            }
2820            }
2821        }
2822        else {
2823            assert(i < PyUnicode_GET_LENGTH(string));
2824            PyUnicode_WRITE(kind, data, i++, *f);
2825        }
2826    }
2827    assert(i == PyUnicode_GET_LENGTH(string));
2828
2829  end:
2830    if (callresults)
2831        PyObject_Free(callresults);
2832    if (numberresults)
2833        PyObject_Free(numberresults);
2834    return unicode_result(string);
2835  fail:
2836    if (callresults) {
2837        PyObject **callresult2 = callresults;
2838        while (callresult2 < callresult) {
2839            Py_XDECREF(*callresult2);
2840            ++callresult2;
2841        }
2842        PyObject_Free(callresults);
2843    }
2844    if (numberresults)
2845        PyObject_Free(numberresults);
2846    return NULL;
2847}
2848
2849PyObject *
2850PyUnicode_FromFormat(const char *format, ...)
2851{
2852    PyObject* ret;
2853    va_list vargs;
2854
2855#ifdef HAVE_STDARG_PROTOTYPES
2856    va_start(vargs, format);
2857#else
2858    va_start(vargs);
2859#endif
2860    ret = PyUnicode_FromFormatV(format, vargs);
2861    va_end(vargs);
2862    return ret;
2863}
2864
2865#ifdef HAVE_WCHAR_H
2866
2867/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2868   convert a Unicode object to a wide character string.
2869
2870   - If w is NULL: return the number of wide characters (including the null
2871     character) required to convert the unicode object. Ignore size argument.
2872
2873   - Otherwise: return the number of wide characters (excluding the null
2874     character) written into w. Write at most size wide characters (including
2875     the null character). */
2876static Py_ssize_t
2877unicode_aswidechar(PyObject *unicode,
2878                   wchar_t *w,
2879                   Py_ssize_t size)
2880{
2881    Py_ssize_t res;
2882    const wchar_t *wstr;
2883
2884    wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
2885    if (wstr == NULL)
2886        return -1;
2887
2888    if (w != NULL) {
2889        if (size > res)
2890            size = res + 1;
2891        else
2892            res = size;
2893        Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
2894        return res;
2895    }
2896    else
2897        return res + 1;
2898}
2899
2900Py_ssize_t
2901PyUnicode_AsWideChar(PyObject *unicode,
2902                     wchar_t *w,
2903                     Py_ssize_t size)
2904{
2905    if (unicode == NULL) {
2906        PyErr_BadInternalCall();
2907        return -1;
2908    }
2909    return unicode_aswidechar(unicode, w, size);
2910}
2911
2912wchar_t*
2913PyUnicode_AsWideCharString(PyObject *unicode,
2914                           Py_ssize_t *size)
2915{
2916    wchar_t* buffer;
2917    Py_ssize_t buflen;
2918
2919    if (unicode == NULL) {
2920        PyErr_BadInternalCall();
2921        return NULL;
2922    }
2923
2924    buflen = unicode_aswidechar(unicode, NULL, 0);
2925    if (buflen == -1)
2926        return NULL;
2927    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
2928        PyErr_NoMemory();
2929        return NULL;
2930    }
2931
2932    buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2933    if (buffer == NULL) {
2934        PyErr_NoMemory();
2935        return NULL;
2936    }
2937    buflen = unicode_aswidechar(unicode, buffer, buflen);
2938    if (buflen == -1) {
2939        PyMem_FREE(buffer);
2940        return NULL;
2941    }
2942    if (size != NULL)
2943        *size = buflen;
2944    return buffer;
2945}
2946
2947#endif /* HAVE_WCHAR_H */
2948
2949PyObject *
2950PyUnicode_FromOrdinal(int ordinal)
2951{
2952    PyObject *v;
2953    if (ordinal < 0 || ordinal > MAX_UNICODE) {
2954        PyErr_SetString(PyExc_ValueError,
2955                        "chr() arg not in range(0x110000)");
2956        return NULL;
2957    }
2958
2959    if (ordinal < 256)
2960        return get_latin1_char(ordinal);
2961
2962    v = PyUnicode_New(1, ordinal);
2963    if (v == NULL)
2964        return NULL;
2965    PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2966    assert(_PyUnicode_CheckConsistency(v, 1));
2967    return v;
2968}
2969
2970PyObject *
2971PyUnicode_FromObject(register PyObject *obj)
2972{
2973    /* XXX Perhaps we should make this API an alias of
2974       PyObject_Str() instead ?! */
2975    if (PyUnicode_CheckExact(obj)) {
2976        if (PyUnicode_READY(obj) == -1)
2977            return NULL;
2978        Py_INCREF(obj);
2979        return obj;
2980    }
2981    if (PyUnicode_Check(obj)) {
2982        /* For a Unicode subtype that's not a Unicode object,
2983           return a true Unicode object with the same data. */
2984        return _PyUnicode_Copy(obj);
2985    }
2986    PyErr_Format(PyExc_TypeError,
2987                 "Can't convert '%.100s' object to str implicitly",
2988                 Py_TYPE(obj)->tp_name);
2989    return NULL;
2990}
2991
2992PyObject *
2993PyUnicode_FromEncodedObject(register PyObject *obj,
2994                            const char *encoding,
2995                            const char *errors)
2996{
2997    Py_buffer buffer;
2998    PyObject *v;
2999
3000    if (obj == NULL) {
3001        PyErr_BadInternalCall();
3002        return NULL;
3003    }
3004
3005    /* Decoding bytes objects is the most common case and should be fast */
3006    if (PyBytes_Check(obj)) {
3007        if (PyBytes_GET_SIZE(obj) == 0) {
3008            Py_INCREF(unicode_empty);
3009            v = unicode_empty;
3010        }
3011        else {
3012            v = PyUnicode_Decode(
3013                    PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3014                    encoding, errors);
3015        }
3016        return v;
3017    }
3018
3019    if (PyUnicode_Check(obj)) {
3020        PyErr_SetString(PyExc_TypeError,
3021                        "decoding str is not supported");
3022        return NULL;
3023    }
3024
3025    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3026    if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3027        PyErr_Format(PyExc_TypeError,
3028                     "coercing to str: need bytes, bytearray "
3029                     "or buffer-like object, %.80s found",
3030                     Py_TYPE(obj)->tp_name);
3031        return NULL;
3032    }
3033
3034    if (buffer.len == 0) {
3035        Py_INCREF(unicode_empty);
3036        v = unicode_empty;
3037    }
3038    else
3039        v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
3040
3041    PyBuffer_Release(&buffer);
3042    return v;
3043}
3044
/* Write a normalized copy of `encoding` into `lower` (capacity `lower_len`
   bytes): upper-case letters are lower-cased and '_' becomes '-', so that
   e.g. UTF_8 is recognized.  A NULL encoding is normalized to "utf-8".
   Return 1 on success, 0 if the name does not fit in lower_len-1
   characters. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst;
    char *limit;

    if (encoding == NULL) {
        strcpy(lower, "utf-8");
        return 1;
    }

    dst = lower;
    /* Last usable slot: it is reserved for the terminating NUL. */
    limit = lower + (lower_len - 1);
    for (src = encoding; *src != '\0'; src++) {
        if (dst == limit)
            return 0;
        if (Py_ISUPPER(*src))
            *dst = Py_TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
        dst++;
    }
    *dst = '\0';
    return 1;
}
3081
3082PyObject *
3083PyUnicode_Decode(const char *s,
3084                 Py_ssize_t size,
3085                 const char *encoding,
3086                 const char *errors)
3087{
3088    PyObject *buffer = NULL, *unicode;
3089    Py_buffer info;
3090    char lower[11];  /* Enough for any encoding shortcut */
3091
3092    /* Shortcuts for common default encodings */
3093    if (normalize_encoding(encoding, lower, sizeof(lower))) {
3094        if ((strcmp(lower, "utf-8") == 0) ||
3095            (strcmp(lower, "utf8") == 0))
3096            return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3097        else if ((strcmp(lower, "latin-1") == 0) ||
3098                 (strcmp(lower, "latin1") == 0) ||
3099                 (strcmp(lower, "iso-8859-1") == 0))
3100            return PyUnicode_DecodeLatin1(s, size, errors);
3101#ifdef HAVE_MBCS
3102        else if (strcmp(lower, "mbcs") == 0)
3103            return PyUnicode_DecodeMBCS(s, size, errors);
3104#endif
3105        else if (strcmp(lower, "ascii") == 0)
3106            return PyUnicode_DecodeASCII(s, size, errors);
3107        else if (strcmp(lower, "utf-16") == 0)
3108            return PyUnicode_DecodeUTF16(s, size, errors, 0);
3109        else if (strcmp(lower, "utf-32") == 0)
3110            return PyUnicode_DecodeUTF32(s, size, errors, 0);
3111    }
3112
3113    /* Decode via the codec registry */
3114    buffer = NULL;
3115    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3116        goto onError;
3117    buffer = PyMemoryView_FromBuffer(&info);
3118    if (buffer == NULL)
3119        goto onError;
3120    unicode = PyCodec_Decode(buffer, encoding, errors);
3121    if (unicode == NULL)
3122        goto onError;
3123    if (!PyUnicode_Check(unicode)) {
3124        PyErr_Format(PyExc_TypeError,
3125                     "decoder did not return a str object (type=%.400s)",
3126                     Py_TYPE(unicode)->tp_name);
3127        Py_DECREF(unicode);
3128        goto onError;
3129    }
3130    Py_DECREF(buffer);
3131    return unicode_result(unicode);
3132
3133  onError:
3134    Py_XDECREF(buffer);
3135    return NULL;
3136}
3137
3138PyObject *
3139PyUnicode_AsDecodedObject(PyObject *unicode,
3140                          const char *encoding,
3141                          const char *errors)
3142{
3143    PyObject *v;
3144
3145    if (!PyUnicode_Check(unicode)) {
3146        PyErr_BadArgument();
3147        goto onError;
3148    }
3149
3150    if (encoding == NULL)
3151        encoding = PyUnicode_GetDefaultEncoding();
3152
3153    /* Decode via the codec registry */
3154    v = PyCodec_Decode(unicode, encoding, errors);
3155    if (v == NULL)
3156        goto onError;
3157    return unicode_result(v);
3158
3159  onError:
3160    return NULL;
3161}
3162
3163PyObject *
3164PyUnicode_AsDecodedUnicode(PyObject *unicode,
3165                           const char *encoding,
3166                           const char *errors)
3167{
3168    PyObject *v;
3169
3170    if (!PyUnicode_Check(unicode)) {
3171        PyErr_BadArgument();
3172        goto onError;
3173    }
3174
3175    if (encoding == NULL)
3176        encoding = PyUnicode_GetDefaultEncoding();
3177
3178    /* Decode via the codec registry */
3179    v = PyCodec_Decode(unicode, encoding, errors);
3180    if (v == NULL)
3181        goto onError;
3182    if (!PyUnicode_Check(v)) {
3183        PyErr_Format(PyExc_TypeError,
3184                     "decoder did not return a str object (type=%.400s)",
3185                     Py_TYPE(v)->tp_name);
3186        Py_DECREF(v);
3187        goto onError;
3188    }
3189    return unicode_result(v);
3190
3191  onError:
3192    return NULL;
3193}
3194
3195PyObject *
3196PyUnicode_Encode(const Py_UNICODE *s,
3197                 Py_ssize_t size,
3198                 const char *encoding,
3199                 const char *errors)
3200{
3201    PyObject *v, *unicode;
3202
3203    unicode = PyUnicode_FromUnicode(s, size);
3204    if (unicode == NULL)
3205        return NULL;
3206    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3207    Py_DECREF(unicode);
3208    return v;
3209}
3210
3211PyObject *
3212PyUnicode_AsEncodedObject(PyObject *unicode,
3213                          const char *encoding,
3214                          const char *errors)
3215{
3216    PyObject *v;
3217
3218    if (!PyUnicode_Check(unicode)) {
3219        PyErr_BadArgument();
3220        goto onError;
3221    }
3222
3223    if (encoding == NULL)
3224        encoding = PyUnicode_GetDefaultEncoding();
3225
3226    /* Encode via the codec registry */
3227    v = PyCodec_Encode(unicode, encoding, errors);
3228    if (v == NULL)
3229        goto onError;
3230    return v;
3231
3232  onError:
3233    return NULL;
3234}
3235
/* Locate the first character of `wstr` that wcstombs() refuses to convert.
   The string is fed to wcstombs() one character at a time (a surrogate pair
   counts as one character when wchar_t is 2 bytes wide).  Returns the index
   of the offending character, or 0 if every character converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    const wchar_t *cursor = wstr;
    const wchar_t *current;
    char scratch[MB_LEN_MAX];
    /* One character plus its terminator; zero-filled so the terminator is
       always in place. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3] = {0, 0, 0};
#else
    wchar_t unit[2] = {0, 0};
#endif

    while (*cursor != L'\0') {
        current = cursor;
        unit[0] = *cursor++;
#if SIZEOF_WCHAR_T == 2
        /* Keep a high/low surrogate pair together. */
        if (Py_UNICODE_IS_HIGH_SURROGATE(unit[0])
            && Py_UNICODE_IS_LOW_SURROGATE(*cursor))
            unit[1] = *cursor++;
        else
            unit[1] = 0;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return current - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3282
3283static int
3284locale_error_handler(const char *errors, int *surrogateescape)
3285{
3286    if (errors == NULL) {
3287        *surrogateescape = 0;
3288        return 0;
3289    }
3290
3291    if (strcmp(errors, "strict") == 0) {
3292        *surrogateescape = 0;
3293        return 0;
3294    }
3295    if (strcmp(errors, "surrogateescape") == 0) {
3296        *surrogateescape = 1;
3297        return 0;
3298    }
3299    PyErr_Format(PyExc_ValueError,
3300                 "only 'strict' and 'surrogateescape' error handlers "
3301                 "are supported, not '%s'",
3302                 errors);
3303    return -1;
3304}
3305
/* Encode the str object `unicode` to bytes using the C library's
   wide-character to multibyte conversion.

   errors: NULL, "strict" or "surrogateescape" (validated by
           locale_error_handler(); anything else makes this function
           return NULL).

   Returns a new bytes object, or NULL on failure.  A string containing an
   embedded null character raises TypeError.  A conversion failure builds a
   UnicodeEncodeError for the "locale" codec and passes it to
   PyCodec_StrictErrors() (presumably to raise it -- confirm). */
PyObject *
PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
{
    Py_ssize_t wlen, wlen2;
    wchar_t *wstr;
    PyObject *bytes = NULL;
    char *errmsg;
    PyObject *reason;
    PyObject *exc;
    size_t error_pos;
    int surrogateescape;

    if (locale_error_handler(errors, &surrogateescape) < 0)
        return NULL;

    wstr = PyUnicode_AsWideCharString(unicode, &wlen);
    if (wstr == NULL)
        return NULL;

    /* wcslen() stops at the first null character: a mismatch with the
       reported length means the string contains an embedded null. */
    wlen2 = wcslen(wstr);
    if (wlen2 != wlen) {
        PyMem_Free(wstr);
        PyErr_SetString(PyExc_TypeError, "embedded null character");
        return NULL;
    }

    if (surrogateescape) {
        /* locale encoding with surrogateescape */
        char *str;

        str = _Py_wchar2char(wstr, &error_pos);
        if (str == NULL) {
            /* error_pos == (size_t)-1 is treated as out of memory; any
               other value is used as the position of the failure. */
            if (error_pos == (size_t)-1) {
                PyErr_NoMemory();
                PyMem_Free(wstr);
                return NULL;
            }
            else {
                goto encode_error;
            }
        }
        PyMem_Free(wstr);

        bytes = PyBytes_FromString(str);
        PyMem_Free(str);
    }
    else {
        size_t len, len2;

        /* First call only measures the output (NULL destination). */
        len = wcstombs(NULL, wstr, 0);
        if (len == (size_t)-1) {
            /* Position unknown: encode_error will search for it. */
            error_pos = (size_t)-1;
            goto encode_error;
        }

        bytes = PyBytes_FromStringAndSize(NULL, len);
        if (bytes == NULL) {
            PyMem_Free(wstr);
            return NULL;
        }

        /* Second call converts straight into the bytes object's buffer;
           a result larger than the measured length is treated as an
           error as well. */
        len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
        if (len2 == (size_t)-1 || len2 > len) {
            error_pos = (size_t)-1;
            goto encode_error;
        }
        PyMem_Free(wstr);
    }
    return bytes;

encode_error:
    /* Read errno first, before any of the calls below can change it.  On
       entry wstr is still allocated and bytes may or may not be set. */
    errmsg = strerror(errno);
    assert(errmsg != NULL);

    if (error_pos == (size_t)-1)
        error_pos = wcstombs_errorpos(wstr);

    PyMem_Free(wstr);
    Py_XDECREF(bytes);

    /* Turn the C error message into a str; if it cannot be converted, fall
       back to a fixed message below. */
    if (errmsg != NULL) {
        size_t errlen;
        wstr = _Py_char2wchar(errmsg, &errlen);
        if (wstr != NULL) {
            reason = PyUnicode_FromWideChar(wstr, errlen);
            PyMem_Free(wstr);
        } else
            errmsg = NULL;
    }
    if (errmsg == NULL)
        reason = PyUnicode_FromString(
            "wcstombs() encountered an unencodable "
            "wide character");
    if (reason == NULL)
        return NULL;

    /* UnicodeEncodeError(encoding, object, start, end, reason) covering the
       single character at error_pos. */
    exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
                                "locale", unicode,
                                (Py_ssize_t)error_pos,
                                (Py_ssize_t)(error_pos+1),
                                reason);
    Py_DECREF(reason);
    if (exc != NULL) {
        PyCodec_StrictErrors(exc);
        Py_XDECREF(exc);
    }
    return NULL;
}
3414
3415PyObject *
3416PyUnicode_EncodeFSDefault(PyObject *unicode)
3417{
3418#ifdef HAVE_MBCS
3419    return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
3420#elif defined(__APPLE__)
3421    return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
3422#else
3423    PyInterpreterState *interp = PyThreadState_GET()->interp;
3424    /* Bootstrap check: if the filesystem codec is implemented in Python, we
3425       cannot use it to encode and decode filenames before it is loaded. Load
3426       the Python codec requires to encode at least its own filename. Use the C
3427       version of the locale codec until the codec registry is initialized and
3428       the Python codec is loaded.
3429
3430       Py_FileSystemDefaultEncoding is shared between all interpreters, we
3431       cannot only rely on it: check also interp->fscodec_initialized for
3432       subinterpreters. */
3433    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3434        return PyUnicode_AsEncodedString(unicode,
3435                                         Py_FileSystemDefaultEncoding,
3436                                         "surrogateescape");
3437    }
3438    else {
3439        return PyUnicode_EncodeLocale(unicode, "surrogateescape");
3440    }
3441#endif
3442}
3443
3444PyObject *
3445PyUnicode_AsEncodedString(PyObject *unicode,
3446                          const char *encoding,
3447                          const char *errors)
3448{
3449    PyObject *v;
3450    char lower[11];  /* Enough for any encoding shortcut */
3451
3452    if (!PyUnicode_Check(unicode)) {
3453        PyErr_BadArgument();
3454        return NULL;
3455    }
3456
3457    /* Shortcuts for common default encodings */
3458    if (normalize_encoding(encoding, lower, sizeof(lower))) {
3459        if ((strcmp(lower, "utf-8") == 0) ||
3460            (strcmp(lower, "utf8") == 0))
3461        {
3462            if (errors == NULL || strcmp(errors, "strict") == 0)
3463                return _PyUnicode_AsUTF8String(unicode, NULL);
3464            else
3465                return _PyUnicode_AsUTF8String(unicode, errors);
3466        }
3467        else if ((strcmp(lower, "latin-1") == 0) ||
3468                 (strcmp(lower, "latin1") == 0) ||
3469                 (strcmp(lower, "iso-8859-1") == 0))
3470            return _PyUnicode_AsLatin1String(unicode, errors);
3471#ifdef HAVE_MBCS
3472        else if (strcmp(lower, "mbcs") == 0)
3473            return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3474#endif
3475        else if (strcmp(lower, "ascii") == 0)
3476            return _PyUnicode_AsASCIIString(unicode, errors);
3477    }
3478
3479    /* Encode via the codec registry */
3480    v = PyCodec_Encode(unicode, encoding, errors);
3481    if (v == NULL)
3482        return NULL;
3483
3484    /* The normal path */
3485    if (PyBytes_Check(v))
3486        return v;
3487
3488    /* If the codec returns a buffer, raise a warning and convert to bytes */
3489    if (PyByteArray_Check(v)) {
3490        int error;
3491        PyObject *b;
3492
3493        error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3494            "encoder %s returned bytearray instead of bytes",
3495            encoding);
3496        if (error) {
3497            Py_DECREF(v);
3498            return NULL;
3499        }
3500
3501        b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3502        Py_DECREF(v);
3503        return b;
3504    }
3505
3506    PyErr_Format(PyExc_TypeError,
3507                 "encoder did not return a bytes object (type=%.400s)",
3508                 Py_TYPE(v)->tp_name);
3509    Py_DECREF(v);
3510    return NULL;
3511}
3512
3513PyObject *
3514PyUnicode_AsEncodedUnicode(PyObject *unicode,
3515                           const char *encoding,
3516                           const char *errors)
3517{
3518    PyObject *v;
3519
3520    if (!PyUnicode_Check(unicode)) {
3521        PyErr_BadArgument();
3522        goto onError;
3523    }
3524
3525    if (encoding == NULL)
3526        encoding = PyUnicode_GetDefaultEncoding();
3527
3528    /* Encode via the codec registry */
3529    v = PyCodec_Encode(unicode, encoding, errors);
3530    if (v == NULL)
3531        goto onError;
3532    if (!PyUnicode_Check(v)) {
3533        PyErr_Format(PyExc_TypeError,
3534                     "encoder did not return an str object (type=%.400s)",
3535                     Py_TYPE(v)->tp_name);
3536        Py_DECREF(v);
3537        goto onError;
3538    }
3539    return v;
3540
3541  onError:
3542    return NULL;
3543}
3544
/* Locate the first byte sequence in the `len` bytes at `str` that mbrtowc()
   rejects or finds incomplete, and return its byte offset.  Returns 0 when
   no such sequence is found, and always when HAVE_MBRTOWC is not defined. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor;
    size_t remaining;
    size_t consumed;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    for (cursor = str, remaining = len;
         remaining != 0;
         cursor += consumed, remaining -= consumed)
    {
        consumed = mbrtowc(&wc, cursor, remaining, &state);
        if (consumed == 0)
            /* Reached end of string */
            break;
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
    }
#endif
    /* failed to find the undecodable byte sequence */
    return 0;
}
3575
3576PyObject*
3577PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3578                              const char *errors)
3579{
3580    wchar_t smallbuf[256];
3581    size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3582    wchar_t *wstr;
3583    size_t wlen, wlen2;
3584    PyObject *unicode;
3585    int surrogateescape;
3586    size_t error_pos;
3587    char *errmsg;
3588    PyObject *reason, *exc;
3589
3590    if (locale_error_handler(errors, &surrogateescape) < 0)
3591        return NULL;
3592
3593    if (str[len] != '\0' || len != strlen(str)) {
3594        PyErr_SetString(PyExc_TypeError, "embedded null character");
3595        return NULL;
3596    }
3597
3598    if (surrogateescape)
3599    {
3600        wstr = _Py_char2wchar(str, &wlen);
3601        if (wstr == NULL) {
3602            if (wlen == (size_t)-1)
3603                PyErr_NoMemory();
3604            else
3605                PyErr_SetFromErrno(PyExc_OSError);
3606            return NULL;
3607        }
3608
3609        unicode = PyUnicode_FromWideChar(wstr, wlen);
3610        PyMem_Free(wstr);
3611    }
3612    else {
3613#ifndef HAVE_BROKEN_MBSTOWCS
3614        wlen = mbstowcs(NULL, str, 0);
3615#else
3616        wlen = len;
3617#endif
3618        if (wlen == (size_t)-1)
3619            goto decode_error;
3620        if (wlen+1 <= smallbuf_len) {
3621            wstr = smallbuf;
3622        }
3623        else {
3624            if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3625                return PyErr_NoMemory();
3626
3627            wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3628            if (!wstr)
3629                return PyErr_NoMemory();
3630        }
3631
3632        /* This shouldn't fail now */
3633        wlen2 = mbstowcs(wstr, str, wlen+1);
3634        if (wlen2 == (size_t)-1) {
3635            if (wstr != smallbuf)
3636                PyMem_Free(wstr);
3637            goto decode_error;
3638        }
3639#ifdef HAVE_BROKEN_MBSTOWCS
3640        assert(wlen2 == wlen);
3641#endif
3642        unicode = PyUnicode_FromWideChar(wstr, wlen2);
3643        if (wstr != smallbuf)
3644            PyMem_Free(wstr);
3645    }
3646    return unicode;
3647
3648decode_error:
3649    errmsg = strerror(errno);
3650    assert(errmsg != NULL);
3651
3652    error_pos = mbstowcs_errorpos(str, len);
3653    if (errmsg != NULL) {
3654        size_t errlen;
3655        wstr = _Py_char2wchar(errmsg, &errlen);
3656        if (wstr != NULL) {
3657            reason = PyUnicode_FromWideChar(wstr, errlen);
3658            PyMem_Free(wstr);
3659        } else
3660            errmsg = NULL;
3661    }
3662    if (errmsg == NULL)
3663        reason = PyUnicode_FromString(
3664            "mbstowcs() encountered an invalid multibyte sequence");
3665    if (reason == NULL)
3666        return NULL;
3667
3668    exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3669                                "locale", str, len,
3670                                (Py_ssize_t)error_pos,
3671                                (Py_ssize_t)(error_pos+1),
3672                                reason);
3673    Py_DECREF(reason);
3674    if (exc != NULL) {
3675        PyCodec_StrictErrors(exc);
3676        Py_XDECREF(exc);
3677    }
3678    return NULL;
3679}
3680
3681PyObject*
3682PyUnicode_DecodeLocale(const char *str, const char *errors)
3683{
3684    Py_ssize_t size = (Py_ssize_t)strlen(str);
3685    return PyUnicode_DecodeLocaleAndSize(str, size, errors);
3686}
3687
3688
3689PyObject*
3690PyUnicode_DecodeFSDefault(const char *s) {
3691    Py_ssize_t size = (Py_ssize_t)strlen(s);
3692    return PyUnicode_DecodeFSDefaultAndSize(s, size);
3693}
3694
3695PyObject*
3696PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3697{
3698#ifdef HAVE_MBCS
3699    return PyUnicode_DecodeMBCS(s, size, NULL);
3700#elif defined(__APPLE__)
3701    return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
3702#else
3703    PyInterpreterState *interp = PyThreadState_GET()->interp;
3704    /* Bootstrap check: if the filesystem codec is implemented in Python, we
3705       cannot use it to encode and decode filenames before it is loaded. Load
3706       the Python codec requires to encode at least its own filename. Use the C
3707       version of the locale codec until the codec registry is initialized and
3708       the Python codec is loaded.
3709
3710       Py_FileSystemDefaultEncoding is shared between all interpreters, we
3711       cannot only rely on it: check also interp->fscodec_initialized for
3712       subinterpreters. */
3713    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3714        return PyUnicode_Decode(s, size,
3715                                Py_FileSystemDefaultEncoding,
3716                                "surrogateescape");
3717    }
3718    else {
3719        return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
3720    }
3721#endif
3722}
3723
3724
3725int
3726_PyUnicode_HasNULChars(PyObject* s)
3727{
3728    static PyObject *nul = NULL;
3729
3730    if (nul == NULL)
3731        nul = PyUnicode_FromStringAndSize("\0", 1);
3732    if (nul == NULL)
3733        return -1;
3734    return PyUnicode_Contains(s, nul);
3735}
3736
3737
3738int
3739PyUnicode_FSConverter(PyObject* arg, void* addr)
3740{
3741    PyObject *output = NULL;
3742    Py_ssize_t size;
3743    void *data;
3744    if (arg == NULL) {
3745        Py_DECREF(*(PyObject**)addr);
3746        return 1;
3747    }
3748    if (PyBytes_Check(arg)) {
3749        output = arg;
3750        Py_INCREF(output);
3751    }
3752    else {
3753        arg = PyUnicode_FromObject(arg);
3754        if (!arg)
3755            return 0;
3756        output = PyUnicode_EncodeFSDefault(arg);
3757        Py_DECREF(arg);
3758        if (!output)
3759            return 0;
3760        if (!PyBytes_Check(output)) {
3761            Py_DECREF(output);
3762            PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3763            return 0;
3764        }
3765    }
3766    size = PyBytes_GET_SIZE(output);
3767    data = PyBytes_AS_STRING(output);
3768    if (size != strlen(data)) {
3769        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3770        Py_DECREF(output);
3771        return 0;
3772    }
3773    *(PyObject**)addr = output;
3774    return Py_CLEANUP_SUPPORTED;
3775}
3776
3777
3778int
3779PyUnicode_FSDecoder(PyObject* arg, void* addr)
3780{
3781    PyObject *output = NULL;
3782    if (arg == NULL) {
3783        Py_DECREF(*(PyObject**)addr);
3784        return 1;
3785    }
3786    if (PyUnicode_Check(arg)) {
3787        if (PyUnicode_READY(arg) == -1)
3788            return 0;
3789        output = arg;
3790        Py_INCREF(output);
3791    }
3792    else {
3793        arg = PyBytes_FromObject(arg);
3794        if (!arg)
3795            return 0;
3796        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3797                                                  PyBytes_GET_SIZE(arg));
3798        Py_DECREF(arg);
3799        if (!output)
3800            return 0;
3801        if (!PyUnicode_Check(output)) {
3802            Py_DECREF(output);
3803            PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3804            return 0;
3805        }
3806    }
3807    if (PyUnicode_READY(output) == -1) {
3808        Py_DECREF(output);
3809        return 0;
3810    }
3811    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3812                 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
3813        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3814        Py_DECREF(output);
3815        return 0;
3816    }
3817    *(PyObject**)addr = output;
3818    return Py_CLEANUP_SUPPORTED;
3819}
3820
3821
3822char*
3823PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
3824{
3825    PyObject *bytes;
3826
3827    if (!PyUnicode_Check(unicode)) {
3828        PyErr_BadArgument();
3829        return NULL;
3830    }
3831    if (PyUnicode_READY(unicode) == -1)
3832        return NULL;
3833
3834    if (PyUnicode_UTF8(unicode) == NULL) {
3835        assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
3836        bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3837        if (bytes == NULL)
3838            return NULL;
3839        _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3840        if (_PyUnicode_UTF8(unicode) == NULL) {
3841            Py_DECREF(bytes);
3842            return NULL;
3843        }
3844        _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3845        Py_MEMCPY(_PyUnicode_UTF8(unicode),
3846                  PyBytes_AS_STRING(bytes),
3847                  _PyUnicode_UTF8_LENGTH(unicode) + 1);
3848        Py_DECREF(bytes);
3849    }
3850
3851    if (psize)
3852        *psize = PyUnicode_UTF8_LENGTH(unicode);
3853    return PyUnicode_UTF8(unicode);
3854}
3855
3856char*
3857PyUnicode_AsUTF8(PyObject *unicode)
3858{
3859    return PyUnicode_AsUTF8AndSize(unicode, NULL);
3860}
3861
3862Py_UNICODE *
3863PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3864{
3865    const unsigned char *one_byte;
3866#if SIZEOF_WCHAR_T == 4
3867    const Py_UCS2 *two_bytes;
3868#else
3869    const Py_UCS4 *four_bytes;
3870    const Py_UCS4 *ucs4_end;
3871    Py_ssize_t num_surrogates;
3872#endif
3873    wchar_t *w;
3874    wchar_t *wchar_end;
3875
3876    if (!PyUnicode_Check(unicode)) {
3877        PyErr_BadArgument();
3878        return NULL;
3879    }
3880    if (_PyUnicode_WSTR(unicode) == NULL) {
3881        /* Non-ASCII compact unicode object */
3882        assert(_PyUnicode_KIND(unicode) != 0);
3883        assert(PyUnicode_IS_READY(unicode));
3884
3885        if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3886#if SIZEOF_WCHAR_T == 2
3887            four_bytes = PyUnicode_4BYTE_DATA(unicode);
3888            ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
3889            num_surrogates = 0;
3890
3891            for (; four_bytes < ucs4_end; ++four_bytes) {
3892                if (*four_bytes > 0xFFFF)
3893                    ++num_surrogates;
3894            }
3895
3896            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3897                    sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3898            if (!_PyUnicode_WSTR(unicode)) {
3899                PyErr_NoMemory();
3900                return NULL;
3901            }
3902            _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
3903
3904            w = _PyUnicode_WSTR(unicode);
3905            wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3906            four_bytes = PyUnicode_4BYTE_DATA(unicode);
3907            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3908                if (*four_bytes > 0xFFFF) {
3909                    assert(*four_bytes <= MAX_UNICODE);
3910                    /* encode surrogate pair in this case */
3911                    *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3912                    *w   = Py_UNICODE_LOW_SURROGATE(*four_bytes);
3913                }
3914                else
3915                    *w = *four_bytes;
3916
3917                if (w > wchar_end) {
3918                    assert(0 && "Miscalculated string end");
3919                }
3920            }
3921            *w = 0;
3922#else
3923            /* sizeof(wchar_t) == 4 */
3924            Py_FatalError("Impossible unicode object state, wstr and str "
3925                          "should share memory already.");
3926            return NULL;
3927#endif
3928        }
3929        else {
3930            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3931                                                  (_PyUnicode_LENGTH(unicode) + 1));
3932            if (!_PyUnicode_WSTR(unicode)) {
3933                PyErr_NoMemory();
3934                return NULL;
3935            }
3936            if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3937                _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3938            w = _PyUnicode_WSTR(unicode);
3939            wchar_end = w + _PyUnicode_LENGTH(unicode);
3940
3941            if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3942                one_byte = PyUnicode_1BYTE_DATA(unicode);
3943                for (; w < wchar_end; ++one_byte, ++w)
3944                    *w = *one_byte;
3945                /* null-terminate the wstr */
3946                *w = 0;
3947            }
3948            else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
3949#if SIZEOF_WCHAR_T == 4
3950                two_bytes = PyUnicode_2BYTE_DATA(unicode);
3951                for (; w < wchar_end; ++two_bytes, ++w)
3952                    *w = *two_bytes;
3953                /* null-terminate the wstr */
3954                *w = 0;
3955#else
3956                /* sizeof(wchar_t) == 2 */
3957                PyObject_FREE(_PyUnicode_WSTR(unicode));
3958                _PyUnicode_WSTR(unicode) = NULL;
3959                Py_FatalError("Impossible unicode object state, wstr "
3960                              "and str should share memory already.");
3961                return NULL;
3962#endif
3963            }
3964            else {
3965                assert(0 && "This should never happen.");
3966            }
3967        }
3968    }
3969    if (size != NULL)
3970        *size = PyUnicode_WSTR_LENGTH(unicode);
3971    return _PyUnicode_WSTR(unicode);
3972}
3973
3974Py_UNICODE *
3975PyUnicode_AsUnicode(PyObject *unicode)
3976{
3977    return PyUnicode_AsUnicodeAndSize(unicode, NULL);
3978}
3979
3980
3981Py_ssize_t
3982PyUnicode_GetSize(PyObject *unicode)
3983{
3984    if (!PyUnicode_Check(unicode)) {
3985        PyErr_BadArgument();
3986        goto onError;
3987    }
3988    return PyUnicode_GET_SIZE(unicode);
3989
3990  onError:
3991    return -1;
3992}
3993
3994Py_ssize_t
3995PyUnicode_GetLength(PyObject *unicode)
3996{
3997    if (!PyUnicode_Check(unicode)) {
3998        PyErr_BadArgument();
3999        return -1;
4000    }
4001    if (PyUnicode_READY(unicode) == -1)
4002        return -1;
4003    return PyUnicode_GET_LENGTH(unicode);
4004}
4005
4006Py_UCS4
4007PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4008{
4009    if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4010        PyErr_BadArgument();
4011        return (Py_UCS4)-1;
4012    }
4013    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4014        PyErr_SetString(PyExc_IndexError, "string index out of range");
4015        return (Py_UCS4)-1;
4016    }
4017    return PyUnicode_READ_CHAR(unicode, index);
4018}
4019
4020int
4021PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4022{
4023    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4024        PyErr_BadArgument();
4025        return -1;
4026    }
4027    assert(PyUnicode_IS_READY(unicode));
4028    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4029        PyErr_SetString(PyExc_IndexError, "string index out of range");
4030        return -1;
4031    }
4032    if (unicode_check_modifiable(unicode))
4033        return -1;
4034    if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4035        PyErr_SetString(PyExc_ValueError, "character out of range");
4036        return -1;
4037    }
4038    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4039                    index, ch);
4040    return 0;
4041}
4042
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4048
4049/* create or adjust a UnicodeDecodeError */
4050static void
4051make_decode_exception(PyObject **exceptionObject,
4052                      const char *encoding,
4053                      const char *input, Py_ssize_t length,
4054                      Py_ssize_t startpos, Py_ssize_t endpos,
4055                      const char *reason)
4056{
4057    if (*exceptionObject == NULL) {
4058        *exceptionObject = PyUnicodeDecodeError_Create(
4059            encoding, input, length, startpos, endpos, reason);
4060    }
4061    else {
4062        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4063            goto onError;
4064        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4065            goto onError;
4066        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4067            goto onError;
4068    }
4069    return;
4070
4071onError:
4072    Py_DECREF(*exceptionObject);
4073    *exceptionObject = NULL;
4074}
4075
4076/* error handling callback helper:
4077   build arguments, call the callback and check the arguments,
4078   if no exception occurred, copy the replacement to the output
4079   and adjust various state variables.
4080   return 0 on success, -1 on error
4081*/
4082
4083static int
4084unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
4085                                 const char *encoding, const char *reason,
4086                                 const char **input, const char **inend, Py_ssize_t *startinpos,
4087                                 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4088                                 PyObject **output, Py_ssize_t *outpos)
4089{
4090    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4091
4092    PyObject *restuple = NULL;
4093    PyObject *repunicode = NULL;
4094    Py_ssize_t outsize;
4095    Py_ssize_t insize;
4096    Py_ssize_t requiredsize;
4097    Py_ssize_t newpos;
4098    PyObject *inputobj = NULL;
4099    int res = -1;
4100
4101    if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
4102        outsize = PyUnicode_GET_LENGTH(*output);
4103    else
4104        outsize = _PyUnicode_WSTR_LENGTH(*output);
4105
4106    if (*errorHandler == NULL) {
4107        *errorHandler = PyCodec_LookupError(errors);
4108        if (*errorHandler == NULL)
4109            goto onError;
4110    }
4111
4112    make_decode_exception(exceptionObject,
4113        encoding,
4114        *input, *inend - *input,
4115        *startinpos, *endinpos,
4116        reason);
4117    if (*exceptionObject == NULL)
4118        goto onError;
4119
4120    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4121    if (restuple == NULL)
4122        goto onError;
4123    if (!PyTuple_Check(restuple)) {
4124        PyErr_SetString(PyExc_TypeError, &argparse[4]);
4125        goto onError;
4126    }
4127    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
4128        goto onError;
4129    if (PyUnicode_READY(repunicode) == -1)
4130        goto onError;
4131
4132    /* Copy back the bytes variables, which might have been modified by the
4133       callback */
4134    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4135    if (!inputobj)
4136        goto onError;
4137    if (!PyBytes_Check(inputobj)) {
4138        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
4139    }
4140    *input = PyBytes_AS_STRING(inputobj);
4141    insize = PyBytes_GET_SIZE(inputobj);
4142    *inend = *input + insize;
4143    /* we can DECREF safely, as the exception has another reference,
4144       so the object won't go away. */
4145    Py_DECREF(inputobj);
4146
4147    if (newpos<0)
4148        newpos = insize+newpos;
4149    if (newpos<0 || newpos>insize) {
4150        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4151        goto onError;
4152    }
4153
4154    if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
4155        /* need more space? (at least enough for what we
4156           have+the replacement+the rest of the string (starting
4157           at the new input position), so we won't have to check space
4158           when there are no errors in the rest of the string) */
4159        Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
4160        requiredsize = *outpos + replen + insize-newpos;
4161        if (requiredsize > outsize) {
4162            if (requiredsize<2*outsize)
4163                requiredsize = 2*outsize;
4164            if (unicode_resize(output, requiredsize) < 0)
4165                goto onError;
4166        }
4167        if (unicode_widen(output, *outpos,
4168                          PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
4169            goto onError;
4170        _PyUnicode_FastCopyCharacters(*output, *outpos, repunicode, 0, replen);
4171        *outpos += replen;
4172    }
4173    else {
4174        wchar_t *repwstr;
4175        Py_ssize_t repwlen;
4176        repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4177        if (repwstr == NULL)
4178            goto onError;
4179        /* need more space? (at least enough for what we
4180           have+the replacement+the rest of the string (starting
4181           at the new input position), so we won't have to check space
4182           when there are no errors in the rest of the string) */
4183        requiredsize = *outpos + repwlen + insize-newpos;
4184        if (requiredsize > outsize) {
4185            if (requiredsize < 2*outsize)
4186                requiredsize = 2*outsize;
4187            if (unicode_resize(output, requiredsize) < 0)
4188                goto onError;
4189        }
4190        wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4191        *outpos += repwlen;
4192    }
4193    *endinpos = newpos;
4194    *inptr = *input + newpos;
4195
4196    /* we made it! */
4197    res = 0;
4198
4199  onError:
4200    Py_XDECREF(restuple);
4201    return res;
4202}
4203
4204/* --- UTF-7 Codec -------------------------------------------------------- */
4205
4206/* See RFC2152 for details.  We encode conservatively and decode liberally. */
4207
4208/* Three simple macros defining base-64. */
4209
/* True if c is one of the 64 base-64 characters:
   '+', '/', a digit or an ASCII letter.  Evaluates c several times. */

#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' ||       \
     ((c) >= '0' && (c) <= '9') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= 'A' && (c) <= 'Z'))
4217
/* Map a base-64 character c to its 6-bit value:
   'A'-'Z' -> 0-25, 'a'-'z' -> 26-51, '0'-'9' -> 52-61, '+' -> 62 and
   anything else (i.e. '/') -> 63.  c must already be known to be a
   base-64 character; it is evaluated several times. */

#define FROM_BASE64(c)                                                  \
    ((c) == '+' ? 62 :                                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     63)
4225
/* Base-64 character for the low 6 bits of n (higher bits are ignored). */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"   \
     "abcdefghijklmnopqrstuvwxyz"   \
     "0123456789+/"[(n) & 63])
4230
/* DECODE_DIRECT: true if byte c, found in a UTF-7 string outside a
 * base64 section, decodes as itself.  Decoding is permissive: every
 * ASCII byte qualifies except '+', which starts a base64 section. */

#define DECODE_DIRECT(c)                                \
    ((c) != '+' && (c) < 128)
4238
/* Classification of the 128 ASCII characters for the UTF-7 encoder
 * (see RFC2152).  utf7_category[c] is one of:
 *
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static char
utf7_category[128] = {
    /* 0x00-0x0f: control codes; ht (0x09), nl (0x0a), cr (0x0d) are whitespace */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    /* 0x10-0x1f: control codes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x20-0x2f: sp ! " # $ % & ' ( ) * + , - . / */
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,
    /* 0x30-0x3f: 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    /* 0x40-0x4f: @ A B C D E F G H I J K L M N O */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5f: P Q R S T U V W X Y Z [ \ ] ^ _ */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    /* 0x60-0x6f: ` a b c d e f g h i j k l m n o */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7f: p q r s t u v w x y z { | } ~ del */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3,
};
4272
/* ENCODE_DIRECT: true if character c should be encoded as itself.
 * Characters of category 0 (Set D) always are; category 1 (Set O) only
 * when directO is true, and category 2 (whitespace) only when directWS is
 * true.  NUL and anything at or above 128 never are.  RFC2152 makes it
 * clear that the treatment of Set O and whitespace varies between
 * applications, so this code needs to be flexible.
 *
 * All arguments are fully parenthesized, so compound expressions can be
 * passed for directO and directWS; c is evaluated several times. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
4284
4285PyObject *
4286PyUnicode_DecodeUTF7(const char *s,
4287                     Py_ssize_t size,
4288                     const char *errors)
4289{
4290    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4291}
4292
4293/* The decoder.  The only state we preserve is our read position,
4294 * i.e. how many characters we have consumed.  So if we end in the
4295 * middle of a shift sequence we have to back off the read position
4296 * and the output to the beginning of the sequence, otherwise we lose
4297 * all the shift state (seen bits, number of bits seen, high
4298 * surrogate). */
4299
4300PyObject *
4301PyUnicode_DecodeUTF7Stateful(const char *s,
4302                             Py_ssize_t size,
4303                             const char *errors,
4304                             Py_ssize_t *consumed)
4305{
4306    const char *starts = s;
4307    Py_ssize_t startinpos;
4308    Py_ssize_t endinpos;
4309    Py_ssize_t outpos;
4310    const char *e;
4311    PyObject *unicode;
4312    const char *errmsg = "";
4313    int inShift = 0;
4314    Py_ssize_t shiftOutStart;
4315    unsigned int base64bits = 0;
4316    unsigned long base64buffer = 0;
4317    Py_UCS4 surrogate = 0;
4318    PyObject *errorHandler = NULL;
4319    PyObject *exc = NULL;
4320
4321    /* Start off assuming it's all ASCII. Widen later as necessary. */
4322    unicode = PyUnicode_New(size, 127);
4323    if (!unicode)
4324        return NULL;
4325    if (size == 0) {
4326        if (consumed)
4327            *consumed = 0;
4328        return unicode;
4329    }
4330
4331    shiftOutStart = outpos = 0;
4332    e = s + size;
4333
4334    while (s < e) {
4335        Py_UCS4 ch;
4336      restart:
4337        ch = (unsigned char) *s;
4338
4339        if (inShift) { /* in a base-64 section */
4340            if (IS_BASE64(ch)) { /* consume a base-64 character */
4341                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4342                base64bits += 6;
4343                s++;
4344                if (base64bits >= 16) {
4345                    /* we have enough bits for a UTF-16 value */
4346                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
4347                    base64bits -= 16;
4348                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4349                    if (surrogate) {
4350                        /* expecting a second surrogate */
4351                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4352                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
4353                            if (unicode_putchar(&unicode, &outpos, ch2) < 0)
4354                                goto onError;
4355                            surrogate = 0;
4356                            continue;
4357                        }
4358                        else {
4359                            if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4360                                goto onError;
4361                            surrogate = 0;
4362                        }
4363                    }
4364                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
4365                        /* first surrogate */
4366                        surrogate = outCh;
4367                    }
4368                    else {
4369                        if (unicode_putchar(&unicode, &outpos, outCh) < 0)
4370                            goto onError;
4371                    }
4372                }
4373            }
4374            else { /* now leaving a base-64 section */
4375                inShift = 0;
4376                s++;
4377                if (surrogate) {
4378                    if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4379                        goto onError;
4380                    surrogate = 0;
4381                }
4382                if (base64bits > 0) { /* left-over bits */
4383                    if (base64bits >= 6) {
4384                        /* We've seen at least one base-64 character */
4385                        errmsg = "partial character in shift sequence";
4386                        goto utf7Error;
4387                    }
4388                    else {
4389                        /* Some bits remain; they should be zero */
4390                        if (base64buffer != 0) {
4391                            errmsg = "non-zero padding bits in shift sequence";
4392                            goto utf7Error;
4393                        }
4394                    }
4395                }
4396                if (ch != '-') {
4397                    /* '-' is absorbed; other terminating
4398                       characters are preserved */
4399                    if (unicode_putchar(&unicode, &outpos, ch) < 0)
4400                        goto onError;
4401                }
4402            }
4403        }
4404        else if ( ch == '+' ) {
4405            startinpos = s-starts;
4406            s++; /* consume '+' */
4407            if (s < e && *s == '-') { /* '+-' encodes '+' */
4408                s++;
4409                if (unicode_putchar(&unicode, &outpos, '+') < 0)
4410                    goto onError;
4411            }
4412            else { /* begin base64-encoded section */
4413                inShift = 1;
4414                shiftOutStart = outpos;
4415                base64bits = 0;
4416            }
4417        }
4418        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
4419            if (unicode_putchar(&unicode, &outpos, ch) < 0)
4420                goto onError;
4421            s++;
4422        }
4423        else {
4424            startinpos = s-starts;
4425            s++;
4426            errmsg = "unexpected special character";
4427            goto utf7Error;
4428        }
4429        continue;
4430utf7Error:
4431        endinpos = s-starts;
4432        if (unicode_decode_call_errorhandler(
4433                errors, &errorHandler,
4434                "utf7", errmsg,
4435                &starts, &e, &startinpos, &endinpos, &exc, &s,
4436                &unicode, &outpos))
4437            goto onError;
4438    }
4439
4440    /* end of string */
4441
4442    if (inShift && !consumed) { /* in shift sequence, no more to follow */
4443        /* if we're in an inconsistent state, that's an error */
4444        if (surrogate ||
4445                (base64bits >= 6) ||
4446                (base64bits > 0 && base64buffer != 0)) {
4447            endinpos = size;
4448            if (unicode_decode_call_errorhandler(
4449                    errors, &errorHandler,
4450                    "utf7", "unterminated shift sequence",
4451                    &starts, &e, &startinpos, &endinpos, &exc, &s,
4452                    &unicode, &outpos))
4453                goto onError;
4454            if (s < e)
4455                goto restart;
4456        }
4457    }
4458
4459    /* return state */
4460    if (consumed) {
4461        if (inShift) {
4462            outpos = shiftOutStart; /* back off output */
4463            *consumed = startinpos;
4464        }
4465        else {
4466            *consumed = s-starts;
4467        }
4468    }
4469
4470    if (unicode_resize(&unicode, outpos) < 0)
4471        goto onError;
4472
4473    Py_XDECREF(errorHandler);
4474    Py_XDECREF(exc);
4475    return unicode_result(unicode);
4476
4477  onError:
4478    Py_XDECREF(errorHandler);
4479    Py_XDECREF(exc);
4480    Py_DECREF(unicode);
4481    return NULL;
4482}
4483
4484
4485PyObject *
4486_PyUnicode_EncodeUTF7(PyObject *str,
4487                      int base64SetO,
4488                      int base64WhiteSpace,
4489                      const char *errors)
4490{
4491    int kind;
4492    void *data;
4493    Py_ssize_t len;
4494    PyObject *v;
4495    int inShift = 0;
4496    Py_ssize_t i;
4497    unsigned int base64bits = 0;
4498    unsigned long base64buffer = 0;
4499    char * out;
4500    char * start;
4501
4502    if (PyUnicode_READY(str) == -1)
4503        return NULL;
4504    kind = PyUnicode_KIND(str);
4505    data = PyUnicode_DATA(str);
4506    len = PyUnicode_GET_LENGTH(str);
4507
4508    if (len == 0)
4509        return PyBytes_FromStringAndSize(NULL, 0);
4510
4511    /* It might be possible to tighten this worst case */
4512    if (len > PY_SSIZE_T_MAX / 8)
4513        return PyErr_NoMemory();
4514    v = PyBytes_FromStringAndSize(NULL, len * 8);
4515    if (v == NULL)
4516        return NULL;
4517
4518    start = out = PyBytes_AS_STRING(v);
4519    for (i = 0; i < len; ++i) {
4520        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
4521
4522        if (inShift) {
4523            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4524                /* shifting out */
4525                if (base64bits) { /* output remaining bits */
4526                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
4527                    base64buffer = 0;
4528                    base64bits = 0;
4529                }
4530                inShift = 0;
4531                /* Characters not in the BASE64 set implicitly unshift the sequence
4532                   so no '-' is required, except if the character is itself a '-' */
4533                if (IS_BASE64(ch) || ch == '-') {
4534                    *out++ = '-';
4535                }
4536                *out++ = (char) ch;
4537            }
4538            else {
4539                goto encode_char;
4540            }
4541        }
4542        else { /* not in a shift sequence */
4543            if (ch == '+') {
4544                *out++ = '+';
4545                        *out++ = '-';
4546            }
4547            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4548                *out++ = (char) ch;
4549            }
4550            else {
4551                *out++ = '+';
4552                inShift = 1;
4553                goto encode_char;
4554            }
4555        }
4556        continue;
4557encode_char:
4558        if (ch >= 0x10000) {
4559            assert(ch <= MAX_UNICODE);
4560
4561            /* code first surrogate */
4562            base64bits += 16;
4563            base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4564            while (base64bits >= 6) {
4565                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4566                base64bits -= 6;
4567            }
4568            /* prepare second surrogate */
4569            ch = Py_UNICODE_LOW_SURROGATE(ch);
4570        }
4571        base64bits += 16;
4572        base64buffer = (base64buffer << 16) | ch;
4573        while (base64bits >= 6) {
4574            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4575            base64bits -= 6;
4576        }
4577    }
4578    if (base64bits)
4579        *out++= TO_BASE64(base64buffer << (6-base64bits) );
4580    if (inShift)
4581        *out++ = '-';
4582    if (_PyBytes_Resize(&v, out - start) < 0)
4583        return NULL;
4584    return v;
4585}
4586PyObject *
4587PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4588                     Py_ssize_t size,
4589                     int base64SetO,
4590                     int base64WhiteSpace,
4591                     const char *errors)
4592{
4593    PyObject *result;
4594    PyObject *tmp = PyUnicode_FromUnicode(s, size);
4595    if (tmp == NULL)
4596        return NULL;
4597    result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
4598                                   base64WhiteSpace, errors);
4599    Py_DECREF(tmp);
4600    return result;
4601}
4602
4603#undef IS_BASE64
4604#undef FROM_BASE64
4605#undef TO_BASE64
4606#undef DECODE_DIRECT
4607#undef ENCODE_DIRECT
4608
4609/* --- UTF-8 Codec -------------------------------------------------------- */
4610
4611PyObject *
4612PyUnicode_DecodeUTF8(const char *s,
4613                     Py_ssize_t size,
4614                     const char *errors)
4615{
4616    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4617}
4618
4619#include "stringlib/asciilib.h"
4620#include "stringlib/codecs.h"
4621#include "stringlib/undef.h"
4622
4623#include "stringlib/ucs1lib.h"
4624#include "stringlib/codecs.h"
4625#include "stringlib/undef.h"
4626
4627#include "stringlib/ucs2lib.h"
4628#include "stringlib/codecs.h"
4629#include "stringlib/undef.h"
4630
4631#include "stringlib/ucs4lib.h"
4632#include "stringlib/codecs.h"
4633#include "stringlib/undef.h"
4634
4635/* Mask to quickly check whether a C 'long' contains a
4636   non-ASCII, UTF8-encoded char. */
4637#if (SIZEOF_LONG == 8)
4638# define ASCII_CHAR_MASK 0x8080808080808080UL
4639#elif (SIZEOF_LONG == 4)
4640# define ASCII_CHAR_MASK 0x80808080UL
4641#else
4642# error C 'long' size should be either 4 or 8!
4643#endif
4644
4645static Py_ssize_t
4646ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
4647{
4648    const char *p = start;
4649    const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
4650
4651#if SIZEOF_LONG <= SIZEOF_VOID_P
4652    assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4653    if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
4654        /* Fast path, see in STRINGLIB(utf8_decode) for
4655           an explanation. */
4656        /* Help register allocation */
4657        register const char *_p = p;
4658        register Py_UCS1 * q = dest;
4659        while (_p < aligned_end) {
4660            unsigned long value = *(const unsigned long *) _p;
4661            if (value & ASCII_CHAR_MASK)
4662                break;
4663            *((unsigned long *)q) = value;
4664            _p += SIZEOF_LONG;
4665            q += SIZEOF_LONG;
4666        }
4667        p = _p;
4668        while (p < end) {
4669            if ((unsigned char)*p & 0x80)
4670                break;
4671            *q++ = *p++;
4672        }
4673        return p - start;
4674    }
4675#endif
4676    while (p < end) {
4677        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4678           for an explanation. */
4679        if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
4680            /* Help register allocation */
4681            register const char *_p = p;
4682            while (_p < aligned_end) {
4683                unsigned long value = *(unsigned long *) _p;
4684                if (value & ASCII_CHAR_MASK)
4685                    break;
4686                _p += SIZEOF_LONG;
4687            }
4688            p = _p;
4689            if (_p == end)
4690                break;
4691        }
4692        if ((unsigned char)*p & 0x80)
4693            break;
4694        ++p;
4695    }
4696    memcpy(dest, start, p - start);
4697    return p - start;
4698}
4699
4700PyObject *
4701PyUnicode_DecodeUTF8Stateful(const char *s,
4702                             Py_ssize_t size,
4703                             const char *errors,
4704                             Py_ssize_t *consumed)
4705{
4706    PyObject *unicode;
4707    const char *starts = s;
4708    const char *end = s + size;
4709    Py_ssize_t outpos;
4710
4711    Py_ssize_t startinpos;
4712    Py_ssize_t endinpos;
4713    const char *errmsg = "";
4714    PyObject *errorHandler = NULL;
4715    PyObject *exc = NULL;
4716
4717    if (size == 0) {
4718        if (consumed)
4719            *consumed = 0;
4720        Py_INCREF(unicode_empty);
4721        return unicode_empty;
4722    }
4723
4724    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4725    if (size == 1 && (unsigned char)s[0] < 128) {
4726        if (consumed)
4727            *consumed = 1;
4728        return get_latin1_char((unsigned char)s[0]);
4729    }
4730
4731    unicode = PyUnicode_New(size, 127);
4732    if (!unicode)
4733        return NULL;
4734
4735    outpos = ascii_decode(s, end, PyUnicode_1BYTE_DATA(unicode));
4736    s += outpos;
4737    while (s < end) {
4738        Py_UCS4 ch;
4739        int kind = PyUnicode_KIND(unicode);
4740        if (kind == PyUnicode_1BYTE_KIND) {
4741            if (PyUnicode_IS_ASCII(unicode))
4742                ch = asciilib_utf8_decode(&s, end,
4743                        PyUnicode_1BYTE_DATA(unicode), &outpos);
4744            else
4745                ch = ucs1lib_utf8_decode(&s, end,
4746                        PyUnicode_1BYTE_DATA(unicode), &outpos);
4747        } else if (kind == PyUnicode_2BYTE_KIND) {
4748            ch = ucs2lib_utf8_decode(&s, end,
4749                    PyUnicode_2BYTE_DATA(unicode), &outpos);
4750        } else {
4751            assert(kind == PyUnicode_4BYTE_KIND);
4752            ch = ucs4lib_utf8_decode(&s, end,
4753                    PyUnicode_4BYTE_DATA(unicode), &outpos);
4754        }
4755
4756        switch (ch) {
4757        case 0:
4758            if (s == end || consumed)
4759                goto End;
4760            errmsg = "unexpected end of data";
4761            startinpos = s - starts;
4762            endinpos = end - starts;
4763            break;
4764        case 1:
4765            errmsg = "invalid start byte";
4766            startinpos = s - starts;
4767            endinpos = startinpos + 1;
4768            break;
4769        case 2:
4770        case 3:
4771        case 4:
4772            errmsg = "invalid continuation byte";
4773            startinpos = s - starts;
4774            endinpos = startinpos + ch - 1;
4775            break;
4776        default:
4777            if (unicode_putchar(&unicode, &outpos, ch) < 0)
4778                goto onError;
4779            continue;
4780        }
4781
4782        if (unicode_decode_call_errorhandler(
4783                errors, &errorHandler,
4784                "utf-8", errmsg,
4785                &starts, &end, &startinpos, &endinpos, &exc, &s,
4786                &unicode, &outpos))
4787            goto onError;
4788    }
4789
4790End:
4791    if (unicode_resize(&unicode, outpos) < 0)
4792        goto onError;
4793
4794    if (consumed)
4795        *consumed = s - starts;
4796
4797    Py_XDECREF(errorHandler);
4798    Py_XDECREF(exc);
4799    assert(_PyUnicode_CheckConsistency(unicode, 1));
4800    return unicode;
4801
4802onError:
4803    Py_XDECREF(errorHandler);
4804    Py_XDECREF(exc);
4805    Py_XDECREF(unicode);
4806    return NULL;
4807}
4808
4809#ifdef __APPLE__
4810
4811/* Simplified UTF-8 decoder using surrogateescape error handler,
4812   used to decode the command line arguments on Mac OS X. */
4813
4814wchar_t*
4815_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4816{
4817    const char *e;
4818    wchar_t *unicode;
4819    Py_ssize_t outpos;
4820
4821    /* Note: size will always be longer than the resulting Unicode
4822       character count */
4823    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4824        PyErr_NoMemory();
4825        return NULL;
4826    }
4827    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4828    if (!unicode)
4829        return NULL;
4830
4831    /* Unpack UTF-8 encoded data */
4832    e = s + size;
4833    outpos = 0;
4834    while (s < e) {
4835        Py_UCS4 ch;
4836#if SIZEOF_WCHAR_T == 4
4837        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
4838#else
4839        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
4840#endif
4841        if (ch > 0xFF) {
4842#if SIZEOF_WCHAR_T == 4
4843            assert(0);
4844#else
4845            assert(Py_UNICODE_IS_SURROGATE(ch));
4846            /*  compute and append the two surrogates: */
4847            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4848            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
4849#endif
4850        }
4851        else {
4852            if (!ch && s == e)
4853                break;
4854            /* surrogateescape */
4855            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
4856        }
4857    }
4858    unicode[outpos] = L'\0';
4859    return unicode;
4860}
4861
4862#endif /* __APPLE__ */
4863
4864/* Primary internal function which creates utf8 encoded bytes objects.
4865
4866   Allocation strategy:  if the string is short, convert into a stack buffer
4867   and allocate exactly as much space needed at the end.  Else allocate the
4868   maximum possible needed (4 result bytes per Unicode character), and return
4869   the excess memory at the end.
4870*/
4871PyObject *
4872_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
4873{
4874    enum PyUnicode_Kind kind;
4875    void *data;
4876    Py_ssize_t size;
4877
4878    if (!PyUnicode_Check(unicode)) {
4879        PyErr_BadArgument();
4880        return NULL;
4881    }
4882
4883    if (PyUnicode_READY(unicode) == -1)
4884        return NULL;
4885
4886    if (PyUnicode_UTF8(unicode))
4887        return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4888                                         PyUnicode_UTF8_LENGTH(unicode));
4889
4890    kind = PyUnicode_KIND(unicode);
4891    data = PyUnicode_DATA(unicode);
4892    size = PyUnicode_GET_LENGTH(unicode);
4893
4894    switch (kind) {
4895    default:
4896        assert(0);
4897    case PyUnicode_1BYTE_KIND:
4898        /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4899        assert(!PyUnicode_IS_ASCII(unicode));
4900        return ucs1lib_utf8_encoder(unicode, data, size, errors);
4901    case PyUnicode_2BYTE_KIND:
4902        return ucs2lib_utf8_encoder(unicode, data, size, errors);
4903    case PyUnicode_4BYTE_KIND:
4904        return ucs4lib_utf8_encoder(unicode, data, size, errors);
4905    }
4906}
4907
4908PyObject *
4909PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4910                     Py_ssize_t size,
4911                     const char *errors)
4912{
4913    PyObject *v, *unicode;
4914
4915    unicode = PyUnicode_FromUnicode(s, size);
4916    if (unicode == NULL)
4917        return NULL;
4918    v = _PyUnicode_AsUTF8String(unicode, errors);
4919    Py_DECREF(unicode);
4920    return v;
4921}
4922
4923PyObject *
4924PyUnicode_AsUTF8String(PyObject *unicode)
4925{
4926    return _PyUnicode_AsUTF8String(unicode, NULL);
4927}
4928
4929/* --- UTF-32 Codec ------------------------------------------------------- */
4930
4931PyObject *
4932PyUnicode_DecodeUTF32(const char *s,
4933                      Py_ssize_t size,
4934                      const char *errors,
4935                      int *byteorder)
4936{
4937    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4938}
4939
4940PyObject *
4941PyUnicode_DecodeUTF32Stateful(const char *s,
4942                              Py_ssize_t size,
4943                              const char *errors,
4944                              int *byteorder,
4945                              Py_ssize_t *consumed)
4946{
4947    const char *starts = s;
4948    Py_ssize_t startinpos;
4949    Py_ssize_t endinpos;
4950    Py_ssize_t outpos;
4951    PyObject *unicode;
4952    const unsigned char *q, *e;
4953    int bo = 0;       /* assume native ordering by default */
4954    const char *errmsg = "";
4955    /* Offsets from q for retrieving bytes in the right order. */
4956#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4957    int iorder[] = {0, 1, 2, 3};
4958#else
4959    int iorder[] = {3, 2, 1, 0};
4960#endif
4961    PyObject *errorHandler = NULL;
4962    PyObject *exc = NULL;
4963
4964    q = (unsigned char *)s;
4965    e = q + size;
4966
4967    if (byteorder)
4968        bo = *byteorder;
4969
4970    /* Check for BOM marks (U+FEFF) in the input and adjust current
4971       byte order setting accordingly. In native mode, the leading BOM
4972       mark is skipped, in all other modes, it is copied to the output
4973       stream as-is (giving a ZWNBSP character). */
4974    if (bo == 0) {
4975        if (size >= 4) {
4976            const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4977                (q[iorder[1]] << 8) | q[iorder[0]];
4978#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4979            if (bom == 0x0000FEFF) {
4980                q += 4;
4981                bo = -1;
4982            }
4983            else if (bom == 0xFFFE0000) {
4984                q += 4;
4985                bo = 1;
4986            }
4987#else
4988            if (bom == 0x0000FEFF) {
4989                q += 4;
4990                bo = 1;
4991            }
4992            else if (bom == 0xFFFE0000) {
4993                q += 4;
4994                bo = -1;
4995            }
4996#endif
4997        }
4998    }
4999
5000    if (bo == -1) {
5001        /* force LE */
5002        iorder[0] = 0;
5003        iorder[1] = 1;
5004        iorder[2] = 2;
5005        iorder[3] = 3;
5006    }
5007    else if (bo == 1) {
5008        /* force BE */
5009        iorder[0] = 3;
5010        iorder[1] = 2;
5011        iorder[2] = 1;
5012        iorder[3] = 0;
5013    }
5014
5015    /* This might be one to much, because of a BOM */
5016    unicode = PyUnicode_New((size+3)/4, 127);
5017    if (!unicode)
5018        return NULL;
5019    if (size == 0)
5020        return unicode;
5021    outpos = 0;
5022
5023    while (q < e) {
5024        Py_UCS4 ch;
5025        /* remaining bytes at the end? (size should be divisible by 4) */
5026        if (e-q<4) {
5027            if (consumed)
5028                break;
5029            errmsg = "truncated data";
5030            startinpos = ((const char *)q)-starts;
5031            endinpos = ((const char *)e)-starts;
5032            goto utf32Error;
5033            /* The remaining input chars are ignored if the callback
5034               chooses to skip the input */
5035        }
5036        ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5037            (q[iorder[1]] << 8) | q[iorder[0]];
5038
5039        if (ch >= 0x110000)
5040        {
5041            errmsg = "codepoint not in range(0x110000)";
5042            startinpos = ((const char *)q)-starts;
5043            endinpos = startinpos+4;
5044            goto utf32Error;
5045        }
5046        if (unicode_putchar(&unicode, &outpos, ch) < 0)
5047            goto onError;
5048        q += 4;
5049        continue;
5050      utf32Error:
5051        if (unicode_decode_call_errorhandler(
5052                errors, &errorHandler,
5053                "utf32", errmsg,
5054                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5055                &unicode, &outpos))
5056            goto onError;
5057    }
5058
5059    if (byteorder)
5060        *byteorder = bo;
5061
5062    if (consumed)
5063        *consumed = (const char *)q-starts;
5064
5065    /* Adjust length */
5066    if (unicode_resize(&unicode, outpos) < 0)
5067        goto onError;
5068
5069    Py_XDECREF(errorHandler);
5070    Py_XDECREF(exc);
5071    return unicode_result(unicode);
5072
5073  onError:
5074    Py_DECREF(unicode);
5075    Py_XDECREF(errorHandler);
5076    Py_XDECREF(exc);
5077    return NULL;
5078}
5079
5080PyObject *
5081_PyUnicode_EncodeUTF32(PyObject *str,
5082                       const char *errors,
5083                       int byteorder)
5084{
5085    int kind;
5086    void *data;
5087    Py_ssize_t len;
5088    PyObject *v;
5089    unsigned char *p;
5090    Py_ssize_t nsize, i;
5091    /* Offsets from p for storing byte pairs in the right order. */
5092#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5093    int iorder[] = {0, 1, 2, 3};
5094#else
5095    int iorder[] = {3, 2, 1, 0};
5096#endif
5097
5098#define STORECHAR(CH)                           \
5099    do {                                        \
5100        p[iorder[3]] = ((CH) >> 24) & 0xff;     \
5101        p[iorder[2]] = ((CH) >> 16) & 0xff;     \
5102        p[iorder[1]] = ((CH) >> 8) & 0xff;      \
5103        p[iorder[0]] = (CH) & 0xff;             \
5104        p += 4;                                 \
5105    } while(0)
5106
5107    if (!PyUnicode_Check(str)) {
5108        PyErr_BadArgument();
5109        return NULL;
5110    }
5111    if (PyUnicode_READY(str) == -1)
5112        return NULL;
5113    kind = PyUnicode_KIND(str);
5114    data = PyUnicode_DATA(str);
5115    len = PyUnicode_GET_LENGTH(str);
5116
5117    nsize = len + (byteorder == 0);
5118    if (nsize > PY_SSIZE_T_MAX / 4)
5119        return PyErr_NoMemory();
5120    v = PyBytes_FromStringAndSize(NULL, nsize * 4);
5121    if (v == NULL)
5122        return NULL;
5123
5124    p = (unsigned char *)PyBytes_AS_STRING(v);
5125    if (byteorder == 0)
5126        STORECHAR(0xFEFF);
5127    if (len == 0)
5128        goto done;
5129
5130    if (byteorder == -1) {
5131        /* force LE */
5132        iorder[0] = 0;
5133        iorder[1] = 1;
5134        iorder[2] = 2;
5135        iorder[3] = 3;
5136    }
5137    else if (byteorder == 1) {
5138        /* force BE */
5139        iorder[0] = 3;
5140        iorder[1] = 2;
5141        iorder[2] = 1;
5142        iorder[3] = 0;
5143    }
5144
5145    for (i = 0; i < len; i++)
5146        STORECHAR(PyUnicode_READ(kind, data, i));
5147
5148  done:
5149    return v;
5150#undef STORECHAR
5151}
5152
5153PyObject *
5154PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5155                      Py_ssize_t size,
5156                      const char *errors,
5157                      int byteorder)
5158{
5159    PyObject *result;
5160    PyObject *tmp = PyUnicode_FromUnicode(s, size);
5161    if (tmp == NULL)
5162        return NULL;
5163    result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5164    Py_DECREF(tmp);
5165    return result;
5166}
5167
5168PyObject *
5169PyUnicode_AsUTF32String(PyObject *unicode)
5170{
5171    return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
5172}
5173
5174/* --- UTF-16 Codec ------------------------------------------------------- */
5175
5176PyObject *
5177PyUnicode_DecodeUTF16(const char *s,
5178                      Py_ssize_t size,
5179                      const char *errors,
5180                      int *byteorder)
5181{
5182    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5183}
5184
5185PyObject *
5186PyUnicode_DecodeUTF16Stateful(const char *s,
5187                              Py_ssize_t size,
5188                              const char *errors,
5189                              int *byteorder,
5190                              Py_ssize_t *consumed)
5191{
5192    const char *starts = s;
5193    Py_ssize_t startinpos;
5194    Py_ssize_t endinpos;
5195    Py_ssize_t outpos;
5196    PyObject *unicode;
5197    const unsigned char *q, *e;
5198    int bo = 0;       /* assume native ordering by default */
5199    int native_ordering;
5200    const char *errmsg = "";
5201    PyObject *errorHandler = NULL;
5202    PyObject *exc = NULL;
5203
5204    q = (unsigned char *)s;
5205    e = q + size;
5206
5207    if (byteorder)
5208        bo = *byteorder;
5209
5210    /* Check for BOM marks (U+FEFF) in the input and adjust current
5211       byte order setting accordingly. In native mode, the leading BOM
5212       mark is skipped, in all other modes, it is copied to the output
5213       stream as-is (giving a ZWNBSP character). */
5214    if (bo == 0 && size >= 2) {
5215        const Py_UCS4 bom = (q[1] << 8) | q[0];
5216        if (bom == 0xFEFF) {
5217            q += 2;
5218            bo = -1;
5219        }
5220        else if (bom == 0xFFFE) {
5221            q += 2;
5222            bo = 1;
5223        }
5224        if (byteorder)
5225            *byteorder = bo;
5226    }
5227
5228    if (q == e) {
5229        if (consumed)
5230            *consumed = size;
5231        Py_INCREF(unicode_empty);
5232        return unicode_empty;
5233    }
5234
5235#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5236    native_ordering = bo <= 0;
5237#else
5238    native_ordering = bo >= 0;
5239#endif
5240
5241    /* Note: size will always be longer than the resulting Unicode
5242       character count */
5243    unicode = PyUnicode_New((e - q + 1) / 2, 127);
5244    if (!unicode)
5245        return NULL;
5246
5247    outpos = 0;
5248    while (1) {
5249        Py_UCS4 ch = 0;
5250        if (e - q >= 2) {
5251            int kind = PyUnicode_KIND(unicode);
5252            if (kind == PyUnicode_1BYTE_KIND) {
5253                if (PyUnicode_IS_ASCII(unicode))
5254                    ch = asciilib_utf16_decode(&q, e,
5255                            PyUnicode_1BYTE_DATA(unicode), &outpos,
5256                            native_ordering);
5257                else
5258                    ch = ucs1lib_utf16_decode(&q, e,
5259                            PyUnicode_1BYTE_DATA(unicode), &outpos,
5260                            native_ordering);
5261            } else if (kind == PyUnicode_2BYTE_KIND) {
5262                ch = ucs2lib_utf16_decode(&q, e,
5263                        PyUnicode_2BYTE_DATA(unicode), &outpos,
5264                        native_ordering);
5265            } else {
5266                assert(kind == PyUnicode_4BYTE_KIND);
5267                ch = ucs4lib_utf16_decode(&q, e,
5268                        PyUnicode_4BYTE_DATA(unicode), &outpos,
5269                        native_ordering);
5270            }
5271        }
5272
5273        switch (ch)
5274        {
5275        case 0:
5276            /* remaining byte at the end? (size should be even) */
5277            if (q == e || consumed)
5278                goto End;
5279            errmsg = "truncated data";
5280            startinpos = ((const char *)q) - starts;
5281            endinpos = ((const char *)e) - starts;
5282            break;
5283            /* The remaining input chars are ignored if the callback
5284               chooses to skip the input */
5285        case 1:
5286            errmsg = "unexpected end of data";
5287            startinpos = ((const char *)q) - 2 - starts;
5288            endinpos = ((const char *)e) - starts;
5289            break;
5290        case 2:
5291            errmsg = "illegal encoding";
5292            startinpos = ((const char *)q) - 2 - starts;
5293            endinpos = startinpos + 2;
5294            break;
5295        case 3:
5296            errmsg = "illegal UTF-16 surrogate";
5297            startinpos = ((const char *)q) - 4 - starts;
5298            endinpos = startinpos + 2;
5299            break;
5300        default:
5301            if (unicode_putchar(&unicode, &outpos, ch) < 0)
5302                goto onError;
5303            continue;
5304        }
5305
5306        if (unicode_decode_call_errorhandler(
5307                errors,
5308                &errorHandler,
5309                "utf16", errmsg,
5310                &starts,
5311                (const char **)&e,
5312                &startinpos,
5313                &endinpos,
5314                &exc,
5315                (const char **)&q,
5316                &unicode,
5317                &outpos))
5318            goto onError;
5319    }
5320
5321End:
5322    if (consumed)
5323        *consumed = (const char *)q-starts;
5324
5325    /* Adjust length */
5326    if (unicode_resize(&unicode, outpos) < 0)
5327        goto onError;
5328
5329    Py_XDECREF(errorHandler);
5330    Py_XDECREF(exc);
5331    return unicode_result(unicode);
5332
5333  onError:
5334    Py_DECREF(unicode);
5335    Py_XDECREF(errorHandler);
5336    Py_XDECREF(exc);
5337    return NULL;
5338}
5339
5340PyObject *
5341_PyUnicode_EncodeUTF16(PyObject *str,
5342                       const char *errors,
5343                       int byteorder)
5344{
5345    enum PyUnicode_Kind kind;
5346    const void *data;
5347    Py_ssize_t len;
5348    PyObject *v;
5349    unsigned short *out;
5350    Py_ssize_t bytesize;
5351    Py_ssize_t pairs;
5352#ifdef WORDS_BIGENDIAN
5353    int native_ordering = byteorder >= 0;
5354#else
5355    int native_ordering = byteorder <= 0;
5356#endif
5357
5358    if (!PyUnicode_Check(str)) {
5359        PyErr_BadArgument();
5360        return NULL;
5361    }
5362    if (PyUnicode_READY(str) == -1)
5363        return NULL;
5364    kind = PyUnicode_KIND(str);
5365    data = PyUnicode_DATA(str);
5366    len = PyUnicode_GET_LENGTH(str);
5367
5368    pairs = 0;
5369    if (kind == PyUnicode_4BYTE_KIND) {
5370        const Py_UCS4 *in = (const Py_UCS4 *)data;
5371        const Py_UCS4 *end = in + len;
5372        while (in < end)
5373            if (*in++ >= 0x10000)
5374                pairs++;
5375    }
5376    if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
5377        return PyErr_NoMemory();
5378    bytesize = (len + pairs + (byteorder == 0)) * 2;
5379    v = PyBytes_FromStringAndSize(NULL, bytesize);
5380    if (v == NULL)
5381        return NULL;
5382
5383    /* output buffer is 2-bytes aligned */
5384    assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
5385    out = (unsigned short *)PyBytes_AS_STRING(v);
5386    if (byteorder == 0)
5387        *out++ = 0xFEFF;
5388    if (len == 0)
5389        goto done;
5390
5391    switch (kind) {
5392    case PyUnicode_1BYTE_KIND: {
5393        ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering);
5394        break;
5395    }
5396    case PyUnicode_2BYTE_KIND: {
5397        ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering);
5398        break;
5399    }
5400    case PyUnicode_4BYTE_KIND: {
5401        ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering);
5402        break;
5403    }
5404    default:
5405        assert(0);
5406    }
5407
5408  done:
5409    return v;
5410}
5411
5412PyObject *
5413PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5414                      Py_ssize_t size,
5415                      const char *errors,
5416                      int byteorder)
5417{
5418    PyObject *result;
5419    PyObject *tmp = PyUnicode_FromUnicode(s, size);
5420    if (tmp == NULL)
5421        return NULL;
5422    result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5423    Py_DECREF(tmp);
5424    return result;
5425}
5426
5427PyObject *
5428PyUnicode_AsUTF16String(PyObject *unicode)
5429{
5430    return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
5431}
5432
5433/* --- Unicode Escape Codec ----------------------------------------------- */
5434
/* Helper for PyUnicode_DecodeUnicodeEscape: pre-compute the decoded length
   of an escaped string, provided the result is certain to be pure ASCII.

   Returns the number of characters needed for the decoded string, or -1
   when that cannot be guaranteed: size is negative, the input contains a
   non-ASCII byte, a backslash is the last byte, or an escape is used whose
   value is not necessarily ASCII (octal, \x, \u, \U, \N). */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *cur = (const unsigned char *)s;
    const unsigned char *const stop = cur + size;
    Py_ssize_t count = 0;

    if (size < 0)
        return -1;

    while (cur < stop) {
        unsigned char c = *cur++;

        if (c > 127)
            return -1;          /* non-ASCII byte */
        if (c != '\\') {
            count++;            /* ordinary character */
            continue;
        }

        /* Backslash: the byte after it decides.  The escape must neither
           run off the end of the input nor continue with non-ASCII. */
        if (cur >= stop)
            return -1;
        c = *cur++;
        if (c > 127)
            return -1;

        switch (c) {
        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
        case 'x': case 'u': case 'U': case 'N':
            /* these do not guarantee ASCII characters */
            return -1;
        case '\n':
            /* backslash + newline produce no character at all */
            continue;
        case '\\': case '\'': case '\"':
        case 'a': case 'b': case 'f':
        case 'n': case 'r': case 't': case 'v':
            /* two input bytes collapse into one character */
            count += 1;
            break;
        default:
            /* unknown escape: backslash and character are both kept */
            count += 2;
            break;
        }
    }
    return count;
}
5489
5490static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
5491
5492PyObject *
5493PyUnicode_DecodeUnicodeEscape(const char *s,
5494                              Py_ssize_t size,
5495                              const char *errors)
5496{
5497    const char *starts = s;
5498    Py_ssize_t startinpos;
5499    Py_ssize_t endinpos;
5500    int j;
5501    PyObject *v;
5502    const char *end;
5503    char* message;
5504    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
5505    PyObject *errorHandler = NULL;
5506    PyObject *exc = NULL;
5507    Py_ssize_t len;
5508    Py_ssize_t i;
5509
5510    len = length_of_escaped_ascii_string(s, size);
5511
5512    /* After length_of_escaped_ascii_string() there are two alternatives,
5513       either the string is pure ASCII with named escapes like \n, etc.
5514       and we determined it's exact size (common case)
5515       or it contains \x, \u, ... escape sequences.  then we create a
5516       legacy wchar string and resize it at the end of this function. */
5517    if (len >= 0) {
5518        v = PyUnicode_New(len, 127);
5519        if (!v)
5520            goto onError;
5521        assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5522    }
5523    else {
5524        /* Escaped strings will always be longer than the resulting
5525           Unicode string, so we start with size here and then reduce the
5526           length after conversion to the true value.
5527           (but if the error callback returns a long replacement string
5528           we'll have to allocate more space) */
5529        v = PyUnicode_New(size, 127);
5530        if (!v)
5531            goto onError;
5532        len = size;
5533    }
5534
5535    if (size == 0)
5536        return v;
5537    i = 0;
5538    end = s + size;
5539
5540    while (s < end) {
5541        unsigned char c;
5542        Py_UCS4 x;
5543        int digits;
5544
5545        /* The only case in which i == ascii_length is a backslash
5546           followed by a newline. */
5547        assert(i <= len);
5548
5549        /* Non-escape characters are interpreted as Unicode ordinals */
5550        if (*s != '\\') {
5551            if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5552                goto onError;
5553            continue;
5554        }
5555
5556        startinpos = s-starts;
5557        /* \ - Escapes */
5558        s++;
5559        c = *s++;
5560        if (s > end)
5561            c = '\0'; /* Invalid after \ */
5562
5563        /* The only case in which i == ascii_length is a backslash
5564           followed by a newline. */
5565        assert(i < len || (i == len && c == '\n'));
5566
5567        switch (c) {
5568
5569            /* \x escapes */
5570#define WRITECHAR(ch)                                   \
5571            do {                                        \
5572                if (unicode_putchar(&v, &i, ch) < 0)    \
5573                    goto onError;                       \
5574            }while(0)
5575
5576        case '\n': break;
5577        case '\\': WRITECHAR('\\'); break;
5578        case '\'': WRITECHAR('\''); break;
5579        case '\"': WRITECHAR('\"'); break;
5580        case 'b': WRITECHAR('\b'); break;
5581        /* FF */
5582        case 'f': WRITECHAR('\014'); break;
5583        case 't': WRITECHAR('\t'); break;
5584        case 'n': WRITECHAR('\n'); break;
5585        case 'r': WRITECHAR('\r'); break;
5586        /* VT */
5587        case 'v': WRITECHAR('\013'); break;
5588        /* BEL, not classic C */
5589        case 'a': WRITECHAR('\007'); break;
5590
5591            /* \OOO (octal) escapes */
5592        case '0': case '1': case '2': case '3':
5593        case '4': case '5': case '6': case '7':
5594            x = s[-1] - '0';
5595            if (s < end && '0' <= *s && *s <= '7') {
5596                x = (x<<3) + *s++ - '0';
5597                if (s < end && '0' <= *s && *s <= '7')
5598                    x = (x<<3) + *s++ - '0';
5599            }
5600            WRITECHAR(x);
5601            break;
5602
5603            /* hex escapes */
5604            /* \xXX */
5605        case 'x':
5606            digits = 2;
5607            message = "truncated \\xXX escape";
5608            goto hexescape;
5609
5610            /* \uXXXX */
5611        case 'u':
5612            digits = 4;
5613            message = "truncated \\uXXXX escape";
5614            goto hexescape;
5615
5616            /* \UXXXXXXXX */
5617        case 'U':
5618            digits = 8;
5619            message = "truncated \\UXXXXXXXX escape";
5620        hexescape:
5621            chr = 0;
5622            if (s+digits>end) {
5623                endinpos = size;
5624                if (unicode_decode_call_errorhandler(
5625                        errors, &errorHandler,
5626                        "unicodeescape", "end of string in escape sequence",
5627                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5628                        &v, &i))
5629                    goto onError;
5630                goto nextByte;
5631            }
5632            for (j = 0; j < digits; ++j) {
5633                c = (unsigned char) s[j];
5634                if (!Py_ISXDIGIT(c)) {
5635                    endinpos = (s+j+1)-starts;
5636                    if (unicode_decode_call_errorhandler(
5637                            errors, &errorHandler,
5638                            "unicodeescape", message,
5639                            &starts, &end, &startinpos, &endinpos, &exc, &s,
5640                            &v, &i))
5641                        goto onError;
5642                    len = PyUnicode_GET_LENGTH(v);
5643                    goto nextByte;
5644                }
5645                chr = (chr<<4) & ~0xF;
5646                if (c >= '0' && c <= '9')
5647                    chr += c - '0';
5648                else if (c >= 'a' && c <= 'f')
5649                    chr += 10 + c - 'a';
5650                else
5651                    chr += 10 + c - 'A';
5652            }
5653            s += j;
5654            if (chr == 0xffffffff && PyErr_Occurred())
5655                /* _decoding_error will have already written into the
5656                   target buffer. */
5657                break;
5658        store:
5659            /* when we get here, chr is a 32-bit unicode character */
5660            if (chr <= MAX_UNICODE) {
5661                WRITECHAR(chr);
5662            } else {
5663                endinpos = s-starts;
5664                if (unicode_decode_call_errorhandler(
5665                        errors, &errorHandler,
5666                        "unicodeescape", "illegal Unicode character",
5667                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5668                        &v, &i))
5669                    goto onError;
5670            }
5671            break;
5672
5673            /* \N{name} */
5674        case 'N':
5675            message = "malformed \\N character escape";
5676            if (ucnhash_CAPI == NULL) {
5677                /* load the unicode data module */
5678                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5679                                                PyUnicodeData_CAPSULE_NAME, 1);
5680                if (ucnhash_CAPI == NULL)
5681                    goto ucnhashError;
5682            }
5683            if (*s == '{') {
5684                const char *start = s+1;
5685                /* look for the closing brace */
5686                while (*s != '}' && s < end)
5687                    s++;
5688                if (s > start && s < end && *s == '}') {
5689                    /* found a name.  look it up in the unicode database */
5690                    message = "unknown Unicode character name";
5691                    s++;
5692                    if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5693                                              &chr, 0))
5694                        goto store;
5695                }
5696            }
5697            endinpos = s-starts;
5698            if (unicode_decode_call_errorhandler(
5699                    errors, &errorHandler,
5700                    "unicodeescape", message,
5701                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5702                    &v, &i))
5703                goto onError;
5704            break;
5705
5706        default:
5707            if (s > end) {
5708                message = "\\ at end of string";
5709                s--;
5710                endinpos = s-starts;
5711                if (unicode_decode_call_errorhandler(
5712                        errors, &errorHandler,
5713                        "unicodeescape", message,
5714                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5715                        &v, &i))
5716                    goto onError;
5717            }
5718            else {
5719                WRITECHAR('\\');
5720                WRITECHAR(s[-1]);
5721            }
5722            break;
5723        }
5724      nextByte:
5725        ;
5726    }
5727#undef WRITECHAR
5728
5729    if (unicode_resize(&v, i) < 0)
5730        goto onError;
5731    Py_XDECREF(errorHandler);
5732    Py_XDECREF(exc);
5733    return unicode_result(v);
5734
5735  ucnhashError:
5736    PyErr_SetString(
5737        PyExc_UnicodeError,
5738        "\\N escapes not supported (can't load unicodedata module)"
5739        );
5740    Py_XDECREF(v);
5741    Py_XDECREF(errorHandler);
5742    Py_XDECREF(exc);
5743    return NULL;
5744
5745  onError:
5746    Py_XDECREF(v);
5747    Py_XDECREF(errorHandler);
5748    Py_XDECREF(exc);
5749    return NULL;
5750}
5751
5752/* Return a Unicode-Escape string version of the Unicode object.
5753
5754   If quotes is true, the string is enclosed in u"" or u'' quotes as
5755   appropriate.
5756
5757*/
5758
5759PyObject *
5760PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
5761{
5762    Py_ssize_t i, len;
5763    PyObject *repr;
5764    char *p;
5765    int kind;
5766    void *data;
5767    Py_ssize_t expandsize = 0;
5768
5769    /* Initial allocation is based on the longest-possible character
5770       escape.
5771
5772       For UCS1 strings it's '\xxx', 4 bytes per source character.
5773       For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5774       For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
5775    */
5776
5777    if (!PyUnicode_Check(unicode)) {
5778        PyErr_BadArgument();
5779        return NULL;
5780    }
5781    if (PyUnicode_READY(unicode) == -1)
5782        return NULL;
5783    len = PyUnicode_GET_LENGTH(unicode);
5784    kind = PyUnicode_KIND(unicode);
5785    data = PyUnicode_DATA(unicode);
5786    switch (kind) {
5787    case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5788    case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5789    case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5790    }
5791
5792    if (len == 0)
5793        return PyBytes_FromStringAndSize(NULL, 0);
5794
5795    if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
5796        return PyErr_NoMemory();
5797
5798    repr = PyBytes_FromStringAndSize(NULL,
5799                                     2
5800                                     + expandsize*len
5801                                     + 1);
5802    if (repr == NULL)
5803        return NULL;
5804
5805    p = PyBytes_AS_STRING(repr);
5806
5807    for (i = 0; i < len; i++) {
5808        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5809
5810        /* Escape backslashes */
5811        if (ch == '\\') {
5812            *p++ = '\\';
5813            *p++ = (char) ch;
5814            continue;
5815        }
5816
5817        /* Map 21-bit characters to '\U00xxxxxx' */
5818        else if (ch >= 0x10000) {
5819            assert(ch <= MAX_UNICODE);
5820            *p++ = '\\';
5821            *p++ = 'U';
5822            *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5823            *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5824            *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5825            *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5826            *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5827            *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5828            *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5829            *p++ = Py_hexdigits[ch & 0x0000000F];
5830            continue;
5831        }
5832
5833        /* Map 16-bit characters to '\uxxxx' */
5834        if (ch >= 256) {
5835            *p++ = '\\';
5836            *p++ = 'u';
5837            *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5838            *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5839            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5840            *p++ = Py_hexdigits[ch & 0x000F];
5841        }
5842
5843        /* Map special whitespace to '\t', \n', '\r' */
5844        else if (ch == '\t') {
5845            *p++ = '\\';
5846            *p++ = 't';
5847        }
5848        else if (ch == '\n') {
5849            *p++ = '\\';
5850            *p++ = 'n';
5851        }
5852        else if (ch == '\r') {
5853            *p++ = '\\';
5854            *p++ = 'r';
5855        }
5856
5857        /* Map non-printable US ASCII to '\xhh' */
5858        else if (ch < ' ' || ch >= 0x7F) {
5859            *p++ = '\\';
5860            *p++ = 'x';
5861            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5862            *p++ = Py_hexdigits[ch & 0x000F];
5863        }
5864
5865        /* Copy everything else as-is */
5866        else
5867            *p++ = (char) ch;
5868    }
5869
5870    assert(p - PyBytes_AS_STRING(repr) > 0);
5871    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5872        return NULL;
5873    return repr;
5874}
5875
5876PyObject *
5877PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5878                              Py_ssize_t size)
5879{
5880    PyObject *result;
5881    PyObject *tmp = PyUnicode_FromUnicode(s, size);
5882    if (tmp == NULL)
5883        return NULL;
5884    result = PyUnicode_AsUnicodeEscapeString(tmp);
5885    Py_DECREF(tmp);
5886    return result;
5887}
5888
5889/* --- Raw Unicode Escape Codec ------------------------------------------- */
5890
5891PyObject *
5892PyUnicode_DecodeRawUnicodeEscape(const char *s,
5893                                 Py_ssize_t size,
5894                                 const char *errors)
5895{
5896    const char *starts = s;
5897    Py_ssize_t startinpos;
5898    Py_ssize_t endinpos;
5899    Py_ssize_t outpos;
5900    PyObject *v;
5901    const char *end;
5902    const char *bs;
5903    PyObject *errorHandler = NULL;
5904    PyObject *exc = NULL;
5905
5906    /* Escaped strings will always be longer than the resulting
5907       Unicode string, so we start with size here and then reduce the
5908       length after conversion to the true value. (But decoding error
5909       handler might have to resize the string) */
5910    v = PyUnicode_New(size, 127);
5911    if (v == NULL)
5912        goto onError;
5913    if (size == 0)
5914        return v;
5915    outpos = 0;
5916    end = s + size;
5917    while (s < end) {
5918        unsigned char c;
5919        Py_UCS4 x;
5920        int i;
5921        int count;
5922
5923        /* Non-escape characters are interpreted as Unicode ordinals */
5924        if (*s != '\\') {
5925            if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5926                goto onError;
5927            continue;
5928        }
5929        startinpos = s-starts;
5930
5931        /* \u-escapes are only interpreted iff the number of leading
5932           backslashes if odd */
5933        bs = s;
5934        for (;s < end;) {
5935            if (*s != '\\')
5936                break;
5937            if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5938                goto onError;
5939        }
5940        if (((s - bs) & 1) == 0 ||
5941            s >= end ||
5942            (*s != 'u' && *s != 'U')) {
5943            continue;
5944        }
5945        outpos--;
5946        count = *s=='u' ? 4 : 8;
5947        s++;
5948
5949        /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5950        for (x = 0, i = 0; i < count; ++i, ++s) {
5951            c = (unsigned char)*s;
5952            if (!Py_ISXDIGIT(c)) {
5953                endinpos = s-starts;
5954                if (unicode_decode_call_errorhandler(
5955                        errors, &errorHandler,
5956                        "rawunicodeescape", "truncated \\uXXXX",
5957                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5958                        &v, &outpos))
5959                    goto onError;
5960                goto nextByte;
5961            }
5962            x = (x<<4) & ~0xF;
5963            if (c >= '0' && c <= '9')
5964                x += c - '0';
5965            else if (c >= 'a' && c <= 'f')
5966                x += 10 + c - 'a';
5967            else
5968                x += 10 + c - 'A';
5969        }
5970        if (x <= MAX_UNICODE) {
5971            if (unicode_putchar(&v, &outpos, x) < 0)
5972                goto onError;
5973        } else {
5974            endinpos = s-starts;
5975            if (unicode_decode_call_errorhandler(
5976                    errors, &errorHandler,
5977                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
5978                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5979                    &v, &outpos))
5980                goto onError;
5981        }
5982      nextByte:
5983        ;
5984    }
5985    if (unicode_resize(&v, outpos) < 0)
5986        goto onError;
5987    Py_XDECREF(errorHandler);
5988    Py_XDECREF(exc);
5989    return unicode_result(v);
5990
5991  onError:
5992    Py_XDECREF(v);
5993    Py_XDECREF(errorHandler);
5994    Py_XDECREF(exc);
5995    return NULL;
5996}
5997
5998
5999PyObject *
6000PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6001{
6002    PyObject *repr;
6003    char *p;
6004    char *q;
6005    Py_ssize_t expandsize, pos;
6006    int kind;
6007    void *data;
6008    Py_ssize_t len;
6009
6010    if (!PyUnicode_Check(unicode)) {
6011        PyErr_BadArgument();
6012        return NULL;
6013    }
6014    if (PyUnicode_READY(unicode) == -1)
6015        return NULL;
6016    kind = PyUnicode_KIND(unicode);
6017    data = PyUnicode_DATA(unicode);
6018    len = PyUnicode_GET_LENGTH(unicode);
6019    /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6020       bytes, and 1 byte characters 4. */
6021    expandsize = kind * 2 + 2;
6022
6023    if (len > PY_SSIZE_T_MAX / expandsize)
6024        return PyErr_NoMemory();
6025
6026    repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6027    if (repr == NULL)
6028        return NULL;
6029    if (len == 0)
6030        return repr;
6031
6032    p = q = PyBytes_AS_STRING(repr);
6033    for (pos = 0; pos < len; pos++) {
6034        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6035        /* Map 32-bit characters to '\Uxxxxxxxx' */
6036        if (ch >= 0x10000) {
6037            assert(ch <= MAX_UNICODE);
6038            *p++ = '\\';
6039            *p++ = 'U';
6040            *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6041            *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6042            *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6043            *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6044            *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6045            *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6046            *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6047            *p++ = Py_hexdigits[ch & 15];
6048        }
6049        /* Map 16-bit characters to '\uxxxx' */
6050        else if (ch >= 256) {
6051            *p++ = '\\';
6052            *p++ = 'u';
6053            *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6054            *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6055            *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6056            *p++ = Py_hexdigits[ch & 15];
6057        }
6058        /* Copy everything else as-is */
6059        else
6060            *p++ = (char) ch;
6061    }
6062
6063    assert(p > q);
6064    if (_PyBytes_Resize(&repr, p - q) < 0)
6065        return NULL;
6066    return repr;
6067}
6068
6069PyObject *
6070PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6071                                 Py_ssize_t size)
6072{
6073    PyObject *result;
6074    PyObject *tmp = PyUnicode_FromUnicode(s, size);
6075    if (tmp == NULL)
6076        return NULL;
6077    result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6078    Py_DECREF(tmp);
6079    return result;
6080}
6081
6082/* --- Unicode Internal Codec ------------------------------------------- */
6083
6084PyObject *
6085_PyUnicode_DecodeUnicodeInternal(const char *s,
6086                                 Py_ssize_t size,
6087                                 const char *errors)
6088{
6089    const char *starts = s;
6090    Py_ssize_t startinpos;
6091    Py_ssize_t endinpos;
6092    Py_ssize_t outpos;
6093    PyObject *v;
6094    const char *end;
6095    const char *reason;
6096    PyObject *errorHandler = NULL;
6097    PyObject *exc = NULL;
6098
6099    if (PyErr_WarnEx(PyExc_DeprecationWarning,
6100                     "unicode_internal codec has been deprecated",
6101                     1))
6102        return NULL;
6103
6104    /* XXX overflow detection missing */
6105    v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
6106    if (v == NULL)
6107        goto onError;
6108    if (PyUnicode_GET_LENGTH(v) == 0)
6109        return v;
6110    outpos = 0;
6111    end = s + size;
6112
6113    while (s < end) {
6114        Py_UNICODE uch;
6115        Py_UCS4 ch;
6116        /* We copy the raw representation one byte at a time because the
6117           pointer may be unaligned (see test_codeccallbacks). */
6118        ((char *) &uch)[0] = s[0];
6119        ((char *) &uch)[1] = s[1];
6120#ifdef Py_UNICODE_WIDE
6121        ((char *) &uch)[2] = s[2];
6122        ((char *) &uch)[3] = s[3];
6123#endif
6124        ch = uch;
6125
6126        /* We have to sanity check the raw data, otherwise doom looms for
6127           some malformed UCS-4 data. */
6128        if (
6129#ifdef Py_UNICODE_WIDE
6130            ch > 0x10ffff ||
6131#endif
6132            end-s < Py_UNICODE_SIZE
6133            )
6134        {
6135            startinpos = s - starts;
6136            if (end-s < Py_UNICODE_SIZE) {
6137                endinpos = end-starts;
6138                reason = "truncated input";
6139            }
6140            else {
6141                endinpos = s - starts + Py_UNICODE_SIZE;
6142                reason = "illegal code point (> 0x10FFFF)";
6143            }
6144            if (unicode_decode_call_errorhandler(
6145                    errors, &errorHandler,
6146                    "unicode_internal", reason,
6147                    &starts, &end, &startinpos, &endinpos, &exc, &s,
6148                    &v, &outpos))
6149                goto onError;
6150            continue;
6151        }
6152
6153        s += Py_UNICODE_SIZE;
6154#ifndef Py_UNICODE_WIDE
6155        if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
6156        {
6157            Py_UNICODE uch2;
6158            ((char *) &uch2)[0] = s[0];
6159            ((char *) &uch2)[1] = s[1];
6160            if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
6161            {
6162                ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
6163                s += Py_UNICODE_SIZE;
6164            }
6165        }
6166#endif
6167
6168        if (unicode_putchar(&v, &outpos, ch) < 0)
6169            goto onError;
6170    }
6171
6172    if (unicode_resize(&v, outpos) < 0)
6173        goto onError;
6174    Py_XDECREF(errorHandler);
6175    Py_XDECREF(exc);
6176    return unicode_result(v);
6177
6178  onError:
6179    Py_XDECREF(v);
6180    Py_XDECREF(errorHandler);
6181    Py_XDECREF(exc);
6182    return NULL;
6183}
6184
6185/* --- Latin-1 Codec ------------------------------------------------------ */
6186
6187PyObject *
6188PyUnicode_DecodeLatin1(const char *s,
6189                       Py_ssize_t size,
6190                       const char *errors)
6191{
6192    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6193    return _PyUnicode_FromUCS1((unsigned char*)s, size);
6194}
6195
6196/* create or adjust a UnicodeEncodeError */
6197static void
6198make_encode_exception(PyObject **exceptionObject,
6199                      const char *encoding,
6200                      PyObject *unicode,
6201                      Py_ssize_t startpos, Py_ssize_t endpos,
6202                      const char *reason)
6203{
6204    if (*exceptionObject == NULL) {
6205        *exceptionObject = PyObject_CallFunction(
6206            PyExc_UnicodeEncodeError, "sOnns",
6207            encoding, unicode, startpos, endpos, reason);
6208    }
6209    else {
6210        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6211            goto onError;
6212        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6213            goto onError;
6214        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6215            goto onError;
6216        return;
6217      onError:
6218        Py_DECREF(*exceptionObject);
6219        *exceptionObject = NULL;
6220    }
6221}
6222
6223/* raises a UnicodeEncodeError */
6224static void
6225raise_encode_exception(PyObject **exceptionObject,
6226                       const char *encoding,
6227                       PyObject *unicode,
6228                       Py_ssize_t startpos, Py_ssize_t endpos,
6229                       const char *reason)
6230{
6231    make_encode_exception(exceptionObject,
6232                          encoding, unicode, startpos, endpos, reason);
6233    if (*exceptionObject != NULL)
6234        PyCodec_StrictErrors(*exceptionObject);
6235}
6236
6237/* error handling callback helper:
6238   build arguments, call the callback and check the arguments,
6239   put the result into newpos and return the replacement string, which
6240   has to be freed by the caller */
6241static PyObject *
6242unicode_encode_call_errorhandler(const char *errors,
6243                                 PyObject **errorHandler,
6244                                 const char *encoding, const char *reason,
6245                                 PyObject *unicode, PyObject **exceptionObject,
6246                                 Py_ssize_t startpos, Py_ssize_t endpos,
6247                                 Py_ssize_t *newpos)
6248{
6249    static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6250    Py_ssize_t len;
6251    PyObject *restuple;
6252    PyObject *resunicode;
6253
6254    if (*errorHandler == NULL) {
6255        *errorHandler = PyCodec_LookupError(errors);
6256        if (*errorHandler == NULL)
6257            return NULL;
6258    }
6259
6260    if (PyUnicode_READY(unicode) == -1)
6261        return NULL;
6262    len = PyUnicode_GET_LENGTH(unicode);
6263
6264    make_encode_exception(exceptionObject,
6265                          encoding, unicode, startpos, endpos, reason);
6266    if (*exceptionObject == NULL)
6267        return NULL;
6268
6269    restuple = PyObject_CallFunctionObjArgs(
6270        *errorHandler, *exceptionObject, NULL);
6271    if (restuple == NULL)
6272        return NULL;
6273    if (!PyTuple_Check(restuple)) {
6274        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6275        Py_DECREF(restuple);
6276        return NULL;
6277    }
6278    if (!PyArg_ParseTuple(restuple, argparse,
6279                          &resunicode, newpos)) {
6280        Py_DECREF(restuple);
6281        return NULL;
6282    }
6283    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6284        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6285        Py_DECREF(restuple);
6286        return NULL;
6287    }
6288    if (*newpos<0)
6289        *newpos = len + *newpos;
6290    if (*newpos<0 || *newpos>len) {
6291        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6292        Py_DECREF(restuple);
6293        return NULL;
6294    }
6295    Py_INCREF(resunicode);
6296    Py_DECREF(restuple);
6297    return resunicode;
6298}
6299
6300static PyObject *
6301unicode_encode_ucs1(PyObject *unicode,
6302                    const char *errors,
6303                    unsigned int limit)
6304{
6305    /* input state */
6306    Py_ssize_t pos=0, size;
6307    int kind;
6308    void *data;
6309    /* output object */
6310    PyObject *res;
6311    /* pointer into the output */
6312    char *str;
6313    /* current output position */
6314    Py_ssize_t ressize;
6315    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6316    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
6317    PyObject *errorHandler = NULL;
6318    PyObject *exc = NULL;
6319    /* the following variable is used for caching string comparisons
6320     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6321    int known_errorHandler = -1;
6322
6323    if (PyUnicode_READY(unicode) == -1)
6324        return NULL;
6325    size = PyUnicode_GET_LENGTH(unicode);
6326    kind = PyUnicode_KIND(unicode);
6327    data = PyUnicode_DATA(unicode);
6328    /* allocate enough for a simple encoding without
6329       replacements, if we need more, we'll resize */
6330    if (size == 0)
6331        return PyBytes_FromStringAndSize(NULL, 0);
6332    res = PyBytes_FromStringAndSize(NULL, size);
6333    if (res == NULL)
6334        return NULL;
6335    str = PyBytes_AS_STRING(res);
6336    ressize = size;
6337
6338    while (pos < size) {
6339        Py_UCS4 c = PyUnicode_READ(kind, data, pos);
6340
6341        /* can we encode this? */
6342        if (c<limit) {
6343            /* no overflow check, because we know that the space is enough */
6344            *str++ = (char)c;
6345            ++pos;
6346        }
6347        else {
6348            Py_ssize_t requiredsize;
6349            PyObject *repunicode;
6350            Py_ssize_t repsize, newpos, respos, i;
6351            /* startpos for collecting unencodable chars */
6352            Py_ssize_t collstart = pos;
6353            Py_ssize_t collend = pos;
6354            /* find all unecodable characters */
6355            while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
6356                ++collend;
6357            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6358            if (known_errorHandler==-1) {
6359                if ((errors==NULL) || (!strcmp(errors, "strict")))
6360                    known_errorHandler = 1;
6361                else if (!strcmp(errors, "replace"))
6362                    known_errorHandler = 2;
6363                else if (!strcmp(errors, "ignore"))
6364                    known_errorHandler = 3;
6365                else if (!strcmp(errors, "xmlcharrefreplace"))
6366                    known_errorHandler = 4;
6367                else
6368                    known_errorHandler = 0;
6369            }
6370            switch (known_errorHandler) {
6371            case 1: /* strict */
6372                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
6373                goto onError;
6374            case 2: /* replace */
6375                while (collstart++<collend)
6376                    *str++ = '?'; /* fall through */
6377            case 3: /* ignore */
6378                pos = collend;
6379                break;
6380            case 4: /* xmlcharrefreplace */
6381                respos = str - PyBytes_AS_STRING(res);
6382                /* determine replacement size */
6383                for (i = collstart, repsize = 0; i < collend; ++i) {
6384                    Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6385                    if (ch < 10)
6386                        repsize += 2+1+1;
6387                    else if (ch < 100)
6388                        repsize += 2+2+1;
6389                    else if (ch < 1000)
6390                        repsize += 2+3+1;
6391                    else if (ch < 10000)
6392                        repsize += 2+4+1;
6393                    else if (ch < 100000)
6394                        repsize += 2+5+1;
6395                    else if (ch < 1000000)
6396                        repsize += 2+6+1;
6397                    else {
6398                        assert(ch <= MAX_UNICODE);
6399                        repsize += 2+7+1;
6400                    }
6401                }
6402                requiredsize = respos+repsize+(size-collend);
6403                if (requiredsize > ressize) {
6404                    if (requiredsize<2*ressize)
6405                        requiredsize = 2*ressize;
6406                    if (_PyBytes_Resize(&res, requiredsize))
6407                        goto onError;
6408                    str = PyBytes_AS_STRING(res) + respos;
6409                    ressize = requiredsize;
6410                }
6411                /* generate replacement */
6412                for (i = collstart; i < collend; ++i) {
6413                    str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
6414                }
6415                pos = collend;
6416                break;
6417            default:
6418                repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6419                                                              encoding, reason, unicode, &exc,
6420                                                              collstart, collend, &newpos);
6421                if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
6422                                           PyUnicode_READY(repunicode) == -1))
6423                    goto onError;
6424                if (PyBytes_Check(repunicode)) {
6425                    /* Directly copy bytes result to output. */
6426                    repsize = PyBytes_Size(repunicode);
6427                    if (repsize > 1) {
6428                        /* Make room for all additional bytes. */
6429                        respos = str - PyBytes_AS_STRING(res);
6430                        if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6431                            Py_DECREF(repunicode);
6432                            goto onError;
6433                        }
6434                        str = PyBytes_AS_STRING(res) + respos;
6435                        ressize += repsize-1;
6436                    }
6437                    memcpy(str, PyBytes_AsString(repunicode), repsize);
6438                    str += repsize;
6439                    pos = newpos;
6440                    Py_DECREF(repunicode);
6441                    break;
6442                }
6443                /* need more space? (at least enough for what we
6444                   have+the replacement+the rest of the string, so
6445                   we won't have to check space for encodable characters) */
6446                respos = str - PyBytes_AS_STRING(res);
6447                repsize = PyUnicode_GET_LENGTH(repunicode);
6448                requiredsize = respos+repsize+(size-collend);
6449                if (requiredsize > ressize) {
6450                    if (requiredsize<2*ressize)
6451                        requiredsize = 2*ressize;
6452                    if (_PyBytes_Resize(&res, requiredsize)) {
6453                        Py_DECREF(repunicode);
6454                        goto onError;
6455                    }
6456                    str = PyBytes_AS_STRING(res) + respos;
6457                    ressize = requiredsize;
6458                }
6459                /* check if there is anything unencodable in the replacement
6460                   and copy it to the output */
6461                for (i = 0; repsize-->0; ++i, ++str) {
6462                    c = PyUnicode_READ_CHAR(repunicode, i);
6463                    if (c >= limit) {
6464                        raise_encode_exception(&exc, encoding, unicode,
6465                                               pos, pos+1, reason);
6466                        Py_DECREF(repunicode);
6467                        goto onError;
6468                    }
6469                    *str = (char)c;
6470                }
6471                pos = newpos;
6472                Py_DECREF(repunicode);
6473            }
6474        }
6475    }
    /* Resize if we allocated too much */
6477    size = str - PyBytes_AS_STRING(res);
6478    if (size < ressize) { /* If this falls res will be NULL */
6479        assert(size >= 0);
6480        if (_PyBytes_Resize(&res, size) < 0)
6481            goto onError;
6482    }
6483
6484    Py_XDECREF(errorHandler);
6485    Py_XDECREF(exc);
6486    return res;
6487
6488  onError:
6489    Py_XDECREF(res);
6490    Py_XDECREF(errorHandler);
6491    Py_XDECREF(exc);
6492    return NULL;
6493}
6494
6495/* Deprecated */
6496PyObject *
6497PyUnicode_EncodeLatin1(const Py_UNICODE *p,
6498                       Py_ssize_t size,
6499                       const char *errors)
6500{
6501    PyObject *result;
6502    PyObject *unicode = PyUnicode_FromUnicode(p, size);
6503    if (unicode == NULL)
6504        return NULL;
6505    result = unicode_encode_ucs1(unicode, errors, 256);
6506    Py_DECREF(unicode);
6507    return result;
6508}
6509
6510PyObject *
6511_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
6512{
6513    if (!PyUnicode_Check(unicode)) {
6514        PyErr_BadArgument();
6515        return NULL;
6516    }
6517    if (PyUnicode_READY(unicode) == -1)
6518        return NULL;
6519    /* Fast path: if it is a one-byte string, construct
6520       bytes object directly. */
6521    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6522        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6523                                         PyUnicode_GET_LENGTH(unicode));
6524    /* Non-Latin-1 characters present. Defer to above function to
6525       raise the exception. */
6526    return unicode_encode_ucs1(unicode, errors, 256);
6527}
6528
6529PyObject*
6530PyUnicode_AsLatin1String(PyObject *unicode)
6531{
6532    return _PyUnicode_AsLatin1String(unicode, NULL);
6533}
6534
6535/* --- 7-bit ASCII Codec -------------------------------------------------- */
6536
6537PyObject *
6538PyUnicode_DecodeASCII(const char *s,
6539                      Py_ssize_t size,
6540                      const char *errors)
6541{
6542    const char *starts = s;
6543    PyObject *unicode;
6544    int kind;
6545    void *data;
6546    Py_ssize_t startinpos;
6547    Py_ssize_t endinpos;
6548    Py_ssize_t outpos;
6549    const char *e;
6550    PyObject *errorHandler = NULL;
6551    PyObject *exc = NULL;
6552
6553    if (size == 0) {
6554        Py_INCREF(unicode_empty);
6555        return unicode_empty;
6556    }
6557
6558    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
6559    if (size == 1 && (unsigned char)s[0] < 128)
6560        return get_latin1_char((unsigned char)s[0]);
6561
6562    unicode = PyUnicode_New(size, 127);
6563    if (unicode == NULL)
6564        goto onError;
6565
6566    e = s + size;
6567    data = PyUnicode_1BYTE_DATA(unicode);
6568    outpos = ascii_decode(s, e, (Py_UCS1 *)data);
6569    if (outpos == size)
6570        return unicode;
6571
6572    s += outpos;
6573    kind = PyUnicode_1BYTE_KIND;
6574    while (s < e) {
6575        register unsigned char c = (unsigned char)*s;
6576        if (c < 128) {
6577            PyUnicode_WRITE(kind, data, outpos++, c);
6578            ++s;
6579        }
6580        else {
6581            startinpos = s-starts;
6582            endinpos = startinpos + 1;
6583            if (unicode_decode_call_errorhandler(
6584                    errors, &errorHandler,
6585                    "ascii", "ordinal not in range(128)",
6586                    &starts, &e, &startinpos, &endinpos, &exc, &s,
6587                    &unicode, &outpos))
6588                goto onError;
6589            kind = PyUnicode_KIND(unicode);
6590            data = PyUnicode_DATA(unicode);
6591        }
6592    }
6593    if (unicode_resize(&unicode, outpos) < 0)
6594        goto onError;
6595    Py_XDECREF(errorHandler);
6596    Py_XDECREF(exc);
6597    assert(_PyUnicode_CheckConsistency(unicode, 1));
6598    return unicode;
6599
6600  onError:
6601    Py_XDECREF(unicode);
6602    Py_XDECREF(errorHandler);
6603    Py_XDECREF(exc);
6604    return NULL;
6605}
6606
6607/* Deprecated */
6608PyObject *
6609PyUnicode_EncodeASCII(const Py_UNICODE *p,
6610                      Py_ssize_t size,
6611                      const char *errors)
6612{
6613    PyObject *result;
6614    PyObject *unicode = PyUnicode_FromUnicode(p, size);
6615    if (unicode == NULL)
6616        return NULL;
6617    result = unicode_encode_ucs1(unicode, errors, 128);
6618    Py_DECREF(unicode);
6619    return result;
6620}
6621
6622PyObject *
6623_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
6624{
6625    if (!PyUnicode_Check(unicode)) {
6626        PyErr_BadArgument();
6627        return NULL;
6628    }
6629    if (PyUnicode_READY(unicode) == -1)
6630        return NULL;
6631    /* Fast path: if it is an ASCII-only string, construct bytes object
6632       directly. Else defer to above function to raise the exception. */
6633    if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6634        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6635                                         PyUnicode_GET_LENGTH(unicode));
6636    return unicode_encode_ucs1(unicode, errors, 128);
6637}
6638
6639PyObject *
6640PyUnicode_AsASCIIString(PyObject *unicode)
6641{
6642    return _PyUnicode_AsASCIIString(unicode, NULL);
6643}
6644
6645#ifdef HAVE_MBCS
6646
6647/* --- MBCS codecs for Windows -------------------------------------------- */
6648
6649#if SIZEOF_INT < SIZEOF_SIZE_T
6650#define NEED_RETRY
6651#endif
6652
6653#ifndef WC_ERR_INVALID_CHARS
6654#  define WC_ERR_INVALID_CHARS 0x0080
6655#endif
6656
6657static char*
6658code_page_name(UINT code_page, PyObject **obj)
6659{
6660    *obj = NULL;
6661    if (code_page == CP_ACP)
6662        return "mbcs";
6663    if (code_page == CP_UTF7)
6664        return "CP_UTF7";
6665    if (code_page == CP_UTF8)
6666        return "CP_UTF8";
6667
6668    *obj = PyBytes_FromFormat("cp%u", code_page);
6669    if (*obj == NULL)
6670        return NULL;
6671    return PyBytes_AS_STRING(*obj);
6672}
6673
6674static int
6675is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
6676{
6677    const char *curr = s + offset;
6678    const char *prev;
6679
6680    if (!IsDBCSLeadByteEx(code_page, *curr))
6681        return 0;
6682
6683    prev = CharPrevExA(code_page, s, curr, 0);
6684    if (prev == curr)
6685        return 1;
6686    /* FIXME: This code is limited to "true" double-byte encodings,
6687       as it assumes an incomplete character consists of a single
6688       byte. */
6689    if (curr - prev == 2)
6690        return 1;
6691    if (!IsDBCSLeadByteEx(code_page, *prev))
6692        return 1;
6693    return 0;
6694}
6695
6696static DWORD
6697decode_code_page_flags(UINT code_page)
6698{
6699    if (code_page == CP_UTF7) {
6700        /* The CP_UTF7 decoder only supports flags=0 */
6701        return 0;
6702    }
6703    else
6704        return MB_ERR_INVALID_CHARS;
6705}
6706
6707/*
6708 * Decode a byte string from a Windows code page into unicode object in strict
6709 * mode.
6710 *
6711 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6712 * WindowsError and returns -1 on other error.
6713 */
6714static int
6715decode_code_page_strict(UINT code_page,
6716                        PyObject **v,
6717                        const char *in,
6718                        int insize)
6719{
6720    const DWORD flags = decode_code_page_flags(code_page);
6721    wchar_t *out;
6722    DWORD outsize;
6723
6724    /* First get the size of the result */
6725    assert(insize > 0);
6726    outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6727    if (outsize <= 0)
6728        goto error;
6729
6730    if (*v == NULL) {
6731        /* Create unicode object */
6732        /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
6733        *v = (PyObject*)_PyUnicode_New(outsize);
6734        if (*v == NULL)
6735            return -1;
6736        out = PyUnicode_AS_UNICODE(*v);
6737    }
6738    else {
6739        /* Extend unicode object */
6740        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6741        if (unicode_resize(v, n + outsize) < 0)
6742            return -1;
6743        out = PyUnicode_AS_UNICODE(*v) + n;
6744    }
6745
6746    /* Do the conversion */
6747    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6748    if (outsize <= 0)
6749        goto error;
6750    return insize;
6751
6752error:
6753    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6754        return -2;
6755    PyErr_SetFromWindowsErr(0);
6756    return -1;
6757}
6758
6759/*
6760 * Decode a byte string from a code page into unicode object with an error
6761 * handler.
6762 *
6763 * Returns consumed size if succeed, or raise a WindowsError or
6764 * UnicodeDecodeError exception and returns -1 on error.
6765 */
6766static int
6767decode_code_page_errors(UINT code_page,
6768                        PyObject **v,
6769                        const char *in, const int size,
6770                        const char *errors)
6771{
6772    const char *startin = in;
6773    const char *endin = in + size;
6774    const DWORD flags = decode_code_page_flags(code_page);
6775    /* Ideally, we should get reason from FormatMessage. This is the Windows
6776       2000 English version of the message. */
6777    const char *reason = "No mapping for the Unicode character exists "
6778                         "in the target code page.";
6779    /* each step cannot decode more than 1 character, but a character can be
6780       represented as a surrogate pair */
6781    wchar_t buffer[2], *startout, *out;
6782    int insize, outsize;
6783    PyObject *errorHandler = NULL;
6784    PyObject *exc = NULL;
6785    PyObject *encoding_obj = NULL;
6786    char *encoding;
6787    DWORD err;
6788    int ret = -1;
6789
6790    assert(size > 0);
6791
6792    encoding = code_page_name(code_page, &encoding_obj);
6793    if (encoding == NULL)
6794        return -1;
6795
6796    if (errors == NULL || strcmp(errors, "strict") == 0) {
6797        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6798           UnicodeDecodeError. */
6799        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6800        if (exc != NULL) {
6801            PyCodec_StrictErrors(exc);
6802            Py_CLEAR(exc);
6803        }
6804        goto error;
6805    }
6806
6807    if (*v == NULL) {
6808        /* Create unicode object */
6809        if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6810            PyErr_NoMemory();
6811            goto error;
6812        }
6813        /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
6814        *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
6815        if (*v == NULL)
6816            goto error;
6817        startout = PyUnicode_AS_UNICODE(*v);
6818    }
6819    else {
6820        /* Extend unicode object */
6821        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6822        if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6823            PyErr_NoMemory();
6824            goto error;
6825        }
6826        if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
6827            goto error;
6828        startout = PyUnicode_AS_UNICODE(*v) + n;
6829    }
6830
6831    /* Decode the byte string character per character */
6832    out = startout;
6833    while (in < endin)
6834    {
6835        /* Decode a character */
6836        insize = 1;
6837        do
6838        {
6839            outsize = MultiByteToWideChar(code_page, flags,
6840                                          in, insize,
6841                                          buffer, Py_ARRAY_LENGTH(buffer));
6842            if (outsize > 0)
6843                break;
6844            err = GetLastError();
6845            if (err != ERROR_NO_UNICODE_TRANSLATION
6846                && err != ERROR_INSUFFICIENT_BUFFER)
6847            {
6848                PyErr_SetFromWindowsErr(0);
6849                goto error;
6850            }
6851            insize++;
6852        }
6853        /* 4=maximum length of a UTF-8 sequence */
6854        while (insize <= 4 && (in + insize) <= endin);
6855
6856        if (outsize <= 0) {
6857            Py_ssize_t startinpos, endinpos, outpos;
6858
6859            startinpos = in - startin;
6860            endinpos = startinpos + 1;
6861            outpos = out - PyUnicode_AS_UNICODE(*v);
6862            if (unicode_decode_call_errorhandler(
6863                    errors, &errorHandler,
6864                    encoding, reason,
6865                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
6866                    v, &outpos))
6867            {
6868                goto error;
6869            }
6870            out = PyUnicode_AS_UNICODE(*v) + outpos;
6871        }
6872        else {
6873            in += insize;
6874            memcpy(out, buffer, outsize * sizeof(wchar_t));
6875            out += outsize;
6876        }
6877    }
6878
6879    /* write a NUL character at the end */
6880    *out = 0;
6881
6882    /* Extend unicode object */
6883    outsize = out - startout;
6884    assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
6885    if (unicode_resize(v, outsize) < 0)
6886        goto error;
6887    ret = size;
6888
6889error:
6890    Py_XDECREF(encoding_obj);
6891    Py_XDECREF(errorHandler);
6892    Py_XDECREF(exc);
6893    return ret;
6894}
6895
6896static PyObject *
6897decode_code_page_stateful(int code_page,
6898                          const char *s, Py_ssize_t size,
6899                          const char *errors, Py_ssize_t *consumed)
6900{
6901    PyObject *v = NULL;
6902    int chunk_size, final, converted, done;
6903
6904    if (code_page < 0) {
6905        PyErr_SetString(PyExc_ValueError, "invalid code page number");
6906        return NULL;
6907    }
6908
6909    if (consumed)
6910        *consumed = 0;
6911
6912    do
6913    {
6914#ifdef NEED_RETRY
6915        if (size > INT_MAX) {
6916            chunk_size = INT_MAX;
6917            final = 0;
6918            done = 0;
6919        }
6920        else
6921#endif
6922        {
6923            chunk_size = (int)size;
6924            final = (consumed == NULL);
6925            done = 1;
6926        }
6927
6928        /* Skip trailing lead-byte unless 'final' is set */
6929        if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
6930            --chunk_size;
6931
6932        if (chunk_size == 0 && done) {
6933            if (v != NULL)
6934                break;
6935            Py_INCREF(unicode_empty);
6936            return unicode_empty;
6937        }
6938
6939
6940        converted = decode_code_page_strict(code_page, &v,
6941                                            s, chunk_size);
6942        if (converted == -2)
6943            converted = decode_code_page_errors(code_page, &v,
6944                                                s, chunk_size,
6945                                                errors);
6946        assert(converted != 0);
6947
6948        if (converted < 0) {
6949            Py_XDECREF(v);
6950            return NULL;
6951        }
6952
6953        if (consumed)
6954            *consumed += converted;
6955
6956        s += converted;
6957        size -= converted;
6958    } while (!done);
6959
6960    return unicode_result(v);
6961}
6962
6963PyObject *
6964PyUnicode_DecodeCodePageStateful(int code_page,
6965                                 const char *s,
6966                                 Py_ssize_t size,
6967                                 const char *errors,
6968                                 Py_ssize_t *consumed)
6969{
6970    return decode_code_page_stateful(code_page, s, size, errors, consumed);
6971}
6972
6973PyObject *
6974PyUnicode_DecodeMBCSStateful(const char *s,
6975                             Py_ssize_t size,
6976                             const char *errors,
6977                             Py_ssize_t *consumed)
6978{
6979    return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
6980}
6981
6982PyObject *
6983PyUnicode_DecodeMBCS(const char *s,
6984                     Py_ssize_t size,
6985                     const char *errors)
6986{
6987    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6988}
6989
6990static DWORD
6991encode_code_page_flags(UINT code_page, const char *errors)
6992{
6993    if (code_page == CP_UTF8) {
6994        if (winver.dwMajorVersion >= 6)
6995            /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
6996               and later */
6997            return WC_ERR_INVALID_CHARS;
6998        else
6999            /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7000            return 0;
7001    }
7002    else if (code_page == CP_UTF7) {
7003        /* CP_UTF7 only supports flags=0 */
7004        return 0;
7005    }
7006    else {
7007        if (errors != NULL && strcmp(errors, "replace") == 0)
7008            return 0;
7009        else
7010            return WC_NO_BEST_FIT_CHARS;
7011    }
7012}
7013
7014/*
7015 * Encode a Unicode string to a Windows code page into a byte string in strict
7016 * mode.
7017 *
7018 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7019 * a WindowsError and returns -1 on other error.
7020 */
7021static int
7022encode_code_page_strict(UINT code_page, PyObject **outbytes,
7023                        PyObject *unicode, Py_ssize_t offset, int len,
7024                        const char* errors)
7025{
7026    BOOL usedDefaultChar = FALSE;
7027    BOOL *pusedDefaultChar = &usedDefaultChar;
7028    int outsize;
7029    PyObject *exc = NULL;
7030    wchar_t *p;
7031    Py_ssize_t size;
7032    const DWORD flags = encode_code_page_flags(code_page, NULL);
7033    char *out;
7034    /* Create a substring so that we can get the UTF-16 representation
7035       of just the slice under consideration. */
7036    PyObject *substring;
7037
7038    assert(len > 0);
7039
7040    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7041        pusedDefaultChar = &usedDefaultChar;
7042    else
7043        pusedDefaultChar = NULL;
7044
7045    substring = PyUnicode_Substring(unicode, offset, offset+len);
7046    if (substring == NULL)
7047        return -1;
7048    p = PyUnicode_AsUnicodeAndSize(substring, &size);
7049    if (p == NULL) {
7050        Py_DECREF(substring);
7051        return -1;
7052    }
7053
7054    /* First get the size of the result */
7055    outsize = WideCharToMultiByte(code_page, flags,
7056                                  p, size,
7057                                  NULL, 0,
7058                                  NULL, pusedDefaultChar);
7059    if (outsize <= 0)
7060        goto error;
7061    /* If we used a default char, then we failed! */
7062    if (pusedDefaultChar && *pusedDefaultChar) {
7063        Py_DECREF(substring);
7064        return -2;
7065    }
7066
7067    if (*outbytes == NULL) {
7068        /* Create string object */
7069        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7070        if (*outbytes == NULL) {
7071            Py_DECREF(substring);
7072            return -1;
7073        }
7074        out = PyBytes_AS_STRING(*outbytes);
7075    }
7076    else {
7077        /* Extend string object */
7078        const Py_ssize_t n = PyBytes_Size(*outbytes);
7079        if (outsize > PY_SSIZE_T_MAX - n) {
7080            PyErr_NoMemory();
7081            Py_DECREF(substring);
7082            return -1;
7083        }
7084        if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7085            Py_DECREF(substring);
7086            return -1;
7087        }
7088        out = PyBytes_AS_STRING(*outbytes) + n;
7089    }
7090
7091    /* Do the conversion */
7092    outsize = WideCharToMultiByte(code_page, flags,
7093                                  p, size,
7094                                  out, outsize,
7095                                  NULL, pusedDefaultChar);
7096    Py_CLEAR(substring);
7097    if (outsize <= 0)
7098        goto error;
7099    if (pusedDefaultChar && *pusedDefaultChar)
7100        return -2;
7101    return 0;
7102
7103error:
7104    Py_XDECREF(substring);
7105    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7106        return -2;
7107    PyErr_SetFromWindowsErr(0);
7108    return -1;
7109}
7110
7111/*
7112 * Encode a Unicode string to a Windows code page into a byte string using a
7113 * error handler.
7114 *
7115 * Returns consumed characters if succeed, or raise a WindowsError and returns
7116 * -1 on other error.
7117 */
7118static int
7119encode_code_page_errors(UINT code_page, PyObject **outbytes,
7120                        PyObject *unicode, Py_ssize_t unicode_offset,
7121                        Py_ssize_t insize, const char* errors)
7122{
7123    const DWORD flags = encode_code_page_flags(code_page, errors);
7124    Py_ssize_t pos = unicode_offset;
7125    Py_ssize_t endin = unicode_offset + insize;
7126    /* Ideally, we should get reason from FormatMessage. This is the Windows
7127       2000 English version of the message. */
7128    const char *reason = "invalid character";
7129    /* 4=maximum length of a UTF-8 sequence */
7130    char buffer[4];
7131    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7132    Py_ssize_t outsize;
7133    char *out;
7134    PyObject *errorHandler = NULL;
7135    PyObject *exc = NULL;
7136    PyObject *encoding_obj = NULL;
7137    char *encoding;
7138    Py_ssize_t newpos, newoutsize;
7139    PyObject *rep;
7140    int ret = -1;
7141
7142    assert(insize > 0);
7143
7144    encoding = code_page_name(code_page, &encoding_obj);
7145    if (encoding == NULL)
7146        return -1;
7147
7148    if (errors == NULL || strcmp(errors, "strict") == 0) {
7149        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7150           then we raise a UnicodeEncodeError. */
7151        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
7152        if (exc != NULL) {
7153            PyCodec_StrictErrors(exc);
7154            Py_DECREF(exc);
7155        }
7156        Py_XDECREF(encoding_obj);
7157        return -1;
7158    }
7159
7160    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7161        pusedDefaultChar = &usedDefaultChar;
7162    else
7163        pusedDefaultChar = NULL;
7164
7165    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7166        PyErr_NoMemory();
7167        goto error;
7168    }
7169    outsize = insize * Py_ARRAY_LENGTH(buffer);
7170
7171    if (*outbytes == NULL) {
7172        /* Create string object */
7173        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7174        if (*outbytes == NULL)
7175            goto error;
7176        out = PyBytes_AS_STRING(*outbytes);
7177    }
7178    else {
7179        /* Extend string object */
7180        Py_ssize_t n = PyBytes_Size(*outbytes);
7181        if (n > PY_SSIZE_T_MAX - outsize) {
7182            PyErr_NoMemory();
7183            goto error;
7184        }
7185        if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7186            goto error;
7187        out = PyBytes_AS_STRING(*outbytes) + n;
7188    }
7189
7190    /* Encode the string character per character */
7191    while (pos < endin)
7192    {
7193        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7194        wchar_t chars[2];
7195        int charsize;
7196        if (ch < 0x10000) {
7197            chars[0] = (wchar_t)ch;
7198            charsize = 1;
7199        }
7200        else {
7201            ch -= 0x10000;
7202            chars[0] = 0xd800 + (ch >> 10);
7203            chars[1] = 0xdc00 + (ch & 0x3ff);
7204            charsize = 2;
7205        }
7206
7207        outsize = WideCharToMultiByte(code_page, flags,
7208                                      chars, charsize,
7209                                      buffer, Py_ARRAY_LENGTH(buffer),
7210                                      NULL, pusedDefaultChar);
7211        if (outsize > 0) {
7212            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7213            {
7214                pos++;
7215                memcpy(out, buffer, outsize);
7216                out += outsize;
7217                continue;
7218            }
7219        }
7220        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7221            PyErr_SetFromWindowsErr(0);
7222            goto error;
7223        }
7224
7225        rep = unicode_encode_call_errorhandler(
7226                  errors, &errorHandler, encoding, reason,
7227                  unicode, &exc,
7228                  pos, pos + 1, &newpos);
7229        if (rep == NULL)
7230            goto error;
7231        pos = newpos;
7232
7233        if (PyBytes_Check(rep)) {
7234            outsize = PyBytes_GET_SIZE(rep);
7235            if (outsize != 1) {
7236                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7237                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7238                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7239                    Py_DECREF(rep);
7240                    goto error;
7241                }
7242                out = PyBytes_AS_STRING(*outbytes) + offset;
7243            }
7244            memcpy(out, PyBytes_AS_STRING(rep), outsize);
7245            out += outsize;
7246        }
7247        else {
7248            Py_ssize_t i;
7249            enum PyUnicode_Kind kind;
7250            void *data;
7251
7252            if (PyUnicode_READY(rep) == -1) {
7253                Py_DECREF(rep);
7254                goto error;
7255            }
7256
7257            outsize = PyUnicode_GET_LENGTH(rep);
7258            if (outsize != 1) {
7259                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7260                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7261                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7262                    Py_DECREF(rep);
7263                    goto error;
7264                }
7265                out = PyBytes_AS_STRING(*outbytes) + offset;
7266            }
7267            kind = PyUnicode_KIND(rep);
7268            data = PyUnicode_DATA(rep);
7269            for (i=0; i < outsize; i++) {
7270                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7271                if (ch > 127) {
7272                    raise_encode_exception(&exc,
7273                        encoding, unicode,
7274                        pos, pos + 1,
7275                        "unable to encode error handler result to ASCII");
7276                    Py_DECREF(rep);
7277                    goto error;
7278                }
7279                *out = (unsigned char)ch;
7280                out++;
7281            }
7282        }
7283        Py_DECREF(rep);
7284    }
7285    /* write a NUL byte */
7286    *out = 0;
7287    outsize = out - PyBytes_AS_STRING(*outbytes);
7288    assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7289    if (_PyBytes_Resize(outbytes, outsize) < 0)
7290        goto error;
7291    ret = 0;
7292
7293error:
7294    Py_XDECREF(encoding_obj);
7295    Py_XDECREF(errorHandler);
7296    Py_XDECREF(exc);
7297    return ret;
7298}
7299
7300static PyObject *
7301encode_code_page(int code_page,
7302                 PyObject *unicode,
7303                 const char *errors)
7304{
7305    Py_ssize_t len;
7306    PyObject *outbytes = NULL;
7307    Py_ssize_t offset;
7308    int chunk_len, ret, done;
7309
7310    if (PyUnicode_READY(unicode) == -1)
7311        return NULL;
7312    len = PyUnicode_GET_LENGTH(unicode);
7313
7314    if (code_page < 0) {
7315        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7316        return NULL;
7317    }
7318
7319    if (len == 0)
7320        return PyBytes_FromStringAndSize(NULL, 0);
7321
7322    offset = 0;
7323    do
7324    {
7325#ifdef NEED_RETRY
7326        /* UTF-16 encoding may double the size, so use only INT_MAX/2
7327           chunks. */
7328        if (len > INT_MAX/2) {
7329            chunk_len = INT_MAX/2;
7330            done = 0;
7331        }
7332        else
7333#endif
7334        {
7335            chunk_len = (int)len;
7336            done = 1;
7337        }
7338
7339        ret = encode_code_page_strict(code_page, &outbytes,
7340                                      unicode, offset, chunk_len,
7341                                      errors);
7342        if (ret == -2)
7343            ret = encode_code_page_errors(code_page, &outbytes,
7344                                          unicode, offset,
7345                                          chunk_len, errors);
7346        if (ret < 0) {
7347            Py_XDECREF(outbytes);
7348            return NULL;
7349        }
7350
7351        offset += chunk_len;
7352        len -= chunk_len;
7353    } while (!done);
7354
7355    return outbytes;
7356}
7357
7358PyObject *
7359PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7360                     Py_ssize_t size,
7361                     const char *errors)
7362{
7363    PyObject *unicode, *res;
7364    unicode = PyUnicode_FromUnicode(p, size);
7365    if (unicode == NULL)
7366        return NULL;
7367    res = encode_code_page(CP_ACP, unicode, errors);
7368    Py_DECREF(unicode);
7369    return res;
7370}
7371
7372PyObject *
7373PyUnicode_EncodeCodePage(int code_page,
7374                         PyObject *unicode,
7375                         const char *errors)
7376{
7377    return encode_code_page(code_page, unicode, errors);
7378}
7379
7380PyObject *
7381PyUnicode_AsMBCSString(PyObject *unicode)
7382{
7383    if (!PyUnicode_Check(unicode)) {
7384        PyErr_BadArgument();
7385        return NULL;
7386    }
7387    return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
7388}
7389
7390#undef NEED_RETRY
7391
7392#endif /* HAVE_MBCS */
7393
7394/* --- Character Mapping Codec -------------------------------------------- */
7395
7396PyObject *
7397PyUnicode_DecodeCharmap(const char *s,
7398                        Py_ssize_t size,
7399                        PyObject *mapping,
7400                        const char *errors)
7401{
7402    const char *starts = s;
7403    Py_ssize_t startinpos;
7404    Py_ssize_t endinpos;
7405    Py_ssize_t outpos;
7406    const char *e;
7407    PyObject *v;
7408    Py_ssize_t extrachars = 0;
7409    PyObject *errorHandler = NULL;
7410    PyObject *exc = NULL;
7411
7412    /* Default to Latin-1 */
7413    if (mapping == NULL)
7414        return PyUnicode_DecodeLatin1(s, size, errors);
7415
7416    v = PyUnicode_New(size, 127);
7417    if (v == NULL)
7418        goto onError;
7419    if (size == 0)
7420        return v;
7421    outpos = 0;
7422    e = s + size;
7423    if (PyUnicode_CheckExact(mapping)) {
7424        Py_ssize_t maplen;
7425        enum PyUnicode_Kind mapkind;
7426        void *mapdata;
7427        Py_UCS4 x;
7428
7429        if (PyUnicode_READY(mapping) == -1)
7430            return NULL;
7431
7432        maplen = PyUnicode_GET_LENGTH(mapping);
7433        mapdata = PyUnicode_DATA(mapping);
7434        mapkind = PyUnicode_KIND(mapping);
7435        while (s < e) {
7436            unsigned char ch;
7437            if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7438                enum PyUnicode_Kind outkind = PyUnicode_KIND(v);
7439                if (outkind == PyUnicode_1BYTE_KIND) {
7440                    void *outdata = PyUnicode_DATA(v);
7441                    Py_UCS4 maxchar = PyUnicode_MAX_CHAR_VALUE(v);
7442                    while (s < e) {
7443                        unsigned char ch = *s;
7444                        x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7445                        if (x > maxchar)
7446                            goto Error;
7447                        PyUnicode_WRITE(PyUnicode_1BYTE_KIND, outdata, outpos++, x);
7448                        ++s;
7449                    }
7450                    break;
7451                }
7452                else if (outkind == PyUnicode_2BYTE_KIND) {
7453                    void *outdata = PyUnicode_DATA(v);
7454                    while (s < e) {
7455                        unsigned char ch = *s;
7456                        x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7457                        if (x == 0xFFFE)
7458                            goto Error;
7459                        PyUnicode_WRITE(PyUnicode_2BYTE_KIND, outdata, outpos++, x);
7460                        ++s;
7461                    }
7462                    break;
7463                }
7464            }
7465            ch = *s;
7466
7467            if (ch < maplen)
7468                x = PyUnicode_READ(mapkind, mapdata, ch);
7469            else
7470                x = 0xfffe; /* invalid value */
7471Error:
7472            if (x == 0xfffe)
7473            {
7474                /* undefined mapping */
7475                startinpos = s-starts;
7476                endinpos = startinpos+1;
7477                if (unicode_decode_call_errorhandler(
7478                        errors, &errorHandler,
7479                        "charmap", "character maps to <undefined>",
7480                        &starts, &e, &startinpos, &endinpos, &exc, &s,
7481                        &v, &outpos)) {
7482                    goto onError;
7483                }
7484                continue;
7485            }
7486
7487            if (unicode_putchar(&v, &outpos, x) < 0)
7488                goto onError;
7489            ++s;
7490        }
7491    }
7492    else {
7493        while (s < e) {
7494            unsigned char ch = *s;
7495            PyObject *w, *x;
7496
7497            /* Get mapping (char ordinal -> integer, Unicode char or None) */
7498            w = PyLong_FromLong((long)ch);
7499            if (w == NULL)
7500                goto onError;
7501            x = PyObject_GetItem(mapping, w);
7502            Py_DECREF(w);
7503            if (x == NULL) {
7504                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7505                    /* No mapping found means: mapping is undefined. */
7506                    PyErr_Clear();
7507                    x = Py_None;
7508                    Py_INCREF(x);
7509                } else
7510                    goto onError;
7511            }
7512
7513            /* Apply mapping */
7514            if (PyLong_Check(x)) {
7515                long value = PyLong_AS_LONG(x);
7516                if (value < 0 || value > MAX_UNICODE) {
7517                    PyErr_Format(PyExc_TypeError,
7518                                 "character mapping must be in range(0x%lx)",
7519                                 (unsigned long)MAX_UNICODE + 1);
7520                    Py_DECREF(x);
7521                    goto onError;
7522                }
7523                if (unicode_putchar(&v, &outpos, value) < 0)
7524                    goto onError;
7525            }
7526            else if (x == Py_None) {
7527                /* undefined mapping */
7528                startinpos = s-starts;
7529                endinpos = startinpos+1;
7530                if (unicode_decode_call_errorhandler(
7531                        errors, &errorHandler,
7532                        "charmap", "character maps to <undefined>",
7533                        &starts, &e, &startinpos, &endinpos, &exc, &s,
7534                        &v, &outpos)) {
7535                    Py_DECREF(x);
7536                    goto onError;
7537                }
7538                Py_DECREF(x);
7539                continue;
7540            }
7541            else if (PyUnicode_Check(x)) {
7542                Py_ssize_t targetsize;
7543
7544                if (PyUnicode_READY(x) == -1)
7545                    goto onError;
7546                targetsize = PyUnicode_GET_LENGTH(x);
7547
7548                if (targetsize == 1) {
7549                    /* 1-1 mapping */
7550                    if (unicode_putchar(&v, &outpos,
7551                                        PyUnicode_READ_CHAR(x, 0)) < 0)
7552                        goto onError;
7553                }
7554                else if (targetsize > 1) {
7555                    /* 1-n mapping */
7556                    if (targetsize > extrachars) {
7557                        /* resize first */
7558                        Py_ssize_t needed = (targetsize - extrachars) + \
7559                            (targetsize << 2);
7560                        extrachars += needed;
7561                        /* XXX overflow detection missing */
7562                        if (unicode_resize(&v,
7563                                           PyUnicode_GET_LENGTH(v) + needed) < 0)
7564                        {
7565                            Py_DECREF(x);
7566                            goto onError;
7567                        }
7568                    }
7569                    if (unicode_widen(&v, outpos, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7570                        goto onError;
7571                    PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7572                    outpos += targetsize;
7573                    extrachars -= targetsize;
7574                }
7575                /* 1-0 mapping: skip the character */
7576            }
7577            else {
7578                /* wrong return value */
7579                PyErr_SetString(PyExc_TypeError,
7580                                "character mapping must return integer, None or str");
7581                Py_DECREF(x);
7582                goto onError;
7583            }
7584            Py_DECREF(x);
7585            ++s;
7586        }
7587    }
7588    if (unicode_resize(&v, outpos) < 0)
7589        goto onError;
7590    Py_XDECREF(errorHandler);
7591    Py_XDECREF(exc);
7592    return unicode_result(v);
7593
7594  onError:
7595    Py_XDECREF(errorHandler);
7596    Py_XDECREF(exc);
7597    Py_XDECREF(v);
7598    return NULL;
7599}
7600
7601/* Charmap encoding: the lookup table */
7602
7603struct encoding_map {
7604    PyObject_HEAD
7605    unsigned char level1[32];
7606    int count2, count3;
7607    unsigned char level23[1];
7608};
7609
7610static PyObject*
7611encoding_map_size(PyObject *obj, PyObject* args)
7612{
7613    struct encoding_map *map = (struct encoding_map*)obj;
7614    return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
7615                           128*map->count3);
7616}
7617
7618static PyMethodDef encoding_map_methods[] = {
7619    {"size", encoding_map_size, METH_NOARGS,
7620     PyDoc_STR("Return the size (in bytes) of this object") },
7621    { 0 }
7622};
7623
7624static void
7625encoding_map_dealloc(PyObject* o)
7626{
7627    PyObject_FREE(o);
7628}
7629
7630static PyTypeObject EncodingMapType = {
7631    PyVarObject_HEAD_INIT(NULL, 0)
7632    "EncodingMap",          /*tp_name*/
7633    sizeof(struct encoding_map),   /*tp_basicsize*/
7634    0,                      /*tp_itemsize*/
7635    /* methods */
7636    encoding_map_dealloc,   /*tp_dealloc*/
7637    0,                      /*tp_print*/
7638    0,                      /*tp_getattr*/
7639    0,                      /*tp_setattr*/
7640    0,                      /*tp_reserved*/
7641    0,                      /*tp_repr*/
7642    0,                      /*tp_as_number*/
7643    0,                      /*tp_as_sequence*/
7644    0,                      /*tp_as_mapping*/
7645    0,                      /*tp_hash*/
7646    0,                      /*tp_call*/
7647    0,                      /*tp_str*/
7648    0,                      /*tp_getattro*/
7649    0,                      /*tp_setattro*/
7650    0,                      /*tp_as_buffer*/
7651    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
7652    0,                      /*tp_doc*/
7653    0,                      /*tp_traverse*/
7654    0,                      /*tp_clear*/
7655    0,                      /*tp_richcompare*/
7656    0,                      /*tp_weaklistoffset*/
7657    0,                      /*tp_iter*/
7658    0,                      /*tp_iternext*/
7659    encoding_map_methods,   /*tp_methods*/
7660    0,                      /*tp_members*/
7661    0,                      /*tp_getset*/
7662    0,                      /*tp_base*/
7663    0,                      /*tp_dict*/
7664    0,                      /*tp_descr_get*/
7665    0,                      /*tp_descr_set*/
7666    0,                      /*tp_dictoffset*/
7667    0,                      /*tp_init*/
7668    0,                      /*tp_alloc*/
7669    0,                      /*tp_new*/
7670    0,                      /*tp_free*/
7671    0,                      /*tp_is_gc*/
7672};
7673
7674PyObject*
7675PyUnicode_BuildEncodingMap(PyObject* string)
7676{
7677    PyObject *result;
7678    struct encoding_map *mresult;
7679    int i;
7680    int need_dict = 0;
7681    unsigned char level1[32];
7682    unsigned char level2[512];
7683    unsigned char *mlevel1, *mlevel2, *mlevel3;
7684    int count2 = 0, count3 = 0;
7685    int kind;
7686    void *data;
7687    Py_ssize_t length;
7688    Py_UCS4 ch;
7689
7690    if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
7691        PyErr_BadArgument();
7692        return NULL;
7693    }
7694    kind = PyUnicode_KIND(string);
7695    data = PyUnicode_DATA(string);
7696    length = PyUnicode_GET_LENGTH(string);
7697    length = Py_MIN(length, 256);
7698    memset(level1, 0xFF, sizeof level1);
7699    memset(level2, 0xFF, sizeof level2);
7700
7701    /* If there isn't a one-to-one mapping of NULL to \0,
7702       or if there are non-BMP characters, we need to use
7703       a mapping dictionary. */
7704    if (PyUnicode_READ(kind, data, 0) != 0)
7705        need_dict = 1;
7706    for (i = 1; i < length; i++) {
7707        int l1, l2;
7708        ch = PyUnicode_READ(kind, data, i);
7709        if (ch == 0 || ch > 0xFFFF) {
7710            need_dict = 1;
7711            break;
7712        }
7713        if (ch == 0xFFFE)
7714            /* unmapped character */
7715            continue;
7716        l1 = ch >> 11;
7717        l2 = ch >> 7;
7718        if (level1[l1] == 0xFF)
7719            level1[l1] = count2++;
7720        if (level2[l2] == 0xFF)
7721            level2[l2] = count3++;
7722    }
7723
7724    if (count2 >= 0xFF || count3 >= 0xFF)
7725        need_dict = 1;
7726
7727    if (need_dict) {
7728        PyObject *result = PyDict_New();
7729        PyObject *key, *value;
7730        if (!result)
7731            return NULL;
7732        for (i = 0; i < length; i++) {
7733            key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
7734            value = PyLong_FromLong(i);
7735            if (!key || !value)
7736                goto failed1;
7737            if (PyDict_SetItem(result, key, value) == -1)
7738                goto failed1;
7739            Py_DECREF(key);
7740            Py_DECREF(value);
7741        }
7742        return result;
7743      failed1:
7744        Py_XDECREF(key);
7745        Py_XDECREF(value);
7746        Py_DECREF(result);
7747        return NULL;
7748    }
7749
7750    /* Create a three-level trie */
7751    result = PyObject_MALLOC(sizeof(struct encoding_map) +
7752                             16*count2 + 128*count3 - 1);
7753    if (!result)
7754        return PyErr_NoMemory();
7755    PyObject_Init(result, &EncodingMapType);
7756    mresult = (struct encoding_map*)result;
7757    mresult->count2 = count2;
7758    mresult->count3 = count3;
7759    mlevel1 = mresult->level1;
7760    mlevel2 = mresult->level23;
7761    mlevel3 = mresult->level23 + 16*count2;
7762    memcpy(mlevel1, level1, 32);
7763    memset(mlevel2, 0xFF, 16*count2);
7764    memset(mlevel3, 0, 128*count3);
7765    count3 = 0;
7766    for (i = 1; i < length; i++) {
7767        int o1, o2, o3, i2, i3;
7768        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7769        if (ch == 0xFFFE)
7770            /* unmapped character */
7771            continue;
7772        o1 = ch>>11;
7773        o2 = (ch>>7) & 0xF;
7774        i2 = 16*mlevel1[o1] + o2;
7775        if (mlevel2[i2] == 0xFF)
7776            mlevel2[i2] = count3++;
7777        o3 = ch & 0x7F;
7778        i3 = 128*mlevel2[i2] + o3;
7779        mlevel3[i3] = i;
7780    }
7781    return result;
7782}
7783
7784static int
7785encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
7786{
7787    struct encoding_map *map = (struct encoding_map*)mapping;
7788    int l1 = c>>11;
7789    int l2 = (c>>7) & 0xF;
7790    int l3 = c & 0x7F;
7791    int i;
7792
7793    if (c > 0xFFFF)
7794        return -1;
7795    if (c == 0)
7796        return 0;
7797    /* level 1*/
7798    i = map->level1[l1];
7799    if (i == 0xFF) {
7800        return -1;
7801    }
7802    /* level 2*/
7803    i = map->level23[16*i+l2];
7804    if (i == 0xFF) {
7805        return -1;
7806    }
7807    /* level 3 */
7808    i = map->level23[16*map->count2 + 128*i + l3];
7809    if (i == 0) {
7810        return -1;
7811    }
7812    return i;
7813}
7814
7815/* Lookup the character ch in the mapping. If the character
7816   can't be found, Py_None is returned (or NULL, if another
7817   error occurred). */
7818static PyObject *
7819charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
7820{
7821    PyObject *w = PyLong_FromLong((long)c);
7822    PyObject *x;
7823
7824    if (w == NULL)
7825        return NULL;
7826    x = PyObject_GetItem(mapping, w);
7827    Py_DECREF(w);
7828    if (x == NULL) {
7829        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7830            /* No mapping found means: mapping is undefined. */
7831            PyErr_Clear();
7832            x = Py_None;
7833            Py_INCREF(x);
7834            return x;
7835        } else
7836            return NULL;
7837    }
7838    else if (x == Py_None)
7839        return x;
7840    else if (PyLong_Check(x)) {
7841        long value = PyLong_AS_LONG(x);
7842        if (value < 0 || value > 255) {
7843            PyErr_SetString(PyExc_TypeError,
7844                            "character mapping must be in range(256)");
7845            Py_DECREF(x);
7846            return NULL;
7847        }
7848        return x;
7849    }
7850    else if (PyBytes_Check(x))
7851        return x;
7852    else {
7853        /* wrong return value */
7854        PyErr_Format(PyExc_TypeError,
7855                     "character mapping must return integer, bytes or None, not %.400s",
7856                     x->ob_type->tp_name);
7857        Py_DECREF(x);
7858        return NULL;
7859    }
7860}
7861
7862static int
7863charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
7864{
7865    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7866    /* exponentially overallocate to minimize reallocations */
7867    if (requiredsize < 2*outsize)
7868        requiredsize = 2*outsize;
7869    if (_PyBytes_Resize(outobj, requiredsize))
7870        return -1;
7871    return 0;
7872}
7873
/* Outcome of trying to encode one character with a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an exception is set */
} charmapencode_result;
7877/* lookup the character, put the result in the output string and adjust
7878   various state variables. Resize the output bytes object if not enough
7879   space is available. Return a new reference to the object that
7880   was put in the output buffer, or Py_None, if the mapping was undefined
7881   (in which case no character was written) or NULL, if a
7882   reallocation error occurred. The caller must decref the result */
7883static charmapencode_result
7884charmapencode_output(Py_UCS4 c, PyObject *mapping,
7885                     PyObject **outobj, Py_ssize_t *outpos)
7886{
7887    PyObject *rep;
7888    char *outstart;
7889    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7890
7891    if (Py_TYPE(mapping) == &EncodingMapType) {
7892        int res = encoding_map_lookup(c, mapping);
7893        Py_ssize_t requiredsize = *outpos+1;
7894        if (res == -1)
7895            return enc_FAILED;
7896        if (outsize<requiredsize)
7897            if (charmapencode_resize(outobj, outpos, requiredsize))
7898                return enc_EXCEPTION;
7899        outstart = PyBytes_AS_STRING(*outobj);
7900        outstart[(*outpos)++] = (char)res;
7901        return enc_SUCCESS;
7902    }
7903
7904    rep = charmapencode_lookup(c, mapping);
7905    if (rep==NULL)
7906        return enc_EXCEPTION;
7907    else if (rep==Py_None) {
7908        Py_DECREF(rep);
7909        return enc_FAILED;
7910    } else {
7911        if (PyLong_Check(rep)) {
7912            Py_ssize_t requiredsize = *outpos+1;
7913            if (outsize<requiredsize)
7914                if (charmapencode_resize(outobj, outpos, requiredsize)) {
7915                    Py_DECREF(rep);
7916                    return enc_EXCEPTION;
7917                }
7918            outstart = PyBytes_AS_STRING(*outobj);
7919            outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
7920        }
7921        else {
7922            const char *repchars = PyBytes_AS_STRING(rep);
7923            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7924            Py_ssize_t requiredsize = *outpos+repsize;
7925            if (outsize<requiredsize)
7926                if (charmapencode_resize(outobj, outpos, requiredsize)) {
7927                    Py_DECREF(rep);
7928                    return enc_EXCEPTION;
7929                }
7930            outstart = PyBytes_AS_STRING(*outobj);
7931            memcpy(outstart + *outpos, repchars, repsize);
7932            *outpos += repsize;
7933        }
7934    }
7935    Py_DECREF(rep);
7936    return enc_SUCCESS;
7937}
7938
7939/* handle an error in PyUnicode_EncodeCharmap
7940   Return 0 on success, -1 on error */
7941static int
7942charmap_encoding_error(
7943    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
7944    PyObject **exceptionObject,
7945    int *known_errorHandler, PyObject **errorHandler, const char *errors,
7946    PyObject **res, Py_ssize_t *respos)
7947{
7948    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7949    Py_ssize_t size, repsize;
7950    Py_ssize_t newpos;
7951    enum PyUnicode_Kind kind;
7952    void *data;
7953    Py_ssize_t index;
7954    /* startpos for collecting unencodable chars */
7955    Py_ssize_t collstartpos = *inpos;
7956    Py_ssize_t collendpos = *inpos+1;
7957    Py_ssize_t collpos;
7958    char *encoding = "charmap";
7959    char *reason = "character maps to <undefined>";
7960    charmapencode_result x;
7961    Py_UCS4 ch;
7962    int val;
7963
7964    if (PyUnicode_READY(unicode) == -1)
7965        return -1;
7966    size = PyUnicode_GET_LENGTH(unicode);
7967    /* find all unencodable characters */
7968    while (collendpos < size) {
7969        PyObject *rep;
7970        if (Py_TYPE(mapping) == &EncodingMapType) {
7971            ch = PyUnicode_READ_CHAR(unicode, collendpos);
7972            val = encoding_map_lookup(ch, mapping);
7973            if (val != -1)
7974                break;
7975            ++collendpos;
7976            continue;
7977        }
7978
7979        ch = PyUnicode_READ_CHAR(unicode, collendpos);
7980        rep = charmapencode_lookup(ch, mapping);
7981        if (rep==NULL)
7982            return -1;
7983        else if (rep!=Py_None) {
7984            Py_DECREF(rep);
7985            break;
7986        }
7987        Py_DECREF(rep);
7988        ++collendpos;
7989    }
7990    /* cache callback name lookup
7991     * (if not done yet, i.e. it's the first error) */
7992    if (*known_errorHandler==-1) {
7993        if ((errors==NULL) || (!strcmp(errors, "strict")))
7994            *known_errorHandler = 1;
7995        else if (!strcmp(errors, "replace"))
7996            *known_errorHandler = 2;
7997        else if (!strcmp(errors, "ignore"))
7998            *known_errorHandler = 3;
7999        else if (!strcmp(errors, "xmlcharrefreplace"))
8000            *known_errorHandler = 4;
8001        else
8002            *known_errorHandler = 0;
8003    }
8004    switch (*known_errorHandler) {
8005    case 1: /* strict */
8006        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8007        return -1;
8008    case 2: /* replace */
8009        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8010            x = charmapencode_output('?', mapping, res, respos);
8011            if (x==enc_EXCEPTION) {
8012                return -1;
8013            }
8014            else if (x==enc_FAILED) {
8015                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8016                return -1;
8017            }
8018        }
8019        /* fall through */
8020    case 3: /* ignore */
8021        *inpos = collendpos;
8022        break;
8023    case 4: /* xmlcharrefreplace */
8024        /* generate replacement (temporarily (mis)uses p) */
8025        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8026            char buffer[2+29+1+1];
8027            char *cp;
8028            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8029            for (cp = buffer; *cp; ++cp) {
8030                x = charmapencode_output(*cp, mapping, res, respos);
8031                if (x==enc_EXCEPTION)
8032                    return -1;
8033                else if (x==enc_FAILED) {
8034                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8035                    return -1;
8036                }
8037            }
8038        }
8039        *inpos = collendpos;
8040        break;
8041    default:
8042        repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
8043                                                      encoding, reason, unicode, exceptionObject,
8044                                                      collstartpos, collendpos, &newpos);
8045        if (repunicode == NULL)
8046            return -1;
8047        if (PyBytes_Check(repunicode)) {
8048            /* Directly copy bytes result to output. */
8049            Py_ssize_t outsize = PyBytes_Size(*res);
8050            Py_ssize_t requiredsize;
8051            repsize = PyBytes_Size(repunicode);
8052            requiredsize = *respos + repsize;
8053            if (requiredsize > outsize)
8054                /* Make room for all additional bytes. */
8055                if (charmapencode_resize(res, respos, requiredsize)) {
8056                    Py_DECREF(repunicode);
8057                    return -1;
8058                }
8059            memcpy(PyBytes_AsString(*res) + *respos,
8060                   PyBytes_AsString(repunicode),  repsize);
8061            *respos += repsize;
8062            *inpos = newpos;
8063            Py_DECREF(repunicode);
8064            break;
8065        }
8066        /* generate replacement  */
8067        if (PyUnicode_READY(repunicode) == -1) {
8068            Py_DECREF(repunicode);
8069            return -1;
8070        }
8071        repsize = PyUnicode_GET_LENGTH(repunicode);
8072        data = PyUnicode_DATA(repunicode);
8073        kind = PyUnicode_KIND(repunicode);
8074        for (index = 0; index < repsize; index++) {
8075            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8076            x = charmapencode_output(repch, mapping, res, respos);
8077            if (x==enc_EXCEPTION) {
8078                Py_DECREF(repunicode);
8079                return -1;
8080            }
8081            else if (x==enc_FAILED) {
8082                Py_DECREF(repunicode);
8083                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8084                return -1;
8085            }
8086        }
8087        *inpos = newpos;
8088        Py_DECREF(repunicode);
8089    }
8090    return 0;
8091}
8092
8093PyObject *
8094_PyUnicode_EncodeCharmap(PyObject *unicode,
8095                         PyObject *mapping,
8096                         const char *errors)
8097{
8098    /* output object */
8099    PyObject *res = NULL;
8100    /* current input position */
8101    Py_ssize_t inpos = 0;
8102    Py_ssize_t size;
8103    /* current output position */
8104    Py_ssize_t respos = 0;
8105    PyObject *errorHandler = NULL;
8106    PyObject *exc = NULL;
8107    /* the following variable is used for caching string comparisons
8108     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8109     * 3=ignore, 4=xmlcharrefreplace */
8110    int known_errorHandler = -1;
8111
8112    if (PyUnicode_READY(unicode) == -1)
8113        return NULL;
8114    size = PyUnicode_GET_LENGTH(unicode);
8115
8116    /* Default to Latin-1 */
8117    if (mapping == NULL)
8118        return unicode_encode_ucs1(unicode, errors, 256);
8119
8120    /* allocate enough for a simple encoding without
8121       replacements, if we need more, we'll resize */
8122    res = PyBytes_FromStringAndSize(NULL, size);
8123    if (res == NULL)
8124        goto onError;
8125    if (size == 0)
8126        return res;
8127
8128    while (inpos<size) {
8129        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
8130        /* try to encode it */
8131        charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
8132        if (x==enc_EXCEPTION) /* error */
8133            goto onError;
8134        if (x==enc_FAILED) { /* unencodable character */
8135            if (charmap_encoding_error(unicode, &inpos, mapping,
8136                                       &exc,
8137                                       &known_errorHandler, &errorHandler, errors,
8138                                       &res, &respos)) {
8139                goto onError;
8140            }
8141        }
8142        else
8143            /* done with this character => adjust input position */
8144            ++inpos;
8145    }
8146
8147    /* Resize if we allocated to much */
8148    if (respos<PyBytes_GET_SIZE(res))
8149        if (_PyBytes_Resize(&res, respos) < 0)
8150            goto onError;
8151
8152    Py_XDECREF(exc);
8153    Py_XDECREF(errorHandler);
8154    return res;
8155
8156  onError:
8157    Py_XDECREF(res);
8158    Py_XDECREF(exc);
8159    Py_XDECREF(errorHandler);
8160    return NULL;
8161}
8162
8163/* Deprecated */
8164PyObject *
8165PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8166                        Py_ssize_t size,
8167                        PyObject *mapping,
8168                        const char *errors)
8169{
8170    PyObject *result;
8171    PyObject *unicode = PyUnicode_FromUnicode(p, size);
8172    if (unicode == NULL)
8173        return NULL;
8174    result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8175    Py_DECREF(unicode);
8176    return result;
8177}
8178
8179PyObject *
8180PyUnicode_AsCharmapString(PyObject *unicode,
8181                          PyObject *mapping)
8182{
8183    if (!PyUnicode_Check(unicode) || mapping == NULL) {
8184        PyErr_BadArgument();
8185        return NULL;
8186    }
8187    return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
8188}
8189
8190/* create or adjust a UnicodeTranslateError */
8191static void
8192make_translate_exception(PyObject **exceptionObject,
8193                         PyObject *unicode,
8194                         Py_ssize_t startpos, Py_ssize_t endpos,
8195                         const char *reason)
8196{
8197    if (*exceptionObject == NULL) {
8198        *exceptionObject = _PyUnicodeTranslateError_Create(
8199            unicode, startpos, endpos, reason);
8200    }
8201    else {
8202        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8203            goto onError;
8204        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8205            goto onError;
8206        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8207            goto onError;
8208        return;
8209      onError:
8210        Py_DECREF(*exceptionObject);
8211        *exceptionObject = NULL;
8212    }
8213}
8214
8215/* raises a UnicodeTranslateError */
8216static void
8217raise_translate_exception(PyObject **exceptionObject,
8218                          PyObject *unicode,
8219                          Py_ssize_t startpos, Py_ssize_t endpos,
8220                          const char *reason)
8221{
8222    make_translate_exception(exceptionObject,
8223                             unicode, startpos, endpos, reason);
8224    if (*exceptionObject != NULL)
8225        PyCodec_StrictErrors(*exceptionObject);
8226}
8227
8228/* error handling callback helper:
8229   build arguments, call the callback and check the arguments,
8230   put the result into newpos and return the replacement string, which
8231   has to be freed by the caller */
8232static PyObject *
8233unicode_translate_call_errorhandler(const char *errors,
8234                                    PyObject **errorHandler,
8235                                    const char *reason,
8236                                    PyObject *unicode, PyObject **exceptionObject,
8237                                    Py_ssize_t startpos, Py_ssize_t endpos,
8238                                    Py_ssize_t *newpos)
8239{
8240    static char *argparse = "O!n;translating error handler must return (str, int) tuple";
8241
8242    Py_ssize_t i_newpos;
8243    PyObject *restuple;
8244    PyObject *resunicode;
8245
8246    if (*errorHandler == NULL) {
8247        *errorHandler = PyCodec_LookupError(errors);
8248        if (*errorHandler == NULL)
8249            return NULL;
8250    }
8251
8252    make_translate_exception(exceptionObject,
8253                             unicode, startpos, endpos, reason);
8254    if (*exceptionObject == NULL)
8255        return NULL;
8256
8257    restuple = PyObject_CallFunctionObjArgs(
8258        *errorHandler, *exceptionObject, NULL);
8259    if (restuple == NULL)
8260        return NULL;
8261    if (!PyTuple_Check(restuple)) {
8262        PyErr_SetString(PyExc_TypeError, &argparse[4]);
8263        Py_DECREF(restuple);
8264        return NULL;
8265    }
8266    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
8267                          &resunicode, &i_newpos)) {
8268        Py_DECREF(restuple);
8269        return NULL;
8270    }
8271    if (i_newpos<0)
8272        *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
8273    else
8274        *newpos = i_newpos;
8275    if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
8276        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8277        Py_DECREF(restuple);
8278        return NULL;
8279    }
8280    Py_INCREF(resunicode);
8281    Py_DECREF(restuple);
8282    return resunicode;
8283}
8284
8285/* Lookup the character ch in the mapping and put the result in result,
8286   which must be decrefed by the caller.
8287   Return 0 on success, -1 on error */
8288static int
8289charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
8290{
8291    PyObject *w = PyLong_FromLong((long)c);
8292    PyObject *x;
8293
8294    if (w == NULL)
8295        return -1;
8296    x = PyObject_GetItem(mapping, w);
8297    Py_DECREF(w);
8298    if (x == NULL) {
8299        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8300            /* No mapping found means: use 1:1 mapping. */
8301            PyErr_Clear();
8302            *result = NULL;
8303            return 0;
8304        } else
8305            return -1;
8306    }
8307    else if (x == Py_None) {
8308        *result = x;
8309        return 0;
8310    }
8311    else if (PyLong_Check(x)) {
8312        long value = PyLong_AS_LONG(x);
8313        long max = PyUnicode_GetMax();
8314        if (value < 0 || value > max) {
8315            PyErr_Format(PyExc_TypeError,
8316                         "character mapping must be in range(0x%x)", max+1);
8317            Py_DECREF(x);
8318            return -1;
8319        }
8320        *result = x;
8321        return 0;
8322    }
8323    else if (PyUnicode_Check(x)) {
8324        *result = x;
8325        return 0;
8326    }
8327    else {
8328        /* wrong return value */
8329        PyErr_SetString(PyExc_TypeError,
8330                        "character mapping must return integer, None or str");
8331        Py_DECREF(x);
8332        return -1;
8333    }
8334}
8335/* ensure that *outobj is at least requiredsize characters long,
8336   if not reallocate and adjust various state variables.
8337   Return 0 on success, -1 on error */
8338static int
8339charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
8340                               Py_ssize_t requiredsize)
8341{
8342    Py_ssize_t oldsize = *psize;
8343    Py_UCS4 *new_outobj;
8344    if (requiredsize > oldsize) {
8345        /* exponentially overallocate to minimize reallocations */
8346        if (requiredsize < 2 * oldsize)
8347            requiredsize = 2 * oldsize;
8348        new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8349        if (new_outobj == 0)
8350            return -1;
8351        *outobj = new_outobj;
8352        *psize = requiredsize;
8353    }
8354    return 0;
8355}
8356/* lookup the character, put the result in the output string and adjust
8357   various state variables. Return a new reference to the object that
8358   was put in the output buffer in *result, or Py_None, if the mapping was
8359   undefined (in which case no character was written).
8360   The called must decref result.
8361   Return 0 on success, -1 on error. */
8362static int
8363charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8364                        PyObject *mapping, Py_UCS4 **output,
8365                        Py_ssize_t *osize, Py_ssize_t *opos,
8366                        PyObject **res)
8367{
8368    Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8369    if (charmaptranslate_lookup(curinp, mapping, res))
8370        return -1;
8371    if (*res==NULL) {
8372        /* not found => default to 1:1 mapping */
8373        (*output)[(*opos)++] = curinp;
8374    }
8375    else if (*res==Py_None)
8376        ;
8377    else if (PyLong_Check(*res)) {
8378        /* no overflow check, because we know that the space is enough */
8379        (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
8380    }
8381    else if (PyUnicode_Check(*res)) {
8382        Py_ssize_t repsize;
8383        if (PyUnicode_READY(*res) == -1)
8384            return -1;
8385        repsize = PyUnicode_GET_LENGTH(*res);
8386        if (repsize==1) {
8387            /* no overflow check, because we know that the space is enough */
8388            (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
8389        }
8390        else if (repsize!=0) {
8391            /* more than one character */
8392            Py_ssize_t requiredsize = *opos +
8393                (PyUnicode_GET_LENGTH(input) - ipos) +
8394                repsize - 1;
8395            Py_ssize_t i;
8396            if (charmaptranslate_makespace(output, osize, requiredsize))
8397                return -1;
8398            for(i = 0; i < repsize; i++)
8399                (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
8400        }
8401    }
8402    else
8403        return -1;
8404    return 0;
8405}
8406
8407PyObject *
8408_PyUnicode_TranslateCharmap(PyObject *input,
8409                            PyObject *mapping,
8410                            const char *errors)
8411{
8412    /* input object */
8413    char *idata;
8414    Py_ssize_t size, i;
8415    int kind;
8416    /* output buffer */
8417    Py_UCS4 *output = NULL;
8418    Py_ssize_t osize;
8419    PyObject *res;
8420    /* current output position */
8421    Py_ssize_t opos;
8422    char *reason = "character maps to <undefined>";
8423    PyObject *errorHandler = NULL;
8424    PyObject *exc = NULL;
8425    /* the following variable is used for caching string comparisons
8426     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8427     * 3=ignore, 4=xmlcharrefreplace */
8428    int known_errorHandler = -1;
8429
8430    if (mapping == NULL) {
8431        PyErr_BadArgument();
8432        return NULL;
8433    }
8434
8435    if (PyUnicode_READY(input) == -1)
8436        return NULL;
8437    idata = (char*)PyUnicode_DATA(input);
8438    kind = PyUnicode_KIND(input);
8439    size = PyUnicode_GET_LENGTH(input);
8440    i = 0;
8441
8442    if (size == 0) {
8443        Py_INCREF(input);
8444        return input;
8445    }
8446
8447    /* allocate enough for a simple 1:1 translation without
8448       replacements, if we need more, we'll resize */
8449    osize = size;
8450    output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8451    opos = 0;
8452    if (output == NULL) {
8453        PyErr_NoMemory();
8454        goto onError;
8455    }
8456
8457    while (i<size) {
8458        /* try to encode it */
8459        PyObject *x = NULL;
8460        if (charmaptranslate_output(input, i, mapping,
8461                                    &output, &osize, &opos, &x)) {
8462            Py_XDECREF(x);
8463            goto onError;
8464        }
8465        Py_XDECREF(x);
8466        if (x!=Py_None) /* it worked => adjust input pointer */
8467            ++i;
8468        else { /* untranslatable character */
8469            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8470            Py_ssize_t repsize;
8471            Py_ssize_t newpos;
8472            Py_ssize_t uni2;
8473            /* startpos for collecting untranslatable chars */
8474            Py_ssize_t collstart = i;
8475            Py_ssize_t collend = i+1;
8476            Py_ssize_t coll;
8477
8478            /* find all untranslatable characters */
8479            while (collend < size) {
8480                if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
8481                    goto onError;
8482                Py_XDECREF(x);
8483                if (x!=Py_None)
8484                    break;
8485                ++collend;
8486            }
8487            /* cache callback name lookup
8488             * (if not done yet, i.e. it's the first error) */
8489            if (known_errorHandler==-1) {
8490                if ((errors==NULL) || (!strcmp(errors, "strict")))
8491                    known_errorHandler = 1;
8492                else if (!strcmp(errors, "replace"))
8493                    known_errorHandler = 2;
8494                else if (!strcmp(errors, "ignore"))
8495                    known_errorHandler = 3;
8496                else if (!strcmp(errors, "xmlcharrefreplace"))
8497                    known_errorHandler = 4;
8498                else
8499                    known_errorHandler = 0;
8500            }
8501            switch (known_errorHandler) {
8502            case 1: /* strict */
8503                raise_translate_exception(&exc, input, collstart,
8504                                          collend, reason);
8505                goto onError;
8506            case 2: /* replace */
8507                /* No need to check for space, this is a 1:1 replacement */
8508                for (coll = collstart; coll<collend; coll++)
8509                    output[opos++] = '?';
8510                /* fall through */
8511            case 3: /* ignore */
8512                i = collend;
8513                break;
8514            case 4: /* xmlcharrefreplace */
8515                /* generate replacement (temporarily (mis)uses i) */
8516                for (i = collstart; i < collend; ++i) {
8517                    char buffer[2+29+1+1];
8518                    char *cp;
8519                    sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8520                    if (charmaptranslate_makespace(&output, &osize,
8521                                                   opos+strlen(buffer)+(size-collend)))
8522                        goto onError;
8523                    for (cp = buffer; *cp; ++cp)
8524                        output[opos++] = *cp;
8525                }
8526                i = collend;
8527                break;
8528            default:
8529                repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8530                                                                 reason, input, &exc,
8531                                                                 collstart, collend, &newpos);
8532                if (repunicode == NULL)
8533                    goto onError;
8534                if (PyUnicode_READY(repunicode) == -1) {
8535                    Py_DECREF(repunicode);
8536                    goto onError;
8537                }
8538                /* generate replacement  */
8539                repsize = PyUnicode_GET_LENGTH(repunicode);
8540                if (charmaptranslate_makespace(&output, &osize,
8541                                               opos+repsize+(size-collend))) {
8542                    Py_DECREF(repunicode);
8543                    goto onError;
8544                }
8545                for (uni2 = 0; repsize-->0; ++uni2)
8546                    output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8547                i = newpos;
8548                Py_DECREF(repunicode);
8549            }
8550        }
8551    }
8552    res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8553    if (!res)
8554        goto onError;
8555    PyMem_Free(output);
8556    Py_XDECREF(exc);
8557    Py_XDECREF(errorHandler);
8558    return res;
8559
8560  onError:
8561    PyMem_Free(output);
8562    Py_XDECREF(exc);
8563    Py_XDECREF(errorHandler);
8564    return NULL;
8565}
8566
8567/* Deprecated. Use PyUnicode_Translate instead. */
8568PyObject *
8569PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8570                           Py_ssize_t size,
8571                           PyObject *mapping,
8572                           const char *errors)
8573{
8574    PyObject *result;
8575    PyObject *unicode = PyUnicode_FromUnicode(p, size);
8576    if (!unicode)
8577        return NULL;
8578    result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8579    Py_DECREF(unicode);
8580    return result;
8581}
8582
8583PyObject *
8584PyUnicode_Translate(PyObject *str,
8585                    PyObject *mapping,
8586                    const char *errors)
8587{
8588    PyObject *result;
8589
8590    str = PyUnicode_FromObject(str);
8591    if (str == NULL)
8592        return NULL;
8593    result = _PyUnicode_TranslateCharmap(str, mapping, errors);
8594    Py_DECREF(str);
8595    return result;
8596}
8597
8598static Py_UCS4
8599fix_decimal_and_space_to_ascii(PyObject *self)
8600{
8601    /* No need to call PyUnicode_READY(self) because this function is only
8602       called as a callback from fixup() which does it already. */
8603    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8604    const int kind = PyUnicode_KIND(self);
8605    void *data = PyUnicode_DATA(self);
8606    Py_UCS4 maxchar = 127, ch, fixed;
8607    int modified = 0;
8608    Py_ssize_t i;
8609
8610    for (i = 0; i < len; ++i) {
8611        ch = PyUnicode_READ(kind, data, i);
8612        fixed = 0;
8613        if (ch > 127) {
8614            if (Py_UNICODE_ISSPACE(ch))
8615                fixed = ' ';
8616            else {
8617                const int decimal = Py_UNICODE_TODECIMAL(ch);
8618                if (decimal >= 0)
8619                    fixed = '0' + decimal;
8620            }
8621            if (fixed != 0) {
8622                modified = 1;
8623                maxchar = MAX_MAXCHAR(maxchar, fixed);
8624                PyUnicode_WRITE(kind, data, i, fixed);
8625            }
8626            else
8627                maxchar = MAX_MAXCHAR(maxchar, ch);
8628        }
8629    }
8630
8631    return (modified) ? maxchar : 0;
8632}
8633
8634PyObject *
8635_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8636{
8637    if (!PyUnicode_Check(unicode)) {
8638        PyErr_BadInternalCall();
8639        return NULL;
8640    }
8641    if (PyUnicode_READY(unicode) == -1)
8642        return NULL;
8643    if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8644        /* If the string is already ASCII, just return the same string */
8645        Py_INCREF(unicode);
8646        return unicode;
8647    }
8648    return fixup(unicode, fix_decimal_and_space_to_ascii);
8649}
8650
8651PyObject *
8652PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8653                                  Py_ssize_t length)
8654{
8655    PyObject *decimal;
8656    Py_ssize_t i;
8657    Py_UCS4 maxchar;
8658    enum PyUnicode_Kind kind;
8659    void *data;
8660
8661    maxchar = 127;
8662    for (i = 0; i < length; i++) {
8663        Py_UNICODE ch = s[i];
8664        if (ch > 127) {
8665            int decimal = Py_UNICODE_TODECIMAL(ch);
8666            if (decimal >= 0)
8667                ch = '0' + decimal;
8668            maxchar = MAX_MAXCHAR(maxchar, ch);
8669        }
8670    }
8671
8672    /* Copy to a new string */
8673    decimal = PyUnicode_New(length, maxchar);
8674    if (decimal == NULL)
8675        return decimal;
8676    kind = PyUnicode_KIND(decimal);
8677    data = PyUnicode_DATA(decimal);
8678    /* Iterate over code points */
8679    for (i = 0; i < length; i++) {
8680        Py_UNICODE ch = s[i];
8681        if (ch > 127) {
8682            int decimal = Py_UNICODE_TODECIMAL(ch);
8683            if (decimal >= 0)
8684                ch = '0' + decimal;
8685        }
8686        PyUnicode_WRITE(kind, data, i, ch);
8687    }
8688    return unicode_result(decimal);
8689}
8690/* --- Decimal Encoder ---------------------------------------------------- */
8691
8692int
8693PyUnicode_EncodeDecimal(Py_UNICODE *s,
8694                        Py_ssize_t length,
8695                        char *output,
8696                        const char *errors)
8697{
8698    PyObject *unicode;
8699    Py_ssize_t i;
8700    enum PyUnicode_Kind kind;
8701    void *data;
8702
8703    if (output == NULL) {
8704        PyErr_BadArgument();
8705        return -1;
8706    }
8707
8708    unicode = PyUnicode_FromUnicode(s, length);
8709    if (unicode == NULL)
8710        return -1;
8711
8712    if (PyUnicode_READY(unicode) == -1) {
8713        Py_DECREF(unicode);
8714        return -1;
8715    }
8716    kind = PyUnicode_KIND(unicode);
8717    data = PyUnicode_DATA(unicode);
8718
8719    for (i=0; i < length; ) {
8720        PyObject *exc;
8721        Py_UCS4 ch;
8722        int decimal;
8723        Py_ssize_t startpos;
8724
8725        ch = PyUnicode_READ(kind, data, i);
8726
8727        if (Py_UNICODE_ISSPACE(ch)) {
8728            *output++ = ' ';
8729            i++;
8730            continue;
8731        }
8732        decimal = Py_UNICODE_TODECIMAL(ch);
8733        if (decimal >= 0) {
8734            *output++ = '0' + decimal;
8735            i++;
8736            continue;
8737        }
8738        if (0 < ch && ch < 256) {
8739            *output++ = (char)ch;
8740            i++;
8741            continue;
8742        }
8743
8744        startpos = i;
8745        exc = NULL;
8746        raise_encode_exception(&exc, "decimal", unicode,
8747                               startpos, startpos+1,
8748                               "invalid decimal Unicode string");
8749        Py_XDECREF(exc);
8750        Py_DECREF(unicode);
8751        return -1;
8752    }
8753    /* 0-terminate the output string */
8754    *output++ = '\0';
8755    Py_DECREF(unicode);
8756    return 0;
8757}
8758
8759/* --- Helpers ------------------------------------------------------------ */
8760
8761static Py_ssize_t
8762any_find_slice(int direction, PyObject* s1, PyObject* s2,
8763               Py_ssize_t start,
8764               Py_ssize_t end)
8765{
8766    int kind1, kind2, kind;
8767    void *buf1, *buf2;
8768    Py_ssize_t len1, len2, result;
8769
8770    kind1 = PyUnicode_KIND(s1);
8771    kind2 = PyUnicode_KIND(s2);
8772    kind = kind1 > kind2 ? kind1 : kind2;
8773    buf1 = PyUnicode_DATA(s1);
8774    buf2 = PyUnicode_DATA(s2);
8775    if (kind1 != kind)
8776        buf1 = _PyUnicode_AsKind(s1, kind);
8777    if (!buf1)
8778        return -2;
8779    if (kind2 != kind)
8780        buf2 = _PyUnicode_AsKind(s2, kind);
8781    if (!buf2) {
8782        if (kind1 != kind) PyMem_Free(buf1);
8783        return -2;
8784    }
8785    len1 = PyUnicode_GET_LENGTH(s1);
8786    len2 = PyUnicode_GET_LENGTH(s2);
8787
8788    if (direction > 0) {
8789        switch (kind) {
8790        case PyUnicode_1BYTE_KIND:
8791            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8792                result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8793            else
8794                result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8795            break;
8796        case PyUnicode_2BYTE_KIND:
8797            result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8798            break;
8799        case PyUnicode_4BYTE_KIND:
8800            result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8801            break;
8802        default:
8803            assert(0); result = -2;
8804        }
8805    }
8806    else {
8807        switch (kind) {
8808        case PyUnicode_1BYTE_KIND:
8809            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8810                result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8811            else
8812                result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8813            break;
8814        case PyUnicode_2BYTE_KIND:
8815            result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8816            break;
8817        case PyUnicode_4BYTE_KIND:
8818            result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8819            break;
8820        default:
8821            assert(0); result = -2;
8822        }
8823    }
8824
8825    if (kind1 != kind)
8826        PyMem_Free(buf1);
8827    if (kind2 != kind)
8828        PyMem_Free(buf2);
8829
8830    return result;
8831}
8832
8833Py_ssize_t
8834_PyUnicode_InsertThousandsGrouping(
8835    PyObject *unicode, Py_ssize_t index,
8836    Py_ssize_t n_buffer,
8837    void *digits, Py_ssize_t n_digits,
8838    Py_ssize_t min_width,
8839    const char *grouping, PyObject *thousands_sep,
8840    Py_UCS4 *maxchar)
8841{
8842    unsigned int kind, thousands_sep_kind;
8843    char *data, *thousands_sep_data;
8844    Py_ssize_t thousands_sep_len;
8845    Py_ssize_t len;
8846
8847    if (unicode != NULL) {
8848        kind = PyUnicode_KIND(unicode);
8849        data = (char *) PyUnicode_DATA(unicode) + index * kind;
8850    }
8851    else {
8852        kind = PyUnicode_1BYTE_KIND;
8853        data = NULL;
8854    }
8855    thousands_sep_kind = PyUnicode_KIND(thousands_sep);
8856    thousands_sep_data = PyUnicode_DATA(thousands_sep);
8857    thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
8858    if (unicode != NULL && thousands_sep_kind != kind) {
8859        if (thousands_sep_kind < kind) {
8860            thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
8861            if (!thousands_sep_data)
8862                return -1;
8863        }
8864        else {
8865            data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
8866            if (!data)
8867                return -1;
8868        }
8869    }
8870
8871    switch (kind) {
8872    case PyUnicode_1BYTE_KIND:
8873        if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
8874            len = asciilib_InsertThousandsGrouping(
8875                (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
8876                min_width, grouping,
8877                (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
8878        else
8879            len = ucs1lib_InsertThousandsGrouping(
8880                (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8881                min_width, grouping,
8882                (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
8883        break;
8884    case PyUnicode_2BYTE_KIND:
8885        len = ucs2lib_InsertThousandsGrouping(
8886            (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
8887            min_width, grouping,
8888            (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
8889        break;
8890    case PyUnicode_4BYTE_KIND:
8891        len = ucs4lib_InsertThousandsGrouping(
8892            (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
8893            min_width, grouping,
8894            (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
8895        break;
8896    default:
8897        assert(0);
8898        return -1;
8899    }
8900    if (unicode != NULL && thousands_sep_kind != kind) {
8901        if (thousands_sep_kind < kind)
8902            PyMem_Free(thousands_sep_data);
8903        else
8904            PyMem_Free(data);
8905    }
8906    if (unicode == NULL) {
8907        *maxchar = 127;
8908        if (len != n_digits) {
8909            *maxchar = MAX_MAXCHAR(*maxchar,
8910                                   PyUnicode_MAX_CHAR_VALUE(thousands_sep));
8911        }
8912    }
8913    return len;
8914}
8915
8916
/* Helper macro to fix up start/end slice values for a sequence of
   length len.  start and end must be modifiable lvalues; they are
   updated in place:
     - end larger than len is clipped to len,
     - a negative start or end is taken relative to len and, if still
       negative, clipped to 0.
   start is not clipped against len.  The arguments are evaluated more
   than once, so they must be free of side effects.  The macro expands
   to a single statement and has to be followed by a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
8931
8932Py_ssize_t
8933PyUnicode_Count(PyObject *str,
8934                PyObject *substr,
8935                Py_ssize_t start,
8936                Py_ssize_t end)
8937{
8938    Py_ssize_t result;
8939    PyObject* str_obj;
8940    PyObject* sub_obj;
8941    int kind1, kind2, kind;
8942    void *buf1 = NULL, *buf2 = NULL;
8943    Py_ssize_t len1, len2;
8944
8945    str_obj = PyUnicode_FromObject(str);
8946    if (!str_obj)
8947        return -1;
8948    sub_obj = PyUnicode_FromObject(substr);
8949    if (!sub_obj) {
8950        Py_DECREF(str_obj);
8951        return -1;
8952    }
8953    if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
8954        Py_DECREF(sub_obj);
8955        Py_DECREF(str_obj);
8956        return -1;
8957    }
8958
8959    kind1 = PyUnicode_KIND(str_obj);
8960    kind2 = PyUnicode_KIND(sub_obj);
8961    kind = kind1;
8962    buf1 = PyUnicode_DATA(str_obj);
8963    buf2 = PyUnicode_DATA(sub_obj);
8964    if (kind2 != kind) {
8965        if (kind2 > kind) {
8966            Py_DECREF(sub_obj);
8967            Py_DECREF(str_obj);
8968            return 0;
8969        }
8970        buf2 = _PyUnicode_AsKind(sub_obj, kind);
8971    }
8972    if (!buf2)
8973        goto onError;
8974    len1 = PyUnicode_GET_LENGTH(str_obj);
8975    len2 = PyUnicode_GET_LENGTH(sub_obj);
8976
8977    ADJUST_INDICES(start, end, len1);
8978    switch (kind) {
8979    case PyUnicode_1BYTE_KIND:
8980        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8981            result = asciilib_count(
8982                ((Py_UCS1*)buf1) + start, end - start,
8983                buf2, len2, PY_SSIZE_T_MAX
8984                );
8985        else
8986            result = ucs1lib_count(
8987                ((Py_UCS1*)buf1) + start, end - start,
8988                buf2, len2, PY_SSIZE_T_MAX
8989                );
8990        break;
8991    case PyUnicode_2BYTE_KIND:
8992        result = ucs2lib_count(
8993            ((Py_UCS2*)buf1) + start, end - start,
8994            buf2, len2, PY_SSIZE_T_MAX
8995            );
8996        break;
8997    case PyUnicode_4BYTE_KIND:
8998        result = ucs4lib_count(
8999            ((Py_UCS4*)buf1) + start, end - start,
9000            buf2, len2, PY_SSIZE_T_MAX
9001            );
9002        break;
9003    default:
9004        assert(0); result = 0;
9005    }
9006
9007    Py_DECREF(sub_obj);
9008    Py_DECREF(str_obj);
9009
9010    if (kind2 != kind)
9011        PyMem_Free(buf2);
9012
9013    return result;
9014  onError:
9015    Py_DECREF(sub_obj);
9016    Py_DECREF(str_obj);
9017    if (kind2 != kind && buf2)
9018        PyMem_Free(buf2);
9019    return -1;
9020}
9021
9022Py_ssize_t
9023PyUnicode_Find(PyObject *str,
9024               PyObject *sub,
9025               Py_ssize_t start,
9026               Py_ssize_t end,
9027               int direction)
9028{
9029    Py_ssize_t result;
9030
9031    str = PyUnicode_FromObject(str);
9032    if (!str)
9033        return -2;
9034    sub = PyUnicode_FromObject(sub);
9035    if (!sub) {
9036        Py_DECREF(str);
9037        return -2;
9038    }
9039    if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9040        Py_DECREF(sub);
9041        Py_DECREF(str);
9042        return -2;
9043    }
9044
9045    result = any_find_slice(direction,
9046        str, sub, start, end
9047        );
9048
9049    Py_DECREF(str);
9050    Py_DECREF(sub);
9051
9052    return result;
9053}
9054
9055Py_ssize_t
9056PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9057                   Py_ssize_t start, Py_ssize_t end,
9058                   int direction)
9059{
9060    int kind;
9061    Py_ssize_t result;
9062    if (PyUnicode_READY(str) == -1)
9063        return -2;
9064    if (start < 0 || end < 0) {
9065        PyErr_SetString(PyExc_IndexError, "string index out of range");
9066        return -2;
9067    }
9068    if (end > PyUnicode_GET_LENGTH(str))
9069        end = PyUnicode_GET_LENGTH(str);
9070    kind = PyUnicode_KIND(str);
9071    result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9072                      kind, end-start, ch, direction);
9073    if (result == -1)
9074        return -1;
9075    else
9076        return start + result;
9077}
9078
9079static int
9080tailmatch(PyObject *self,
9081          PyObject *substring,
9082          Py_ssize_t start,
9083          Py_ssize_t end,
9084          int direction)
9085{
9086    int kind_self;
9087    int kind_sub;
9088    void *data_self;
9089    void *data_sub;
9090    Py_ssize_t offset;
9091    Py_ssize_t i;
9092    Py_ssize_t end_sub;
9093
9094    if (PyUnicode_READY(self) == -1 ||
9095        PyUnicode_READY(substring) == -1)
9096        return 0;
9097
9098    if (PyUnicode_GET_LENGTH(substring) == 0)
9099        return 1;
9100
9101    ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9102    end -= PyUnicode_GET_LENGTH(substring);
9103    if (end < start)
9104        return 0;
9105
9106    kind_self = PyUnicode_KIND(self);
9107    data_self = PyUnicode_DATA(self);
9108    kind_sub = PyUnicode_KIND(substring);
9109    data_sub = PyUnicode_DATA(substring);
9110    end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9111
9112    if (direction > 0)
9113        offset = end;
9114    else
9115        offset = start;
9116
9117    if (PyUnicode_READ(kind_self, data_self, offset) ==
9118        PyUnicode_READ(kind_sub, data_sub, 0) &&
9119        PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9120        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9121        /* If both are of the same kind, memcmp is sufficient */
9122        if (kind_self == kind_sub) {
9123            return ! memcmp((char *)data_self +
9124                                (offset * PyUnicode_KIND(substring)),
9125                            data_sub,
9126                            PyUnicode_GET_LENGTH(substring) *
9127                                PyUnicode_KIND(substring));
9128        }
9129        /* otherwise we have to compare each character by first accesing it */
9130        else {
9131            /* We do not need to compare 0 and len(substring)-1 because
9132               the if statement above ensured already that they are equal
9133               when we end up here. */
9134            /* TODO: honor direction and do a forward or backwards search */
9135            for (i = 1; i < end_sub; ++i) {
9136                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9137                    PyUnicode_READ(kind_sub, data_sub, i))
9138                    return 0;
9139            }
9140            return 1;
9141        }
9142    }
9143
9144    return 0;
9145}
9146
9147Py_ssize_t
9148PyUnicode_Tailmatch(PyObject *str,
9149                    PyObject *substr,
9150                    Py_ssize_t start,
9151                    Py_ssize_t end,
9152                    int direction)
9153{
9154    Py_ssize_t result;
9155
9156    str = PyUnicode_FromObject(str);
9157    if (str == NULL)
9158        return -1;
9159    substr = PyUnicode_FromObject(substr);
9160    if (substr == NULL) {
9161        Py_DECREF(str);
9162        return -1;
9163    }
9164
9165    result = tailmatch(str, substr,
9166                       start, end, direction);
9167    Py_DECREF(str);
9168    Py_DECREF(substr);
9169    return result;
9170}
9171
9172/* Apply fixfct filter to the Unicode object self and return a
9173   reference to the modified object */
9174
9175static PyObject *
9176fixup(PyObject *self,
9177      Py_UCS4 (*fixfct)(PyObject *s))
9178{
9179    PyObject *u;
9180    Py_UCS4 maxchar_old, maxchar_new = 0;
9181    PyObject *v;
9182
9183    u = _PyUnicode_Copy(self);
9184    if (u == NULL)
9185        return NULL;
9186    maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
9187
9188    /* fix functions return the new maximum character in a string,
9189       if the kind of the resulting unicode object does not change,
9190       everything is fine.  Otherwise we need to change the string kind
9191       and re-run the fix function. */
9192    maxchar_new = fixfct(u);
9193
9194    if (maxchar_new == 0) {
9195        /* no changes */;
9196        if (PyUnicode_CheckExact(self)) {
9197            Py_DECREF(u);
9198            Py_INCREF(self);
9199            return self;
9200        }
9201        else
9202            return u;
9203    }
9204
9205    maxchar_new = align_maxchar(maxchar_new);
9206
9207    if (maxchar_new == maxchar_old)
9208        return u;
9209
9210    /* In case the maximum character changed, we need to
9211       convert the string to the new category. */
9212    v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9213    if (v == NULL) {
9214        Py_DECREF(u);
9215        return NULL;
9216    }
9217    if (maxchar_new > maxchar_old) {
9218        /* If the maxchar increased so that the kind changed, not all
9219           characters are representable anymore and we need to fix the
9220           string again. This only happens in very few cases. */
9221        _PyUnicode_FastCopyCharacters(v, 0,
9222                                      self, 0, PyUnicode_GET_LENGTH(self));
9223        maxchar_old = fixfct(v);
9224        assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9225    }
9226    else {
9227        _PyUnicode_FastCopyCharacters(v, 0,
9228                                      u, 0, PyUnicode_GET_LENGTH(self));
9229    }
9230    Py_DECREF(u);
9231    assert(_PyUnicode_CheckConsistency(v, 1));
9232    return v;
9233}
9234
9235static PyObject *
9236ascii_upper_or_lower(PyObject *self, int lower)
9237{
9238    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9239    char *resdata, *data = PyUnicode_DATA(self);
9240    PyObject *res;
9241
9242    res = PyUnicode_New(len, 127);
9243    if (res == NULL)
9244        return NULL;
9245    resdata = PyUnicode_DATA(res);
9246    if (lower)
9247        _Py_bytes_lower(resdata, data, len);
9248    else
9249        _Py_bytes_upper(resdata, data, len);
9250    return res;
9251}
9252
9253static Py_UCS4
9254handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
9255{
9256    Py_ssize_t j;
9257    int final_sigma;
9258    Py_UCS4 c;
9259    /* U+03A3 is in the Final_Sigma context when, it is found like this:
9260
9261     \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9262
9263    where ! is a negation and \p{xxx} is a character with property xxx.
9264    */
9265    for (j = i - 1; j >= 0; j--) {
9266        c = PyUnicode_READ(kind, data, j);
9267        if (!_PyUnicode_IsCaseIgnorable(c))
9268            break;
9269    }
9270    final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9271    if (final_sigma) {
9272        for (j = i + 1; j < length; j++) {
9273            c = PyUnicode_READ(kind, data, j);
9274            if (!_PyUnicode_IsCaseIgnorable(c))
9275                break;
9276        }
9277        final_sigma = j == length || !_PyUnicode_IsCased(c);
9278    }
9279    return (final_sigma) ? 0x3C2 : 0x3C3;
9280}
9281
9282static int
9283lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9284           Py_UCS4 c, Py_UCS4 *mapped)
9285{
9286    /* Obscure special case. */
9287    if (c == 0x3A3) {
9288        mapped[0] = handle_capital_sigma(kind, data, length, i);
9289        return 1;
9290    }
9291    return _PyUnicode_ToLowerFull(c, mapped);
9292}
9293
9294static Py_ssize_t
9295do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9296{
9297    Py_ssize_t i, k = 0;
9298    int n_res, j;
9299    Py_UCS4 c, mapped[3];
9300
9301    c = PyUnicode_READ(kind, data, 0);
9302    n_res = _PyUnicode_ToUpperFull(c, mapped);
9303    for (j = 0; j < n_res; j++) {
9304        *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
9305        res[k++] = mapped[j];
9306    }
9307    for (i = 1; i < length; i++) {
9308        c = PyUnicode_READ(kind, data, i);
9309        n_res = lower_ucs4(kind, data, length, i, c, mapped);
9310        for (j = 0; j < n_res; j++) {
9311            *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
9312            res[k++] = mapped[j];
9313        }
9314    }
9315    return k;
9316}
9317
9318static Py_ssize_t
9319do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9320    Py_ssize_t i, k = 0;
9321
9322    for (i = 0; i < length; i++) {
9323        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9324        int n_res, j;
9325        if (Py_UNICODE_ISUPPER(c)) {
9326            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9327        }
9328        else if (Py_UNICODE_ISLOWER(c)) {
9329            n_res = _PyUnicode_ToUpperFull(c, mapped);
9330        }
9331        else {
9332            n_res = 1;
9333            mapped[0] = c;
9334        }
9335        for (j = 0; j < n_res; j++) {
9336            *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
9337            res[k++] = mapped[j];
9338        }
9339    }
9340    return k;
9341}
9342
9343static Py_ssize_t
9344do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9345                  Py_UCS4 *maxchar, int lower)
9346{
9347    Py_ssize_t i, k = 0;
9348
9349    for (i = 0; i < length; i++) {
9350        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9351        int n_res, j;
9352        if (lower)
9353            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9354        else
9355            n_res = _PyUnicode_ToUpperFull(c, mapped);
9356        for (j = 0; j < n_res; j++) {
9357            *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
9358            res[k++] = mapped[j];
9359        }
9360    }
9361    return k;
9362}
9363
9364static Py_ssize_t
9365do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9366{
9367    return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9368}
9369
9370static Py_ssize_t
9371do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9372{
9373    return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9374}
9375
9376static Py_ssize_t
9377do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9378{
9379    Py_ssize_t i, k = 0;
9380
9381    for (i = 0; i < length; i++) {
9382        Py_UCS4 c = PyUnicode_READ(kind, data, i);
9383        Py_UCS4 mapped[3];
9384        int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9385        for (j = 0; j < n_res; j++) {
9386            *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
9387            res[k++] = mapped[j];
9388        }
9389    }
9390    return k;
9391}
9392
9393static Py_ssize_t
9394do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9395{
9396    Py_ssize_t i, k = 0;
9397    int previous_is_cased;
9398
9399    previous_is_cased = 0;
9400    for (i = 0; i < length; i++) {
9401        const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9402        Py_UCS4 mapped[3];
9403        int n_res, j;
9404
9405        if (previous_is_cased)
9406            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9407        else
9408            n_res = _PyUnicode_ToTitleFull(c, mapped);
9409
9410        for (j = 0; j < n_res; j++) {
9411            *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
9412            res[k++] = mapped[j];
9413        }
9414
9415        previous_is_cased = _PyUnicode_IsCased(c);
9416    }
9417    return k;
9418}
9419
9420static PyObject *
9421case_operation(PyObject *self,
9422               Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9423{
9424    PyObject *res = NULL;
9425    Py_ssize_t length, newlength = 0;
9426    int kind, outkind;
9427    void *data, *outdata;
9428    Py_UCS4 maxchar = 0, *tmp, *tmpend;
9429
9430    assert(PyUnicode_IS_READY(self));
9431
9432    kind = PyUnicode_KIND(self);
9433    data = PyUnicode_DATA(self);
9434    length = PyUnicode_GET_LENGTH(self);
9435    tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9436    if (tmp == NULL)
9437        return PyErr_NoMemory();
9438    newlength = perform(kind, data, length, tmp, &maxchar);
9439    res = PyUnicode_New(newlength, maxchar);
9440    if (res == NULL)
9441        goto leave;
9442    tmpend = tmp + newlength;
9443    outdata = PyUnicode_DATA(res);
9444    outkind = PyUnicode_KIND(res);
9445    switch (outkind) {
9446    case PyUnicode_1BYTE_KIND:
9447        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9448        break;
9449    case PyUnicode_2BYTE_KIND:
9450        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9451        break;
9452    case PyUnicode_4BYTE_KIND:
9453        memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9454        break;
9455    default:
9456        assert(0);
9457        break;
9458    }
9459  leave:
9460    PyMem_FREE(tmp);
9461    return res;
9462}
9463
9464PyObject *
9465PyUnicode_Join(PyObject *separator, PyObject *seq)
9466{
9467    PyObject *sep = NULL;
9468    Py_ssize_t seplen;
9469    PyObject *res = NULL; /* the result */
9470    PyObject *fseq;          /* PySequence_Fast(seq) */
9471    Py_ssize_t seqlen;       /* len(fseq) -- number of items in sequence */
9472    PyObject **items;
9473    PyObject *item;
9474    Py_ssize_t sz, i, res_offset;
9475    Py_UCS4 maxchar;
9476    Py_UCS4 item_maxchar;
9477    int use_memcpy;
9478    unsigned char *res_data = NULL, *sep_data = NULL;
9479    PyObject *last_obj;
9480    unsigned int kind = 0;
9481
9482    fseq = PySequence_Fast(seq, "");
9483    if (fseq == NULL) {
9484        return NULL;
9485    }
9486
9487    /* NOTE: the following code can't call back into Python code,
9488     * so we are sure that fseq won't be mutated.
9489     */
9490
9491    seqlen = PySequence_Fast_GET_SIZE(fseq);
9492    /* If empty sequence, return u"". */
9493    if (seqlen == 0) {
9494        Py_DECREF(fseq);
9495        Py_INCREF(unicode_empty);
9496        res = unicode_empty;
9497        return res;
9498    }
9499
9500    /* If singleton sequence with an exact Unicode, return that. */
9501    last_obj = NULL;
9502    items = PySequence_Fast_ITEMS(fseq);
9503    if (seqlen == 1) {
9504        if (PyUnicode_CheckExact(items[0])) {
9505            res = items[0];
9506            Py_INCREF(res);
9507            Py_DECREF(fseq);
9508            return res;
9509        }
9510        seplen = 0;
9511        maxchar = 0;
9512    }
9513    else {
9514        /* Set up sep and seplen */
9515        if (separator == NULL) {
9516            /* fall back to a blank space separator */
9517            sep = PyUnicode_FromOrdinal(' ');
9518            if (!sep)
9519                goto onError;
9520            seplen = 1;
9521            maxchar = 32;
9522        }
9523        else {
9524            if (!PyUnicode_Check(separator)) {
9525                PyErr_Format(PyExc_TypeError,
9526                             "separator: expected str instance,"
9527                             " %.80s found",
9528                             Py_TYPE(separator)->tp_name);
9529                goto onError;
9530            }
9531            if (PyUnicode_READY(separator))
9532                goto onError;
9533            sep = separator;
9534            seplen = PyUnicode_GET_LENGTH(separator);
9535            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9536            /* inc refcount to keep this code path symmetric with the
9537               above case of a blank separator */
9538            Py_INCREF(sep);
9539        }
9540        last_obj = sep;
9541    }
9542
9543    /* There are at least two things to join, or else we have a subclass
9544     * of str in the sequence.
9545     * Do a pre-pass to figure out the total amount of space we'll
9546     * need (sz), and see whether all argument are strings.
9547     */
9548    sz = 0;
9549#ifdef Py_DEBUG
9550    use_memcpy = 0;
9551#else
9552    use_memcpy = 1;
9553#endif
9554    for (i = 0; i < seqlen; i++) {
9555        const Py_ssize_t old_sz = sz;
9556        item = items[i];
9557        if (!PyUnicode_Check(item)) {
9558            PyErr_Format(PyExc_TypeError,
9559                         "sequence item %zd: expected str instance,"
9560                         " %.80s found",
9561                         i, Py_TYPE(item)->tp_name);
9562            goto onError;
9563        }
9564        if (PyUnicode_READY(item) == -1)
9565            goto onError;
9566        sz += PyUnicode_GET_LENGTH(item);
9567        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
9568        maxchar = MAX_MAXCHAR(maxchar, item_maxchar);
9569        if (i != 0)
9570            sz += seplen;
9571        if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9572            PyErr_SetString(PyExc_OverflowError,
9573                            "join() result is too long for a Python string");
9574            goto onError;
9575        }
9576        if (use_memcpy && last_obj != NULL) {
9577            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9578                use_memcpy = 0;
9579        }
9580        last_obj = item;
9581    }
9582
9583    res = PyUnicode_New(sz, maxchar);
9584    if (res == NULL)
9585        goto onError;
9586
9587    /* Catenate everything. */
9588#ifdef Py_DEBUG
9589    use_memcpy = 0;
9590#else
9591    if (use_memcpy) {
9592        res_data = PyUnicode_1BYTE_DATA(res);
9593        kind = PyUnicode_KIND(res);
9594        if (seplen != 0)
9595            sep_data = PyUnicode_1BYTE_DATA(sep);
9596    }
9597#endif
9598    for (i = 0, res_offset = 0; i < seqlen; ++i) {
9599        Py_ssize_t itemlen;
9600        item = items[i];
9601        /* Copy item, and maybe the separator. */
9602        if (i && seplen != 0) {
9603            if (use_memcpy) {
9604                Py_MEMCPY(res_data,
9605                          sep_data,
9606                          kind * seplen);
9607                res_data += kind * seplen;
9608            }
9609            else {
9610                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9611                res_offset += seplen;
9612            }
9613        }
9614        itemlen = PyUnicode_GET_LENGTH(item);
9615        if (itemlen != 0) {
9616            if (use_memcpy) {
9617                Py_MEMCPY(res_data,
9618                          PyUnicode_DATA(item),
9619                          kind * itemlen);
9620                res_data += kind * itemlen;
9621            }
9622            else {
9623                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
9624                res_offset += itemlen;
9625            }
9626        }
9627    }
9628    if (use_memcpy)
9629        assert(res_data == PyUnicode_1BYTE_DATA(res)
9630                           + kind * PyUnicode_GET_LENGTH(res));
9631    else
9632        assert(res_offset == PyUnicode_GET_LENGTH(res));
9633
9634    Py_DECREF(fseq);
9635    Py_XDECREF(sep);
9636    assert(_PyUnicode_CheckConsistency(res, 1));
9637    return res;
9638
9639  onError:
9640    Py_DECREF(fseq);
9641    Py_XDECREF(sep);
9642    Py_XDECREF(res);
9643    return NULL;
9644}
9645
/* FILL(kind, data, value, start, length)

   Store `value` into `length` consecutive characters of the buffer `data`,
   beginning at character index `start`.  `kind` selects the element width
   (PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or PyUnicode_4BYTE_KIND);
   any other kind trips an assertion.  For the 1-byte kind the value is
   truncated to unsigned char and written with memset().

   Every argument is parenthesized at each use so that compound
   expressions (e.g. a conditional expression passed as `value`) keep
   their meaning.  Arguments may still be evaluated more than once, so
   they must not have side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9669
9670void
9671_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9672                    Py_UCS4 fill_char)
9673{
9674    const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9675    const void *data = PyUnicode_DATA(unicode);
9676    assert(PyUnicode_IS_READY(unicode));
9677    assert(unicode_modifiable(unicode));
9678    assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9679    assert(start >= 0);
9680    assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9681    FILL(kind, data, fill_char, start, length);
9682}
9683
9684Py_ssize_t
9685PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9686               Py_UCS4 fill_char)
9687{
9688    Py_ssize_t maxlen;
9689
9690    if (!PyUnicode_Check(unicode)) {
9691        PyErr_BadInternalCall();
9692        return -1;
9693    }
9694    if (PyUnicode_READY(unicode) == -1)
9695        return -1;
9696    if (unicode_check_modifiable(unicode))
9697        return -1;
9698
9699    if (start < 0) {
9700        PyErr_SetString(PyExc_IndexError, "string index out of range");
9701        return -1;
9702    }
9703    if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9704        PyErr_SetString(PyExc_ValueError,
9705                         "fill character is bigger than "
9706                         "the string maximum character");
9707        return -1;
9708    }
9709
9710    maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9711    length = Py_MIN(maxlen, length);
9712    if (length <= 0)
9713        return 0;
9714
9715    _PyUnicode_FastFill(unicode, start, length, fill_char);
9716    return length;
9717}
9718
9719static PyObject *
9720pad(PyObject *self,
9721    Py_ssize_t left,
9722    Py_ssize_t right,
9723    Py_UCS4 fill)
9724{
9725    PyObject *u;
9726    Py_UCS4 maxchar;
9727    int kind;
9728    void *data;
9729
9730    if (left < 0)
9731        left = 0;
9732    if (right < 0)
9733        right = 0;
9734
9735    if (left == 0 && right == 0)
9736        return unicode_result_unchanged(self);
9737
9738    if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9739        right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
9740        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9741        return NULL;
9742    }
9743    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9744    maxchar = MAX_MAXCHAR(maxchar, fill);
9745    u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
9746    if (!u)
9747        return NULL;
9748
9749    kind = PyUnicode_KIND(u);
9750    data = PyUnicode_DATA(u);
9751    if (left)
9752        FILL(kind, data, fill, 0, left);
9753    if (right)
9754        FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
9755    _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
9756    assert(_PyUnicode_CheckConsistency(u, 1));
9757    return u;
9758}
9759
9760PyObject *
9761PyUnicode_Splitlines(PyObject *string, int keepends)
9762{
9763    PyObject *list;
9764
9765    string = PyUnicode_FromObject(string);
9766    if (string == NULL)
9767        return NULL;
9768    if (PyUnicode_READY(string) == -1) {
9769        Py_DECREF(string);
9770        return NULL;
9771    }
9772
9773    switch (PyUnicode_KIND(string)) {
9774    case PyUnicode_1BYTE_KIND:
9775        if (PyUnicode_IS_ASCII(string))
9776            list = asciilib_splitlines(
9777                string, PyUnicode_1BYTE_DATA(string),
9778                PyUnicode_GET_LENGTH(string), keepends);
9779        else
9780            list = ucs1lib_splitlines(
9781                string, PyUnicode_1BYTE_DATA(string),
9782                PyUnicode_GET_LENGTH(string), keepends);
9783        break;
9784    case PyUnicode_2BYTE_KIND:
9785        list = ucs2lib_splitlines(
9786            string, PyUnicode_2BYTE_DATA(string),
9787            PyUnicode_GET_LENGTH(string), keepends);
9788        break;
9789    case PyUnicode_4BYTE_KIND:
9790        list = ucs4lib_splitlines(
9791            string, PyUnicode_4BYTE_DATA(string),
9792            PyUnicode_GET_LENGTH(string), keepends);
9793        break;
9794    default:
9795        assert(0);
9796        list = 0;
9797    }
9798    Py_DECREF(string);
9799    return list;
9800}
9801
9802static PyObject *
9803split(PyObject *self,
9804      PyObject *substring,
9805      Py_ssize_t maxcount)
9806{
9807    int kind1, kind2, kind;
9808    void *buf1, *buf2;
9809    Py_ssize_t len1, len2;
9810    PyObject* out;
9811
9812    if (maxcount < 0)
9813        maxcount = PY_SSIZE_T_MAX;
9814
9815    if (PyUnicode_READY(self) == -1)
9816        return NULL;
9817
9818    if (substring == NULL)
9819        switch (PyUnicode_KIND(self)) {
9820        case PyUnicode_1BYTE_KIND:
9821            if (PyUnicode_IS_ASCII(self))
9822                return asciilib_split_whitespace(
9823                    self,  PyUnicode_1BYTE_DATA(self),
9824                    PyUnicode_GET_LENGTH(self), maxcount
9825                    );
9826            else
9827                return ucs1lib_split_whitespace(
9828                    self,  PyUnicode_1BYTE_DATA(self),
9829                    PyUnicode_GET_LENGTH(self), maxcount
9830                    );
9831        case PyUnicode_2BYTE_KIND:
9832            return ucs2lib_split_whitespace(
9833                self,  PyUnicode_2BYTE_DATA(self),
9834                PyUnicode_GET_LENGTH(self), maxcount
9835                );
9836        case PyUnicode_4BYTE_KIND:
9837            return ucs4lib_split_whitespace(
9838                self,  PyUnicode_4BYTE_DATA(self),
9839                PyUnicode_GET_LENGTH(self), maxcount
9840                );
9841        default:
9842            assert(0);
9843            return NULL;
9844        }
9845
9846    if (PyUnicode_READY(substring) == -1)
9847        return NULL;
9848
9849    kind1 = PyUnicode_KIND(self);
9850    kind2 = PyUnicode_KIND(substring);
9851    kind = kind1 > kind2 ? kind1 : kind2;
9852    buf1 = PyUnicode_DATA(self);
9853    buf2 = PyUnicode_DATA(substring);
9854    if (kind1 != kind)
9855        buf1 = _PyUnicode_AsKind(self, kind);
9856    if (!buf1)
9857        return NULL;
9858    if (kind2 != kind)
9859        buf2 = _PyUnicode_AsKind(substring, kind);
9860    if (!buf2) {
9861        if (kind1 != kind) PyMem_Free(buf1);
9862        return NULL;
9863    }
9864    len1 = PyUnicode_GET_LENGTH(self);
9865    len2 = PyUnicode_GET_LENGTH(substring);
9866
9867    switch (kind) {
9868    case PyUnicode_1BYTE_KIND:
9869        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9870            out = asciilib_split(
9871                self,  buf1, len1, buf2, len2, maxcount);
9872        else
9873            out = ucs1lib_split(
9874                self,  buf1, len1, buf2, len2, maxcount);
9875        break;
9876    case PyUnicode_2BYTE_KIND:
9877        out = ucs2lib_split(
9878            self,  buf1, len1, buf2, len2, maxcount);
9879        break;
9880    case PyUnicode_4BYTE_KIND:
9881        out = ucs4lib_split(
9882            self,  buf1, len1, buf2, len2, maxcount);
9883        break;
9884    default:
9885        out = NULL;
9886    }
9887    if (kind1 != kind)
9888        PyMem_Free(buf1);
9889    if (kind2 != kind)
9890        PyMem_Free(buf2);
9891    return out;
9892}
9893
9894static PyObject *
9895rsplit(PyObject *self,
9896       PyObject *substring,
9897       Py_ssize_t maxcount)
9898{
9899    int kind1, kind2, kind;
9900    void *buf1, *buf2;
9901    Py_ssize_t len1, len2;
9902    PyObject* out;
9903
9904    if (maxcount < 0)
9905        maxcount = PY_SSIZE_T_MAX;
9906
9907    if (PyUnicode_READY(self) == -1)
9908        return NULL;
9909
9910    if (substring == NULL)
9911        switch (PyUnicode_KIND(self)) {
9912        case PyUnicode_1BYTE_KIND:
9913            if (PyUnicode_IS_ASCII(self))
9914                return asciilib_rsplit_whitespace(
9915                    self,  PyUnicode_1BYTE_DATA(self),
9916                    PyUnicode_GET_LENGTH(self), maxcount
9917                    );
9918            else
9919                return ucs1lib_rsplit_whitespace(
9920                    self,  PyUnicode_1BYTE_DATA(self),
9921                    PyUnicode_GET_LENGTH(self), maxcount
9922                    );
9923        case PyUnicode_2BYTE_KIND:
9924            return ucs2lib_rsplit_whitespace(
9925                self,  PyUnicode_2BYTE_DATA(self),
9926                PyUnicode_GET_LENGTH(self), maxcount
9927                );
9928        case PyUnicode_4BYTE_KIND:
9929            return ucs4lib_rsplit_whitespace(
9930                self,  PyUnicode_4BYTE_DATA(self),
9931                PyUnicode_GET_LENGTH(self), maxcount
9932                );
9933        default:
9934            assert(0);
9935            return NULL;
9936        }
9937
9938    if (PyUnicode_READY(substring) == -1)
9939        return NULL;
9940
9941    kind1 = PyUnicode_KIND(self);
9942    kind2 = PyUnicode_KIND(substring);
9943    kind = kind1 > kind2 ? kind1 : kind2;
9944    buf1 = PyUnicode_DATA(self);
9945    buf2 = PyUnicode_DATA(substring);
9946    if (kind1 != kind)
9947        buf1 = _PyUnicode_AsKind(self, kind);
9948    if (!buf1)
9949        return NULL;
9950    if (kind2 != kind)
9951        buf2 = _PyUnicode_AsKind(substring, kind);
9952    if (!buf2) {
9953        if (kind1 != kind) PyMem_Free(buf1);
9954        return NULL;
9955    }
9956    len1 = PyUnicode_GET_LENGTH(self);
9957    len2 = PyUnicode_GET_LENGTH(substring);
9958
9959    switch (kind) {
9960    case PyUnicode_1BYTE_KIND:
9961        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9962            out = asciilib_rsplit(
9963                self,  buf1, len1, buf2, len2, maxcount);
9964        else
9965            out = ucs1lib_rsplit(
9966                self,  buf1, len1, buf2, len2, maxcount);
9967        break;
9968    case PyUnicode_2BYTE_KIND:
9969        out = ucs2lib_rsplit(
9970            self,  buf1, len1, buf2, len2, maxcount);
9971        break;
9972    case PyUnicode_4BYTE_KIND:
9973        out = ucs4lib_rsplit(
9974            self,  buf1, len1, buf2, len2, maxcount);
9975        break;
9976    default:
9977        out = NULL;
9978    }
9979    if (kind1 != kind)
9980        PyMem_Free(buf1);
9981    if (kind2 != kind)
9982        PyMem_Free(buf2);
9983    return out;
9984}
9985
9986static Py_ssize_t
9987anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9988            PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
9989{
9990    switch (kind) {
9991    case PyUnicode_1BYTE_KIND:
9992        if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9993            return asciilib_find(buf1, len1, buf2, len2, offset);
9994        else
9995            return ucs1lib_find(buf1, len1, buf2, len2, offset);
9996    case PyUnicode_2BYTE_KIND:
9997        return ucs2lib_find(buf1, len1, buf2, len2, offset);
9998    case PyUnicode_4BYTE_KIND:
9999        return ucs4lib_find(buf1, len1, buf2, len2, offset);
10000    }
10001    assert(0);
10002    return -1;
10003}
10004
10005static Py_ssize_t
10006anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10007             PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10008{
10009    switch (kind) {
10010    case PyUnicode_1BYTE_KIND:
10011        if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10012            return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10013        else
10014            return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10015    case PyUnicode_2BYTE_KIND:
10016        return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10017    case PyUnicode_4BYTE_KIND:
10018        return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10019    }
10020    assert(0);
10021    return 0;
10022}
10023
10024static PyObject *
10025replace(PyObject *self, PyObject *str1,
10026        PyObject *str2, Py_ssize_t maxcount)
10027{
10028    PyObject *u;
10029    char *sbuf = PyUnicode_DATA(self);
10030    char *buf1 = PyUnicode_DATA(str1);
10031    char *buf2 = PyUnicode_DATA(str2);
10032    int srelease = 0, release1 = 0, release2 = 0;
10033    int skind = PyUnicode_KIND(self);
10034    int kind1 = PyUnicode_KIND(str1);
10035    int kind2 = PyUnicode_KIND(str2);
10036    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10037    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10038    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10039    int mayshrink;
10040    Py_UCS4 maxchar, maxchar_str2;
10041
10042    if (maxcount < 0)
10043        maxcount = PY_SSIZE_T_MAX;
10044    else if (maxcount == 0 || slen == 0)
10045        goto nothing;
10046
10047    if (str1 == str2)
10048        goto nothing;
10049    if (skind < kind1)
10050        /* substring too wide to be present */
10051        goto nothing;
10052
10053    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10054    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10055    /* Replacing str1 with str2 may cause a maxchar reduction in the
10056       result string. */
10057    mayshrink = (maxchar_str2 < maxchar);
10058    maxchar = MAX_MAXCHAR(maxchar, maxchar_str2);
10059
10060    if (len1 == len2) {
10061        /* same length */
10062        if (len1 == 0)
10063            goto nothing;
10064        if (len1 == 1) {
10065            /* replace characters */
10066            Py_UCS4 u1, u2;
10067            int rkind;
10068            Py_ssize_t index, pos;
10069            char *src;
10070
10071            u1 = PyUnicode_READ_CHAR(str1, 0);
10072            pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
10073            if (pos < 0)
10074                goto nothing;
10075            u2 = PyUnicode_READ_CHAR(str2, 0);
10076            u = PyUnicode_New(slen, maxchar);
10077            if (!u)
10078                goto error;
10079            _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10080            rkind = PyUnicode_KIND(u);
10081
10082            PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
10083            index = 0;
10084            src = sbuf;
10085            while (--maxcount)
10086            {
10087                pos++;
10088                src += pos * PyUnicode_KIND(self);
10089                slen -= pos;
10090                index += pos;
10091                pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
10092                if (pos < 0)
10093                    break;
10094                PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
10095            }
10096        }
10097        else {
10098            int rkind = skind;
10099            char *res;
10100            Py_ssize_t i;
10101
10102            if (kind1 < rkind) {
10103                /* widen substring */
10104                buf1 = _PyUnicode_AsKind(str1, rkind);
10105                if (!buf1) goto error;
10106                release1 = 1;
10107            }
10108            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10109            if (i < 0)
10110                goto nothing;
10111            if (rkind > kind2) {
10112                /* widen replacement */
10113                buf2 = _PyUnicode_AsKind(str2, rkind);
10114                if (!buf2) goto error;
10115                release2 = 1;
10116            }
10117            else if (rkind < kind2) {
10118                /* widen self and buf1 */
10119                rkind = kind2;
10120                if (release1) PyMem_Free(buf1);
10121                release1 = 0;
10122                sbuf = _PyUnicode_AsKind(self, rkind);
10123                if (!sbuf) goto error;
10124                srelease = 1;
10125                buf1 = _PyUnicode_AsKind(str1, rkind);
10126                if (!buf1) goto error;
10127                release1 = 1;
10128            }
10129            u = PyUnicode_New(slen, maxchar);
10130            if (!u)
10131                goto error;
10132            assert(PyUnicode_KIND(u) == rkind);
10133            res = PyUnicode_DATA(u);
10134
10135            memcpy(res, sbuf, rkind * slen);
10136            /* change everything in-place, starting with this one */
10137            memcpy(res + rkind * i,
10138                   buf2,
10139                   rkind * len2);
10140            i += len1;
10141
10142            while ( --maxcount > 0) {
10143                i = anylib_find(rkind, self,
10144                                sbuf+rkind*i, slen-i,
10145                                str1, buf1, len1, i);
10146                if (i == -1)
10147                    break;
10148                memcpy(res + rkind * i,
10149                       buf2,
10150                       rkind * len2);
10151                i += len1;
10152            }
10153        }
10154    }
10155    else {
10156        Py_ssize_t n, i, j, ires;
10157        Py_ssize_t new_size;
10158        int rkind = skind;
10159        char *res;
10160
10161        if (kind1 < rkind) {
10162            /* widen substring */
10163            buf1 = _PyUnicode_AsKind(str1, rkind);
10164            if (!buf1) goto error;
10165            release1 = 1;
10166        }
10167        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10168        if (n == 0)
10169            goto nothing;
10170        if (kind2 < rkind) {
10171            /* widen replacement */
10172            buf2 = _PyUnicode_AsKind(str2, rkind);
10173            if (!buf2) goto error;
10174            release2 = 1;
10175        }
10176        else if (kind2 > rkind) {
10177            /* widen self and buf1 */
10178            rkind = kind2;
10179            sbuf = _PyUnicode_AsKind(self, rkind);
10180            if (!sbuf) goto error;
10181            srelease = 1;
10182            if (release1) PyMem_Free(buf1);
10183            release1 = 0;
10184            buf1 = _PyUnicode_AsKind(str1, rkind);
10185            if (!buf1) goto error;
10186            release1 = 1;
10187        }
10188        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10189           PyUnicode_GET_LENGTH(str1))); */
10190        if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10191                PyErr_SetString(PyExc_OverflowError,
10192                                "replace string is too long");
10193                goto error;
10194        }
10195        new_size = slen + n * (len2 - len1);
10196        if (new_size == 0) {
10197            Py_INCREF(unicode_empty);
10198            u = unicode_empty;
10199            goto done;
10200        }
10201        if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10202            PyErr_SetString(PyExc_OverflowError,
10203                            "replace string is too long");
10204            goto error;
10205        }
10206        u = PyUnicode_New(new_size, maxchar);
10207        if (!u)
10208            goto error;
10209        assert(PyUnicode_KIND(u) == rkind);
10210        res = PyUnicode_DATA(u);
10211        ires = i = 0;
10212        if (len1 > 0) {
10213            while (n-- > 0) {
10214                /* look for next match */
10215                j = anylib_find(rkind, self,
10216                                sbuf + rkind * i, slen-i,
10217                                str1, buf1, len1, i);
10218                if (j == -1)
10219                    break;
10220                else if (j > i) {
10221                    /* copy unchanged part [i:j] */
10222                    memcpy(res + rkind * ires,
10223                           sbuf + rkind * i,
10224                           rkind * (j-i));
10225                    ires += j - i;
10226                }
10227                /* copy substitution string */
10228                if (len2 > 0) {
10229                    memcpy(res + rkind * ires,
10230                           buf2,
10231                           rkind * len2);
10232                    ires += len2;
10233                }
10234                i = j + len1;
10235            }
10236            if (i < slen)
10237                /* copy tail [i:] */
10238                memcpy(res + rkind * ires,
10239                       sbuf + rkind * i,
10240                       rkind * (slen-i));
10241        }
10242        else {
10243            /* interleave */
10244            while (n > 0) {
10245                memcpy(res + rkind * ires,
10246                       buf2,
10247                       rkind * len2);
10248                ires += len2;
10249                if (--n <= 0)
10250                    break;
10251                memcpy(res + rkind * ires,
10252                       sbuf + rkind * i,
10253                       rkind);
10254                ires++;
10255                i++;
10256            }
10257            memcpy(res + rkind * ires,
10258                   sbuf + rkind * i,
10259                   rkind * (slen-i));
10260        }
10261    }
10262
10263    if (mayshrink) {
10264        unicode_adjust_maxchar(&u);
10265        if (u == NULL)
10266            goto error;
10267    }
10268
10269  done:
10270    if (srelease)
10271        PyMem_FREE(sbuf);
10272    if (release1)
10273        PyMem_FREE(buf1);
10274    if (release2)
10275        PyMem_FREE(buf2);
10276    assert(_PyUnicode_CheckConsistency(u, 1));
10277    return u;
10278
10279  nothing:
10280    /* nothing to replace; return original string (when possible) */
10281    if (srelease)
10282        PyMem_FREE(sbuf);
10283    if (release1)
10284        PyMem_FREE(buf1);
10285    if (release2)
10286        PyMem_FREE(buf2);
10287    return unicode_result_unchanged(self);
10288
10289  error:
10290    if (srelease && sbuf)
10291        PyMem_FREE(sbuf);
10292    if (release1 && buf1)
10293        PyMem_FREE(buf1);
10294    if (release2 && buf2)
10295        PyMem_FREE(buf2);
10296    return NULL;
10297}
10298
10299/* --- Unicode Object Methods --------------------------------------------- */
10300
10301PyDoc_STRVAR(title__doc__,
10302             "S.title() -> str\n\
10303\n\
10304Return a titlecased version of S, i.e. words start with title case\n\
10305characters, all remaining cased characters have lower case.");
10306
10307static PyObject*
10308unicode_title(PyObject *self)
10309{
10310    if (PyUnicode_READY(self) == -1)
10311        return NULL;
10312    return case_operation(self, do_title);
10313}
10314
10315PyDoc_STRVAR(capitalize__doc__,
10316             "S.capitalize() -> str\n\
10317\n\
10318Return a capitalized version of S, i.e. make the first character\n\
10319have upper case and the rest lower case.");
10320
10321static PyObject*
10322unicode_capitalize(PyObject *self)
10323{
10324    if (PyUnicode_READY(self) == -1)
10325        return NULL;
10326    if (PyUnicode_GET_LENGTH(self) == 0)
10327        return unicode_result_unchanged(self);
10328    return case_operation(self, do_capitalize);
10329}
10330
10331PyDoc_STRVAR(casefold__doc__,
10332             "S.casefold() -> str\n\
10333\n\
10334Return a version of S suitable for caseless comparisons.");
10335
10336static PyObject *
10337unicode_casefold(PyObject *self)
10338{
10339    if (PyUnicode_READY(self) == -1)
10340        return NULL;
10341    if (PyUnicode_IS_ASCII(self))
10342        return ascii_upper_or_lower(self, 1);
10343    return case_operation(self, do_casefold);
10344}
10345
10346
10347/* Argument converter.  Coerces to a single unicode character */
10348
10349static int
10350convert_uc(PyObject *obj, void *addr)
10351{
10352    Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10353    PyObject *uniobj;
10354
10355    uniobj = PyUnicode_FromObject(obj);
10356    if (uniobj == NULL) {
10357        PyErr_SetString(PyExc_TypeError,
10358                        "The fill character cannot be converted to Unicode");
10359        return 0;
10360    }
10361    if (PyUnicode_GET_LENGTH(uniobj) != 1) {
10362        PyErr_SetString(PyExc_TypeError,
10363                        "The fill character must be exactly one character long");
10364        Py_DECREF(uniobj);
10365        return 0;
10366    }
10367    *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
10368    Py_DECREF(uniobj);
10369    return 1;
10370}
10371
10372PyDoc_STRVAR(center__doc__,
10373             "S.center(width[, fillchar]) -> str\n\
10374\n\
10375Return S centered in a string of length width. Padding is\n\
10376done using the specified fill character (default is a space)");
10377
10378static PyObject *
10379unicode_center(PyObject *self, PyObject *args)
10380{
10381    Py_ssize_t marg, left;
10382    Py_ssize_t width;
10383    Py_UCS4 fillchar = ' ';
10384
10385    if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
10386        return NULL;
10387
10388    if (PyUnicode_READY(self) == -1)
10389        return NULL;
10390
10391    if (PyUnicode_GET_LENGTH(self) >= width)
10392        return unicode_result_unchanged(self);
10393
10394    marg = width - PyUnicode_GET_LENGTH(self);
10395    left = marg / 2 + (marg & width & 1);
10396
10397    return pad(self, left, marg - left, fillchar);
10398}
10399
10400/* This function assumes that str1 and str2 are readied by the caller. */
10401
10402static int
10403unicode_compare(PyObject *str1, PyObject *str2)
10404{
10405    int kind1, kind2;
10406    void *data1, *data2;
10407    Py_ssize_t len1, len2, i;
10408
10409    kind1 = PyUnicode_KIND(str1);
10410    kind2 = PyUnicode_KIND(str2);
10411    data1 = PyUnicode_DATA(str1);
10412    data2 = PyUnicode_DATA(str2);
10413    len1 = PyUnicode_GET_LENGTH(str1);
10414    len2 = PyUnicode_GET_LENGTH(str2);
10415
10416    for (i = 0; i < len1 && i < len2; ++i) {
10417        Py_UCS4 c1, c2;
10418        c1 = PyUnicode_READ(kind1, data1, i);
10419        c2 = PyUnicode_READ(kind2, data2, i);
10420
10421        if (c1 != c2)
10422            return (c1 < c2) ? -1 : 1;
10423    }
10424
10425    return (len1 < len2) ? -1 : (len1 != len2);
10426}
10427
10428int
10429PyUnicode_Compare(PyObject *left, PyObject *right)
10430{
10431    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10432        if (PyUnicode_READY(left) == -1 ||
10433            PyUnicode_READY(right) == -1)
10434            return -1;
10435        return unicode_compare(left, right);
10436    }
10437    PyErr_Format(PyExc_TypeError,
10438                 "Can't compare %.100s and %.100s",
10439                 left->ob_type->tp_name,
10440                 right->ob_type->tp_name);
10441    return -1;
10442}
10443
10444int
10445PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10446{
10447    Py_ssize_t i;
10448    int kind;
10449    void *data;
10450    Py_UCS4 chr;
10451
10452    assert(_PyUnicode_CHECK(uni));
10453    if (PyUnicode_READY(uni) == -1)
10454        return -1;
10455    kind = PyUnicode_KIND(uni);
10456    data = PyUnicode_DATA(uni);
10457    /* Compare Unicode string and source character set string */
10458    for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10459        if (chr != str[i])
10460            return (chr < (unsigned char)(str[i])) ? -1 : 1;
10461    /* This check keeps Python strings that end in '\0' from comparing equal
10462     to C strings identical up to that point. */
10463    if (PyUnicode_GET_LENGTH(uni) != i || chr)
10464        return 1; /* uni is longer */
10465    if (str[i])
10466        return -1; /* str is longer */
10467    return 0;
10468}
10469
10470
10471#define TEST_COND(cond)                         \
10472    ((cond) ? Py_True : Py_False)
10473
10474PyObject *
10475PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
10476{
10477    int result;
10478
10479    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10480        PyObject *v;
10481        if (PyUnicode_READY(left) == -1 ||
10482            PyUnicode_READY(right) == -1)
10483            return NULL;
10484        if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10485            PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
10486            if (op == Py_EQ) {
10487                Py_INCREF(Py_False);
10488                return Py_False;
10489            }
10490            if (op == Py_NE) {
10491                Py_INCREF(Py_True);
10492                return Py_True;
10493            }
10494        }
10495        if (left == right)
10496            result = 0;
10497        else
10498            result = unicode_compare(left, right);
10499
10500        /* Convert the return value to a Boolean */
10501        switch (op) {
10502        case Py_EQ:
10503            v = TEST_COND(result == 0);
10504            break;
10505        case Py_NE:
10506            v = TEST_COND(result != 0);
10507            break;
10508        case Py_LE:
10509            v = TEST_COND(result <= 0);
10510            break;
10511        case Py_GE:
10512            v = TEST_COND(result >= 0);
10513            break;
10514        case Py_LT:
10515            v = TEST_COND(result == -1);
10516            break;
10517        case Py_GT:
10518            v = TEST_COND(result == 1);
10519            break;
10520        default:
10521            PyErr_BadArgument();
10522            return NULL;
10523        }
10524        Py_INCREF(v);
10525        return v;
10526    }
10527
10528    Py_RETURN_NOTIMPLEMENTED;
10529}
10530
10531int
10532PyUnicode_Contains(PyObject *container, PyObject *element)
10533{
10534    PyObject *str, *sub;
10535    int kind1, kind2, kind;
10536    void *buf1, *buf2;
10537    Py_ssize_t len1, len2;
10538    int result;
10539
10540    /* Coerce the two arguments */
10541    sub = PyUnicode_FromObject(element);
10542    if (!sub) {
10543        PyErr_Format(PyExc_TypeError,
10544                     "'in <string>' requires string as left operand, not %s",
10545                     element->ob_type->tp_name);
10546        return -1;
10547    }
10548
10549    str = PyUnicode_FromObject(container);
10550    if (!str) {
10551        Py_DECREF(sub);
10552        return -1;
10553    }
10554    if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10555        Py_DECREF(sub);
10556        Py_DECREF(str);
10557    }
10558
10559    kind1 = PyUnicode_KIND(str);
10560    kind2 = PyUnicode_KIND(sub);
10561    kind = kind1;
10562    buf1 = PyUnicode_DATA(str);
10563    buf2 = PyUnicode_DATA(sub);
10564    if (kind2 != kind) {
10565        if (kind2 > kind) {
10566            Py_DECREF(sub);
10567            Py_DECREF(str);
10568            return 0;
10569        }
10570        buf2 = _PyUnicode_AsKind(sub, kind);
10571    }
10572    if (!buf2) {
10573        Py_DECREF(sub);
10574        Py_DECREF(str);
10575        return -1;
10576    }
10577    len1 = PyUnicode_GET_LENGTH(str);
10578    len2 = PyUnicode_GET_LENGTH(sub);
10579
10580    switch (kind) {
10581    case PyUnicode_1BYTE_KIND:
10582        result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10583        break;
10584    case PyUnicode_2BYTE_KIND:
10585        result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10586        break;
10587    case PyUnicode_4BYTE_KIND:
10588        result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10589        break;
10590    default:
10591        result = -1;
10592        assert(0);
10593    }
10594
10595    Py_DECREF(str);
10596    Py_DECREF(sub);
10597
10598    if (kind2 != kind)
10599        PyMem_Free(buf2);
10600
10601    return result;
10602}
10603
10604/* Concat to string or Unicode object giving a new Unicode object. */
10605
10606PyObject *
10607PyUnicode_Concat(PyObject *left, PyObject *right)
10608{
10609    PyObject *u = NULL, *v = NULL, *w;
10610    Py_UCS4 maxchar, maxchar2;
10611    Py_ssize_t u_len, v_len, new_len;
10612
10613    /* Coerce the two arguments */
10614    u = PyUnicode_FromObject(left);
10615    if (u == NULL)
10616        goto onError;
10617    v = PyUnicode_FromObject(right);
10618    if (v == NULL)
10619        goto onError;
10620
10621    /* Shortcuts */
10622    if (v == unicode_empty) {
10623        Py_DECREF(v);
10624        return u;
10625    }
10626    if (u == unicode_empty) {
10627        Py_DECREF(u);
10628        return v;
10629    }
10630
10631    u_len = PyUnicode_GET_LENGTH(u);
10632    v_len = PyUnicode_GET_LENGTH(v);
10633    if (u_len > PY_SSIZE_T_MAX - v_len) {
10634        PyErr_SetString(PyExc_OverflowError,
10635                        "strings are too large to concat");
10636        goto onError;
10637    }
10638    new_len = u_len + v_len;
10639
10640    maxchar = PyUnicode_MAX_CHAR_VALUE(u);
10641    maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10642    maxchar = MAX_MAXCHAR(maxchar, maxchar2);
10643
10644    /* Concat the two Unicode strings */
10645    w = PyUnicode_New(new_len, maxchar);
10646    if (w == NULL)
10647        goto onError;
10648    _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
10649    _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
10650    Py_DECREF(u);
10651    Py_DECREF(v);
10652    assert(_PyUnicode_CheckConsistency(w, 1));
10653    return w;
10654
10655  onError:
10656    Py_XDECREF(u);
10657    Py_XDECREF(v);
10658    return NULL;
10659}
10660
10661void
10662PyUnicode_Append(PyObject **p_left, PyObject *right)
10663{
10664    PyObject *left, *res;
10665    Py_UCS4 maxchar, maxchar2;
10666    Py_ssize_t left_len, right_len, new_len;
10667
10668    if (p_left == NULL) {
10669        if (!PyErr_Occurred())
10670            PyErr_BadInternalCall();
10671        return;
10672    }
10673    left = *p_left;
10674    if (right == NULL || !PyUnicode_Check(left)) {
10675        if (!PyErr_Occurred())
10676            PyErr_BadInternalCall();
10677        goto error;
10678    }
10679
10680    if (PyUnicode_READY(left) == -1)
10681        goto error;
10682    if (PyUnicode_READY(right) == -1)
10683        goto error;
10684
10685    /* Shortcuts */
10686    if (left == unicode_empty) {
10687        Py_DECREF(left);
10688        Py_INCREF(right);
10689        *p_left = right;
10690        return;
10691    }
10692    if (right == unicode_empty)
10693        return;
10694
10695    left_len = PyUnicode_GET_LENGTH(left);
10696    right_len = PyUnicode_GET_LENGTH(right);
10697    if (left_len > PY_SSIZE_T_MAX - right_len) {
10698        PyErr_SetString(PyExc_OverflowError,
10699                        "strings are too large to concat");
10700        goto error;
10701    }
10702    new_len = left_len + right_len;
10703
10704    if (unicode_modifiable(left)
10705        && PyUnicode_CheckExact(right)
10706        && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
10707        /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10708           to change the structure size, but characters are stored just after
10709           the structure, and so it requires to move all characters which is
10710           not so different than duplicating the string. */
10711        && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10712    {
10713        /* append inplace */
10714        if (unicode_resize(p_left, new_len) != 0) {
10715            /* XXX if _PyUnicode_Resize() fails, 'left' has been
10716             * deallocated so it cannot be put back into
10717             * 'variable'.  The MemoryError is raised when there
10718             * is no value in 'variable', which might (very
10719             * remotely) be a cause of incompatibilities.
10720             */
10721            goto error;
10722        }
10723        /* copy 'right' into the newly allocated area of 'left' */
10724        _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
10725    }
10726    else {
10727        maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10728        maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
10729        maxchar = MAX_MAXCHAR(maxchar, maxchar2);
10730
10731        /* Concat the two Unicode strings */
10732        res = PyUnicode_New(new_len, maxchar);
10733        if (res == NULL)
10734            goto error;
10735        _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
10736        _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
10737        Py_DECREF(left);
10738        *p_left = res;
10739    }
10740    assert(_PyUnicode_CheckConsistency(*p_left, 1));
10741    return;
10742
10743error:
10744    Py_CLEAR(*p_left);
10745}
10746
10747void
10748PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10749{
10750    PyUnicode_Append(pleft, right);
10751    Py_XDECREF(right);
10752}
10753
10754PyDoc_STRVAR(count__doc__,
10755             "S.count(sub[, start[, end]]) -> int\n\
10756\n\
10757Return the number of non-overlapping occurrences of substring sub in\n\
10758string S[start:end].  Optional arguments start and end are\n\
10759interpreted as in slice notation.");
10760
10761static PyObject *
10762unicode_count(PyObject *self, PyObject *args)
10763{
10764    PyObject *substring;
10765    Py_ssize_t start = 0;
10766    Py_ssize_t end = PY_SSIZE_T_MAX;
10767    PyObject *result;
10768    int kind1, kind2, kind;
10769    void *buf1, *buf2;
10770    Py_ssize_t len1, len2, iresult;
10771
10772    if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10773                                            &start, &end))
10774        return NULL;
10775
10776    kind1 = PyUnicode_KIND(self);
10777    kind2 = PyUnicode_KIND(substring);
10778    if (kind2 > kind1)
10779        return PyLong_FromLong(0);
10780    kind = kind1;
10781    buf1 = PyUnicode_DATA(self);
10782    buf2 = PyUnicode_DATA(substring);
10783    if (kind2 != kind)
10784        buf2 = _PyUnicode_AsKind(substring, kind);
10785    if (!buf2) {
10786        Py_DECREF(substring);
10787        return NULL;
10788    }
10789    len1 = PyUnicode_GET_LENGTH(self);
10790    len2 = PyUnicode_GET_LENGTH(substring);
10791
10792    ADJUST_INDICES(start, end, len1);
10793    switch (kind) {
10794    case PyUnicode_1BYTE_KIND:
10795        iresult = ucs1lib_count(
10796            ((Py_UCS1*)buf1) + start, end - start,
10797            buf2, len2, PY_SSIZE_T_MAX
10798            );
10799        break;
10800    case PyUnicode_2BYTE_KIND:
10801        iresult = ucs2lib_count(
10802            ((Py_UCS2*)buf1) + start, end - start,
10803            buf2, len2, PY_SSIZE_T_MAX
10804            );
10805        break;
10806    case PyUnicode_4BYTE_KIND:
10807        iresult = ucs4lib_count(
10808            ((Py_UCS4*)buf1) + start, end - start,
10809            buf2, len2, PY_SSIZE_T_MAX
10810            );
10811        break;
10812    default:
10813        assert(0); iresult = 0;
10814    }
10815
10816    result = PyLong_FromSsize_t(iresult);
10817
10818    if (kind2 != kind)
10819        PyMem_Free(buf2);
10820
10821    Py_DECREF(substring);
10822
10823    return result;
10824}
10825
10826PyDoc_STRVAR(encode__doc__,
10827             "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
10828\n\
10829Encode S using the codec registered for encoding. Default encoding\n\
10830is 'utf-8'. errors may be given to set a different error\n\
10831handling scheme. Default is 'strict' meaning that encoding errors raise\n\
10832a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10833'xmlcharrefreplace' as well as any other name registered with\n\
10834codecs.register_error that can handle UnicodeEncodeErrors.");
10835
10836static PyObject *
10837unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
10838{
10839    static char *kwlist[] = {"encoding", "errors", 0};
10840    char *encoding = NULL;
10841    char *errors = NULL;
10842
10843    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10844                                     kwlist, &encoding, &errors))
10845        return NULL;
10846    return PyUnicode_AsEncodedString(self, encoding, errors);
10847}
10848
10849PyDoc_STRVAR(expandtabs__doc__,
10850             "S.expandtabs([tabsize]) -> str\n\
10851\n\
10852Return a copy of S where all tab characters are expanded using spaces.\n\
10853If tabsize is not given, a tab size of 8 characters is assumed.");
10854
10855static PyObject*
10856unicode_expandtabs(PyObject *self, PyObject *args)
10857{
10858    Py_ssize_t i, j, line_pos, src_len, incr;
10859    Py_UCS4 ch;
10860    PyObject *u;
10861    void *src_data, *dest_data;
10862    int tabsize = 8;
10863    int kind;
10864    int found;
10865
10866    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
10867        return NULL;
10868
10869    if (PyUnicode_READY(self) == -1)
10870        return NULL;
10871
10872    /* First pass: determine size of output string */
10873    src_len = PyUnicode_GET_LENGTH(self);
10874    i = j = line_pos = 0;
10875    kind = PyUnicode_KIND(self);
10876    src_data = PyUnicode_DATA(self);
10877    found = 0;
10878    for (; i < src_len; i++) {
10879        ch = PyUnicode_READ(kind, src_data, i);
10880        if (ch == '\t') {
10881            found = 1;
10882            if (tabsize > 0) {
10883                incr = tabsize - (line_pos % tabsize); /* cannot overflow */
10884                if (j > PY_SSIZE_T_MAX - incr)
10885                    goto overflow;
10886                line_pos += incr;
10887                j += incr;
10888            }
10889        }
10890        else {
10891            if (j > PY_SSIZE_T_MAX - 1)
10892                goto overflow;
10893            line_pos++;
10894            j++;
10895            if (ch == '\n' || ch == '\r')
10896                line_pos = 0;
10897        }
10898    }
10899    if (!found)
10900        return unicode_result_unchanged(self);
10901
10902    /* Second pass: create output string and fill it */
10903    u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
10904    if (!u)
10905        return NULL;
10906    dest_data = PyUnicode_DATA(u);
10907
10908    i = j = line_pos = 0;
10909
10910    for (; i < src_len; i++) {
10911        ch = PyUnicode_READ(kind, src_data, i);
10912        if (ch == '\t') {
10913            if (tabsize > 0) {
10914                incr = tabsize - (line_pos % tabsize);
10915                line_pos += incr;
10916                FILL(kind, dest_data, ' ', j, incr);
10917                j += incr;
10918            }
10919        }
10920        else {
10921            line_pos++;
10922            PyUnicode_WRITE(kind, dest_data, j, ch);
10923            j++;
10924            if (ch == '\n' || ch == '\r')
10925                line_pos = 0;
10926        }
10927    }
10928    assert (j == PyUnicode_GET_LENGTH(u));
10929    return unicode_result(u);
10930
10931  overflow:
10932    PyErr_SetString(PyExc_OverflowError, "new string is too long");
10933    return NULL;
10934}
10935
10936PyDoc_STRVAR(find__doc__,
10937             "S.find(sub[, start[, end]]) -> int\n\
10938\n\
10939Return the lowest index in S where substring sub is found,\n\
10940such that sub is contained within S[start:end].  Optional\n\
10941arguments start and end are interpreted as in slice notation.\n\
10942\n\
10943Return -1 on failure.");
10944
10945static PyObject *
10946unicode_find(PyObject *self, PyObject *args)
10947{
10948    PyObject *substring;
10949    Py_ssize_t start;
10950    Py_ssize_t end;
10951    Py_ssize_t result;
10952
10953    if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10954                                            &start, &end))
10955        return NULL;
10956
10957    if (PyUnicode_READY(self) == -1)
10958        return NULL;
10959    if (PyUnicode_READY(substring) == -1)
10960        return NULL;
10961
10962    result = any_find_slice(1, self, substring, start, end);
10963
10964    Py_DECREF(substring);
10965
10966    if (result == -2)
10967        return NULL;
10968
10969    return PyLong_FromSsize_t(result);
10970}
10971
10972static PyObject *
10973unicode_getitem(PyObject *self, Py_ssize_t index)
10974{
10975    void *data;
10976    enum PyUnicode_Kind kind;
10977    Py_UCS4 ch;
10978    PyObject *res;
10979
10980    if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
10981        PyErr_BadArgument();
10982        return NULL;
10983    }
10984    if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
10985        PyErr_SetString(PyExc_IndexError, "string index out of range");
10986        return NULL;
10987    }
10988    kind = PyUnicode_KIND(self);
10989    data = PyUnicode_DATA(self);
10990    ch = PyUnicode_READ(kind, data, index);
10991    if (ch < 256)
10992        return get_latin1_char(ch);
10993
10994    res = PyUnicode_New(1, ch);
10995    if (res == NULL)
10996        return NULL;
10997    kind = PyUnicode_KIND(res);
10998    data = PyUnicode_DATA(res);
10999    PyUnicode_WRITE(kind, data, 0, ch);
11000    assert(_PyUnicode_CheckConsistency(res, 1));
11001    return res;
11002}
11003
11004/* Believe it or not, this produces the same value for ASCII strings
11005   as bytes_hash(). */
11006static Py_hash_t
11007unicode_hash(PyObject *self)
11008{
11009    Py_ssize_t len;
11010    Py_uhash_t x;
11011
11012#ifdef Py_DEBUG
11013    assert(_Py_HashSecret_Initialized);
11014#endif
11015    if (_PyUnicode_HASH(self) != -1)
11016        return _PyUnicode_HASH(self);
11017    if (PyUnicode_READY(self) == -1)
11018        return -1;
11019    len = PyUnicode_GET_LENGTH(self);
11020    /*
11021      We make the hash of the empty string be 0, rather than using
11022      (prefix ^ suffix), since this slightly obfuscates the hash secret
11023    */
11024    if (len == 0) {
11025        _PyUnicode_HASH(self) = 0;
11026        return 0;
11027    }
11028
11029    /* The hash function as a macro, gets expanded three times below. */
11030#define HASH(P)                                            \
11031    x ^= (Py_uhash_t) *P << 7;                             \
11032    while (--len >= 0)                                     \
11033        x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++;  \
11034
11035    x = (Py_uhash_t) _Py_HashSecret.prefix;
11036    switch (PyUnicode_KIND(self)) {
11037    case PyUnicode_1BYTE_KIND: {
11038        const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11039        HASH(c);
11040        break;
11041    }
11042    case PyUnicode_2BYTE_KIND: {
11043        const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11044        HASH(s);
11045        break;
11046    }
11047    default: {
11048        Py_UCS4 *l;
11049        assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11050               "Impossible switch case in unicode_hash");
11051        l = PyUnicode_4BYTE_DATA(self);
11052        HASH(l);
11053        break;
11054    }
11055    }
11056    x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
11057    x ^= (Py_uhash_t) _Py_HashSecret.suffix;
11058
11059    if (x == -1)
11060        x = -2;
11061    _PyUnicode_HASH(self) = x;
11062    return x;
11063}
11064#undef HASH
11065
11066PyDoc_STRVAR(index__doc__,
11067             "S.index(sub[, start[, end]]) -> int\n\
11068\n\
11069Like S.find() but raise ValueError when the substring is not found.");
11070
11071static PyObject *
11072unicode_index(PyObject *self, PyObject *args)
11073{
11074    Py_ssize_t result;
11075    PyObject *substring;
11076    Py_ssize_t start;
11077    Py_ssize_t end;
11078
11079    if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11080                                            &start, &end))
11081        return NULL;
11082
11083    if (PyUnicode_READY(self) == -1)
11084        return NULL;
11085    if (PyUnicode_READY(substring) == -1)
11086        return NULL;
11087
11088    result = any_find_slice(1, self, substring, start, end);
11089
11090    Py_DECREF(substring);
11091
11092    if (result == -2)
11093        return NULL;
11094
11095    if (result < 0) {
11096        PyErr_SetString(PyExc_ValueError, "substring not found");
11097        return NULL;
11098    }
11099
11100    return PyLong_FromSsize_t(result);
11101}
11102
11103PyDoc_STRVAR(islower__doc__,
11104             "S.islower() -> bool\n\
11105\n\
11106Return True if all cased characters in S are lowercase and there is\n\
11107at least one cased character in S, False otherwise.");
11108
11109static PyObject*
11110unicode_islower(PyObject *self)
11111{
11112    Py_ssize_t i, length;
11113    int kind;
11114    void *data;
11115    int cased;
11116
11117    if (PyUnicode_READY(self) == -1)
11118        return NULL;
11119    length = PyUnicode_GET_LENGTH(self);
11120    kind = PyUnicode_KIND(self);
11121    data = PyUnicode_DATA(self);
11122
11123    /* Shortcut for single character strings */
11124    if (length == 1)
11125        return PyBool_FromLong(
11126            Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
11127
11128    /* Special case for empty strings */
11129    if (length == 0)
11130        return PyBool_FromLong(0);
11131
11132    cased = 0;
11133    for (i = 0; i < length; i++) {
11134        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11135
11136        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11137            return PyBool_FromLong(0);
11138        else if (!cased && Py_UNICODE_ISLOWER(ch))
11139            cased = 1;
11140    }
11141    return PyBool_FromLong(cased);
11142}
11143
11144PyDoc_STRVAR(isupper__doc__,
11145             "S.isupper() -> bool\n\
11146\n\
11147Return True if all cased characters in S are uppercase and there is\n\
11148at least one cased character in S, False otherwise.");
11149
11150static PyObject*
11151unicode_isupper(PyObject *self)
11152{
11153    Py_ssize_t i, length;
11154    int kind;
11155    void *data;
11156    int cased;
11157
11158    if (PyUnicode_READY(self) == -1)
11159        return NULL;
11160    length = PyUnicode_GET_LENGTH(self);
11161    kind = PyUnicode_KIND(self);
11162    data = PyUnicode_DATA(self);
11163
11164    /* Shortcut for single character strings */
11165    if (length == 1)
11166        return PyBool_FromLong(
11167            Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
11168
11169    /* Special case for empty strings */
11170    if (length == 0)
11171        return PyBool_FromLong(0);
11172
11173    cased = 0;
11174    for (i = 0; i < length; i++) {
11175        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11176
11177        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11178            return PyBool_FromLong(0);
11179        else if (!cased && Py_UNICODE_ISUPPER(ch))
11180            cased = 1;
11181    }
11182    return PyBool_FromLong(cased);
11183}
11184
11185PyDoc_STRVAR(istitle__doc__,
11186             "S.istitle() -> bool\n\
11187\n\
11188Return True if S is a titlecased string and there is at least one\n\
11189character in S, i.e. upper- and titlecase characters may only\n\
11190follow uncased characters and lowercase characters only cased ones.\n\
11191Return False otherwise.");
11192
11193static PyObject*
11194unicode_istitle(PyObject *self)
11195{
11196    Py_ssize_t i, length;
11197    int kind;
11198    void *data;
11199    int cased, previous_is_cased;
11200
11201    if (PyUnicode_READY(self) == -1)
11202        return NULL;
11203    length = PyUnicode_GET_LENGTH(self);
11204    kind = PyUnicode_KIND(self);
11205    data = PyUnicode_DATA(self);
11206
11207    /* Shortcut for single character strings */
11208    if (length == 1) {
11209        Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11210        return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11211                               (Py_UNICODE_ISUPPER(ch) != 0));
11212    }
11213
11214    /* Special case for empty strings */
11215    if (length == 0)
11216        return PyBool_FromLong(0);
11217
11218    cased = 0;
11219    previous_is_cased = 0;
11220    for (i = 0; i < length; i++) {
11221        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11222
11223        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11224            if (previous_is_cased)
11225                return PyBool_FromLong(0);
11226            previous_is_cased = 1;
11227            cased = 1;
11228        }
11229        else if (Py_UNICODE_ISLOWER(ch)) {
11230            if (!previous_is_cased)
11231                return PyBool_FromLong(0);
11232            previous_is_cased = 1;
11233            cased = 1;
11234        }
11235        else
11236            previous_is_cased = 0;
11237    }
11238    return PyBool_FromLong(cased);
11239}
11240
11241PyDoc_STRVAR(isspace__doc__,
11242             "S.isspace() -> bool\n\
11243\n\
11244Return True if all characters in S are whitespace\n\
11245and there is at least one character in S, False otherwise.");
11246
11247static PyObject*
11248unicode_isspace(PyObject *self)
11249{
11250    Py_ssize_t i, length;
11251    int kind;
11252    void *data;
11253
11254    if (PyUnicode_READY(self) == -1)
11255        return NULL;
11256    length = PyUnicode_GET_LENGTH(self);
11257    kind = PyUnicode_KIND(self);
11258    data = PyUnicode_DATA(self);
11259
11260    /* Shortcut for single character strings */
11261    if (length == 1)
11262        return PyBool_FromLong(
11263            Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
11264
11265    /* Special case for empty strings */
11266    if (length == 0)
11267        return PyBool_FromLong(0);
11268
11269    for (i = 0; i < length; i++) {
11270        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11271        if (!Py_UNICODE_ISSPACE(ch))
11272            return PyBool_FromLong(0);
11273    }
11274    return PyBool_FromLong(1);
11275}
11276
11277PyDoc_STRVAR(isalpha__doc__,
11278             "S.isalpha() -> bool\n\
11279\n\
11280Return True if all characters in S are alphabetic\n\
11281and there is at least one character in S, False otherwise.");
11282
11283static PyObject*
11284unicode_isalpha(PyObject *self)
11285{
11286    Py_ssize_t i, length;
11287    int kind;
11288    void *data;
11289
11290    if (PyUnicode_READY(self) == -1)
11291        return NULL;
11292    length = PyUnicode_GET_LENGTH(self);
11293    kind = PyUnicode_KIND(self);
11294    data = PyUnicode_DATA(self);
11295
11296    /* Shortcut for single character strings */
11297    if (length == 1)
11298        return PyBool_FromLong(
11299            Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
11300
11301    /* Special case for empty strings */
11302    if (length == 0)
11303        return PyBool_FromLong(0);
11304
11305    for (i = 0; i < length; i++) {
11306        if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
11307            return PyBool_FromLong(0);
11308    }
11309    return PyBool_FromLong(1);
11310}
11311
11312PyDoc_STRVAR(isalnum__doc__,
11313             "S.isalnum() -> bool\n\
11314\n\
11315Return True if all characters in S are alphanumeric\n\
11316and there is at least one character in S, False otherwise.");
11317
11318static PyObject*
11319unicode_isalnum(PyObject *self)
11320{
11321    int kind;
11322    void *data;
11323    Py_ssize_t len, i;
11324
11325    if (PyUnicode_READY(self) == -1)
11326        return NULL;
11327
11328    kind = PyUnicode_KIND(self);
11329    data = PyUnicode_DATA(self);
11330    len = PyUnicode_GET_LENGTH(self);
11331
11332    /* Shortcut for single character strings */
11333    if (len == 1) {
11334        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11335        return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11336    }
11337
11338    /* Special case for empty strings */
11339    if (len == 0)
11340        return PyBool_FromLong(0);
11341
11342    for (i = 0; i < len; i++) {
11343        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11344        if (!Py_UNICODE_ISALNUM(ch))
11345            return PyBool_FromLong(0);
11346    }
11347    return PyBool_FromLong(1);
11348}
11349
11350PyDoc_STRVAR(isdecimal__doc__,
11351             "S.isdecimal() -> bool\n\
11352\n\
11353Return True if there are only decimal characters in S,\n\
11354False otherwise.");
11355
11356static PyObject*
11357unicode_isdecimal(PyObject *self)
11358{
11359    Py_ssize_t i, length;
11360    int kind;
11361    void *data;
11362
11363    if (PyUnicode_READY(self) == -1)
11364        return NULL;
11365    length = PyUnicode_GET_LENGTH(self);
11366    kind = PyUnicode_KIND(self);
11367    data = PyUnicode_DATA(self);
11368
11369    /* Shortcut for single character strings */
11370    if (length == 1)
11371        return PyBool_FromLong(
11372            Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
11373
11374    /* Special case for empty strings */
11375    if (length == 0)
11376        return PyBool_FromLong(0);
11377
11378    for (i = 0; i < length; i++) {
11379        if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
11380            return PyBool_FromLong(0);
11381    }
11382    return PyBool_FromLong(1);
11383}
11384
11385PyDoc_STRVAR(isdigit__doc__,
11386             "S.isdigit() -> bool\n\
11387\n\
11388Return True if all characters in S are digits\n\
11389and there is at least one character in S, False otherwise.");
11390
11391static PyObject*
11392unicode_isdigit(PyObject *self)
11393{
11394    Py_ssize_t i, length;
11395    int kind;
11396    void *data;
11397
11398    if (PyUnicode_READY(self) == -1)
11399        return NULL;
11400    length = PyUnicode_GET_LENGTH(self);
11401    kind = PyUnicode_KIND(self);
11402    data = PyUnicode_DATA(self);
11403
11404    /* Shortcut for single character strings */
11405    if (length == 1) {
11406        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11407        return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11408    }
11409
11410    /* Special case for empty strings */
11411    if (length == 0)
11412        return PyBool_FromLong(0);
11413
11414    for (i = 0; i < length; i++) {
11415        if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
11416            return PyBool_FromLong(0);
11417    }
11418    return PyBool_FromLong(1);
11419}
11420
11421PyDoc_STRVAR(isnumeric__doc__,
11422             "S.isnumeric() -> bool\n\
11423\n\
11424Return True if there are only numeric characters in S,\n\
11425False otherwise.");
11426
11427static PyObject*
11428unicode_isnumeric(PyObject *self)
11429{
11430    Py_ssize_t i, length;
11431    int kind;
11432    void *data;
11433
11434    if (PyUnicode_READY(self) == -1)
11435        return NULL;
11436    length = PyUnicode_GET_LENGTH(self);
11437    kind = PyUnicode_KIND(self);
11438    data = PyUnicode_DATA(self);
11439
11440    /* Shortcut for single character strings */
11441    if (length == 1)
11442        return PyBool_FromLong(
11443            Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
11444
11445    /* Special case for empty strings */
11446    if (length == 0)
11447        return PyBool_FromLong(0);
11448
11449    for (i = 0; i < length; i++) {
11450        if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
11451            return PyBool_FromLong(0);
11452    }
11453    return PyBool_FromLong(1);
11454}
11455
11456int
11457PyUnicode_IsIdentifier(PyObject *self)
11458{
11459    int kind;
11460    void *data;
11461    Py_ssize_t i;
11462    Py_UCS4 first;
11463
11464    if (PyUnicode_READY(self) == -1) {
11465        Py_FatalError("identifier not ready");
11466        return 0;
11467    }
11468
11469    /* Special case for empty strings */
11470    if (PyUnicode_GET_LENGTH(self) == 0)
11471        return 0;
11472    kind = PyUnicode_KIND(self);
11473    data = PyUnicode_DATA(self);
11474
11475    /* PEP 3131 says that the first character must be in
11476       XID_Start and subsequent characters in XID_Continue,
11477       and for the ASCII range, the 2.x rules apply (i.e
11478       start with letters and underscore, continue with
11479       letters, digits, underscore). However, given the current
11480       definition of XID_Start and XID_Continue, it is sufficient
11481       to check just for these, except that _ must be allowed
11482       as starting an identifier.  */
11483    first = PyUnicode_READ(kind, data, 0);
11484    if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
11485        return 0;
11486
11487    for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
11488        if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
11489            return 0;
11490    return 1;
11491}
11492
11493PyDoc_STRVAR(isidentifier__doc__,
11494             "S.isidentifier() -> bool\n\
11495\n\
11496Return True if S is a valid identifier according\n\
11497to the language definition.");
11498
11499static PyObject*
11500unicode_isidentifier(PyObject *self)
11501{
11502    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11503}
11504
11505PyDoc_STRVAR(isprintable__doc__,
11506             "S.isprintable() -> bool\n\
11507\n\
11508Return True if all characters in S are considered\n\
11509printable in repr() or S is empty, False otherwise.");
11510
11511static PyObject*
11512unicode_isprintable(PyObject *self)
11513{
11514    Py_ssize_t i, length;
11515    int kind;
11516    void *data;
11517
11518    if (PyUnicode_READY(self) == -1)
11519        return NULL;
11520    length = PyUnicode_GET_LENGTH(self);
11521    kind = PyUnicode_KIND(self);
11522    data = PyUnicode_DATA(self);
11523
11524    /* Shortcut for single character strings */
11525    if (length == 1)
11526        return PyBool_FromLong(
11527            Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
11528
11529    for (i = 0; i < length; i++) {
11530        if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
11531            Py_RETURN_FALSE;
11532        }
11533    }
11534    Py_RETURN_TRUE;
11535}
11536
11537PyDoc_STRVAR(join__doc__,
11538             "S.join(iterable) -> str\n\
11539\n\
11540Return a string which is the concatenation of the strings in the\n\
11541iterable.  The separator between elements is S.");
11542
11543static PyObject*
11544unicode_join(PyObject *self, PyObject *data)
11545{
11546    return PyUnicode_Join(self, data);
11547}
11548
11549static Py_ssize_t
11550unicode_length(PyObject *self)
11551{
11552    if (PyUnicode_READY(self) == -1)
11553        return -1;
11554    return PyUnicode_GET_LENGTH(self);
11555}
11556
11557PyDoc_STRVAR(ljust__doc__,
11558             "S.ljust(width[, fillchar]) -> str\n\
11559\n\
11560Return S left-justified in a Unicode string of length width. Padding is\n\
11561done using the specified fill character (default is a space).");
11562
11563static PyObject *
11564unicode_ljust(PyObject *self, PyObject *args)
11565{
11566    Py_ssize_t width;
11567    Py_UCS4 fillchar = ' ';
11568
11569    if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
11570        return NULL;
11571
11572    if (PyUnicode_READY(self) == -1)
11573        return NULL;
11574
11575    if (PyUnicode_GET_LENGTH(self) >= width)
11576        return unicode_result_unchanged(self);
11577
11578    return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
11579}
11580
11581PyDoc_STRVAR(lower__doc__,
11582             "S.lower() -> str\n\
11583\n\
11584Return a copy of the string S converted to lowercase.");
11585
11586static PyObject*
11587unicode_lower(PyObject *self)
11588{
11589    if (PyUnicode_READY(self) == -1)
11590        return NULL;
11591    if (PyUnicode_IS_ASCII(self))
11592        return ascii_upper_or_lower(self, 1);
11593    return case_operation(self, do_lower);
11594}
11595
/* Strip modes; each value doubles as an index into stripformat[] */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* One argument format string per strip mode, in the order above */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: the format string minus its
   3-character "|O:" prefix */
#define STRIPNAME(i) (stripformat[(i)] + 3)
11604
11605/* externally visible for str.strip(unicode) */
11606PyObject *
11607_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
11608{
11609    void *data;
11610    int kind;
11611    Py_ssize_t i, j, len;
11612    BLOOM_MASK sepmask;
11613
11614    if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11615        return NULL;
11616
11617    kind = PyUnicode_KIND(self);
11618    data = PyUnicode_DATA(self);
11619    len = PyUnicode_GET_LENGTH(self);
11620    sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11621                              PyUnicode_DATA(sepobj),
11622                              PyUnicode_GET_LENGTH(sepobj));
11623
11624    i = 0;
11625    if (striptype != RIGHTSTRIP) {
11626        while (i < len &&
11627               BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
11628            i++;
11629        }
11630    }
11631
11632    j = len;
11633    if (striptype != LEFTSTRIP) {
11634        do {
11635            j--;
11636        } while (j >= i &&
11637                 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
11638        j++;
11639    }
11640
11641    return PyUnicode_Substring(self, i, j);
11642}
11643
11644PyObject*
11645PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11646{
11647    unsigned char *data;
11648    int kind;
11649    Py_ssize_t length;
11650
11651    if (PyUnicode_READY(self) == -1)
11652        return NULL;
11653
11654    length = PyUnicode_GET_LENGTH(self);
11655    end = Py_MIN(end, length);
11656
11657    if (start == 0 && end == length)
11658        return unicode_result_unchanged(self);
11659
11660    if (start < 0 || end < 0) {
11661        PyErr_SetString(PyExc_IndexError, "string index out of range");
11662        return NULL;
11663    }
11664    if (start >= length || end < start) {
11665        Py_INCREF(unicode_empty);
11666        return unicode_empty;
11667    }
11668
11669    length = end - start;
11670    if (PyUnicode_IS_ASCII(self)) {
11671        data = PyUnicode_1BYTE_DATA(self);
11672        return _PyUnicode_FromASCII((char*)(data + start), length);
11673    }
11674    else {
11675        kind = PyUnicode_KIND(self);
11676        data = PyUnicode_1BYTE_DATA(self);
11677        return PyUnicode_FromKindAndData(kind,
11678                                         data + kind * start,
11679                                         length);
11680    }
11681}
11682
11683static PyObject *
11684do_strip(PyObject *self, int striptype)
11685{
11686    int kind;
11687    void *data;
11688    Py_ssize_t len, i, j;
11689
11690    if (PyUnicode_READY(self) == -1)
11691        return NULL;
11692
11693    kind = PyUnicode_KIND(self);
11694    data = PyUnicode_DATA(self);
11695    len = PyUnicode_GET_LENGTH(self);
11696
11697    i = 0;
11698    if (striptype != RIGHTSTRIP) {
11699        while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
11700            i++;
11701        }
11702    }
11703
11704    j = len;
11705    if (striptype != LEFTSTRIP) {
11706        do {
11707            j--;
11708        } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
11709        j++;
11710    }
11711
11712    return PyUnicode_Substring(self, i, j);
11713}
11714
11715
11716static PyObject *
11717do_argstrip(PyObject *self, int striptype, PyObject *args)
11718{
11719    PyObject *sep = NULL;
11720
11721    if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11722        return NULL;
11723
11724    if (sep != NULL && sep != Py_None) {
11725        if (PyUnicode_Check(sep))
11726            return _PyUnicode_XStrip(self, striptype, sep);
11727        else {
11728            PyErr_Format(PyExc_TypeError,
11729                         "%s arg must be None or str",
11730                         STRIPNAME(striptype));
11731            return NULL;
11732        }
11733    }
11734
11735    return do_strip(self, striptype);
11736}
11737
11738
11739PyDoc_STRVAR(strip__doc__,
11740             "S.strip([chars]) -> str\n\
11741\n\
11742Return a copy of the string S with leading and trailing\n\
11743whitespace removed.\n\
11744If chars is given and not None, remove characters in chars instead.");
11745
11746static PyObject *
11747unicode_strip(PyObject *self, PyObject *args)
11748{
11749    if (PyTuple_GET_SIZE(args) == 0)
11750        return do_strip(self, BOTHSTRIP); /* Common case */
11751    else
11752        return do_argstrip(self, BOTHSTRIP, args);
11753}
11754
11755
11756PyDoc_STRVAR(lstrip__doc__,
11757             "S.lstrip([chars]) -> str\n\
11758\n\
11759Return a copy of the string S with leading whitespace removed.\n\
11760If chars is given and not None, remove characters in chars instead.");
11761
11762static PyObject *
11763unicode_lstrip(PyObject *self, PyObject *args)
11764{
11765    if (PyTuple_GET_SIZE(args) == 0)
11766        return do_strip(self, LEFTSTRIP); /* Common case */
11767    else
11768        return do_argstrip(self, LEFTSTRIP, args);
11769}
11770
11771
11772PyDoc_STRVAR(rstrip__doc__,
11773             "S.rstrip([chars]) -> str\n\
11774\n\
11775Return a copy of the string S with trailing whitespace removed.\n\
11776If chars is given and not None, remove characters in chars instead.");
11777
11778static PyObject *
11779unicode_rstrip(PyObject *self, PyObject *args)
11780{
11781    if (PyTuple_GET_SIZE(args) == 0)
11782        return do_strip(self, RIGHTSTRIP); /* Common case */
11783    else
11784        return do_argstrip(self, RIGHTSTRIP, args);
11785}
11786
11787
11788static PyObject*
11789unicode_repeat(PyObject *str, Py_ssize_t len)
11790{
11791    PyObject *u;
11792    Py_ssize_t nchars, n;
11793
11794    if (len < 1) {
11795        Py_INCREF(unicode_empty);
11796        return unicode_empty;
11797    }
11798
11799    /* no repeat, return original string */
11800    if (len == 1)
11801        return unicode_result_unchanged(str);
11802
11803    if (PyUnicode_READY(str) == -1)
11804        return NULL;
11805
11806    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
11807        PyErr_SetString(PyExc_OverflowError,
11808                        "repeated string is too long");
11809        return NULL;
11810    }
11811    nchars = len * PyUnicode_GET_LENGTH(str);
11812
11813    u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
11814    if (!u)
11815        return NULL;
11816    assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
11817
11818    if (PyUnicode_GET_LENGTH(str) == 1) {
11819        const int kind = PyUnicode_KIND(str);
11820        const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11821        if (kind == PyUnicode_1BYTE_KIND) {
11822            void *to = PyUnicode_DATA(u);
11823            memset(to, (unsigned char)fill_char, len);
11824        }
11825        else if (kind == PyUnicode_2BYTE_KIND) {
11826            Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
11827            for (n = 0; n < len; ++n)
11828                ucs2[n] = fill_char;
11829        } else {
11830            Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
11831            assert(kind == PyUnicode_4BYTE_KIND);
11832            for (n = 0; n < len; ++n)
11833                ucs4[n] = fill_char;
11834        }
11835    }
11836    else {
11837        /* number of characters copied this far */
11838        Py_ssize_t done = PyUnicode_GET_LENGTH(str);
11839        const Py_ssize_t char_size = PyUnicode_KIND(str);
11840        char *to = (char *) PyUnicode_DATA(u);
11841        Py_MEMCPY(to, PyUnicode_DATA(str),
11842                  PyUnicode_GET_LENGTH(str) * char_size);
11843        while (done < nchars) {
11844            n = (done <= nchars-done) ? done : nchars-done;
11845            Py_MEMCPY(to + (done * char_size), to, n * char_size);
11846            done += n;
11847        }
11848    }
11849
11850    assert(_PyUnicode_CheckConsistency(u, 1));
11851    return u;
11852}
11853
11854PyObject *
11855PyUnicode_Replace(PyObject *obj,
11856                  PyObject *subobj,
11857                  PyObject *replobj,
11858                  Py_ssize_t maxcount)
11859{
11860    PyObject *self;
11861    PyObject *str1;
11862    PyObject *str2;
11863    PyObject *result;
11864
11865    self = PyUnicode_FromObject(obj);
11866    if (self == NULL)
11867        return NULL;
11868    str1 = PyUnicode_FromObject(subobj);
11869    if (str1 == NULL) {
11870        Py_DECREF(self);
11871        return NULL;
11872    }
11873    str2 = PyUnicode_FromObject(replobj);
11874    if (str2 == NULL) {
11875        Py_DECREF(self);
11876        Py_DECREF(str1);
11877        return NULL;
11878    }
11879    if (PyUnicode_READY(self) == -1 ||
11880        PyUnicode_READY(str1) == -1 ||
11881        PyUnicode_READY(str2) == -1)
11882        result = NULL;
11883    else
11884        result = replace(self, str1, str2, maxcount);
11885    Py_DECREF(self);
11886    Py_DECREF(str1);
11887    Py_DECREF(str2);
11888    return result;
11889}
11890
11891PyDoc_STRVAR(replace__doc__,
11892             "S.replace(old, new[, count]) -> str\n\
11893\n\
11894Return a copy of S with all occurrences of substring\n\
11895old replaced by new.  If the optional argument count is\n\
11896given, only the first count occurrences are replaced.");
11897
11898static PyObject*
11899unicode_replace(PyObject *self, PyObject *args)
11900{
11901    PyObject *str1;
11902    PyObject *str2;
11903    Py_ssize_t maxcount = -1;
11904    PyObject *result;
11905
11906    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
11907        return NULL;
11908    if (PyUnicode_READY(self) == -1)
11909        return NULL;
11910    str1 = PyUnicode_FromObject(str1);
11911    if (str1 == NULL)
11912        return NULL;
11913    str2 = PyUnicode_FromObject(str2);
11914    if (str2 == NULL) {
11915        Py_DECREF(str1);
11916        return NULL;
11917    }
11918    if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
11919        result = NULL;
11920    else
11921        result = replace(self, str1, str2, maxcount);
11922
11923    Py_DECREF(str1);
11924    Py_DECREF(str2);
11925    return result;
11926}
11927
11928static PyObject *
11929unicode_repr(PyObject *unicode)
11930{
11931    PyObject *repr;
11932    Py_ssize_t isize;
11933    Py_ssize_t osize, squote, dquote, i, o;
11934    Py_UCS4 max, quote;
11935    int ikind, okind;
11936    void *idata, *odata;
11937
11938    if (PyUnicode_READY(unicode) == -1)
11939        return NULL;
11940
11941    isize = PyUnicode_GET_LENGTH(unicode);
11942    idata = PyUnicode_DATA(unicode);
11943
11944    /* Compute length of output, quote characters, and
11945       maximum character */
11946    osize = 2; /* quotes */
11947    max = 127;
11948    squote = dquote = 0;
11949    ikind = PyUnicode_KIND(unicode);
11950    for (i = 0; i < isize; i++) {
11951        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11952        switch (ch) {
11953        case '\'': squote++; osize++; break;
11954        case '"':  dquote++; osize++; break;
11955        case '\\': case '\t': case '\r': case '\n':
11956            osize += 2; break;
11957        default:
11958            /* Fast-path ASCII */
11959            if (ch < ' ' || ch == 0x7f)
11960                osize += 4; /* \xHH */
11961            else if (ch < 0x7f)
11962                osize++;
11963            else if (Py_UNICODE_ISPRINTABLE(ch)) {
11964                osize++;
11965                max = ch > max ? ch : max;
11966            }
11967            else if (ch < 0x100)
11968                osize += 4; /* \xHH */
11969            else if (ch < 0x10000)
11970                osize += 6; /* \uHHHH */
11971            else
11972                osize += 10; /* \uHHHHHHHH */
11973        }
11974    }
11975
11976    quote = '\'';
11977    if (squote) {
11978        if (dquote)
11979            /* Both squote and dquote present. Use squote,
11980               and escape them */
11981            osize += squote;
11982        else
11983            quote = '"';
11984    }
11985
11986    repr = PyUnicode_New(osize, max);
11987    if (repr == NULL)
11988        return NULL;
11989    okind = PyUnicode_KIND(repr);
11990    odata = PyUnicode_DATA(repr);
11991
11992    PyUnicode_WRITE(okind, odata, 0, quote);
11993    PyUnicode_WRITE(okind, odata, osize-1, quote);
11994
11995    for (i = 0, o = 1; i < isize; i++) {
11996        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11997
11998        /* Escape quotes and backslashes */
11999        if ((ch == quote) || (ch == '\\')) {
12000            PyUnicode_WRITE(okind, odata, o++, '\\');
12001            PyUnicode_WRITE(okind, odata, o++, ch);
12002            continue;
12003        }
12004
12005        /* Map special whitespace to '\t', \n', '\r' */
12006        if (ch == '\t') {
12007            PyUnicode_WRITE(okind, odata, o++, '\\');
12008            PyUnicode_WRITE(okind, odata, o++, 't');
12009        }
12010        else if (ch == '\n') {
12011            PyUnicode_WRITE(okind, odata, o++, '\\');
12012            PyUnicode_WRITE(okind, odata, o++, 'n');
12013        }
12014        else if (ch == '\r') {
12015            PyUnicode_WRITE(okind, odata, o++, '\\');
12016            PyUnicode_WRITE(okind, odata, o++, 'r');
12017        }
12018
12019        /* Map non-printable US ASCII to '\xhh' */
12020        else if (ch < ' ' || ch == 0x7F) {
12021            PyUnicode_WRITE(okind, odata, o++, '\\');
12022            PyUnicode_WRITE(okind, odata, o++, 'x');
12023            PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12024            PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12025        }
12026
12027        /* Copy ASCII characters as-is */
12028        else if (ch < 0x7F) {
12029            PyUnicode_WRITE(okind, odata, o++, ch);
12030        }
12031
12032        /* Non-ASCII characters */
12033        else {
12034            /* Map Unicode whitespace and control characters
12035               (categories Z* and C* except ASCII space)
12036            */
12037            if (!Py_UNICODE_ISPRINTABLE(ch)) {
12038                PyUnicode_WRITE(okind, odata, o++, '\\');
12039                /* Map 8-bit characters to '\xhh' */
12040                if (ch <= 0xff) {
12041                    PyUnicode_WRITE(okind, odata, o++, 'x');
12042                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12043                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12044                }
12045                /* Map 16-bit characters to '\uxxxx' */
12046                else if (ch <= 0xffff) {
12047                    PyUnicode_WRITE(okind, odata, o++, 'u');
12048                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12049                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12050                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12051                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12052                }
12053                /* Map 21-bit characters to '\U00xxxxxx' */
12054                else {
12055                    PyUnicode_WRITE(okind, odata, o++, 'U');
12056                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12057                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12058                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12059                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12060                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12061                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12062                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12063                    PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12064                }
12065            }
12066            /* Copy characters as-is */
12067            else {
12068                PyUnicode_WRITE(okind, odata, o++, ch);
12069            }
12070        }
12071    }
12072    /* Closing quote already added at the beginning */
12073    assert(_PyUnicode_CheckConsistency(repr, 1));
12074    return repr;
12075}
12076
12077PyDoc_STRVAR(rfind__doc__,
12078             "S.rfind(sub[, start[, end]]) -> int\n\
12079\n\
12080Return the highest index in S where substring sub is found,\n\
12081such that sub is contained within S[start:end].  Optional\n\
12082arguments start and end are interpreted as in slice notation.\n\
12083\n\
12084Return -1 on failure.");
12085
12086static PyObject *
12087unicode_rfind(PyObject *self, PyObject *args)
12088{
12089    PyObject *substring;
12090    Py_ssize_t start;
12091    Py_ssize_t end;
12092    Py_ssize_t result;
12093
12094    if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12095                                            &start, &end))
12096        return NULL;
12097
12098    if (PyUnicode_READY(self) == -1)
12099        return NULL;
12100    if (PyUnicode_READY(substring) == -1)
12101        return NULL;
12102
12103    result = any_find_slice(-1, self, substring, start, end);
12104
12105    Py_DECREF(substring);
12106
12107    if (result == -2)
12108        return NULL;
12109
12110    return PyLong_FromSsize_t(result);
12111}
12112
12113PyDoc_STRVAR(rindex__doc__,
12114             "S.rindex(sub[, start[, end]]) -> int\n\
12115\n\
12116Like S.rfind() but raise ValueError when the substring is not found.");
12117
12118static PyObject *
12119unicode_rindex(PyObject *self, PyObject *args)
12120{
12121    PyObject *substring;
12122    Py_ssize_t start;
12123    Py_ssize_t end;
12124    Py_ssize_t result;
12125
12126    if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12127                                            &start, &end))
12128        return NULL;
12129
12130    if (PyUnicode_READY(self) == -1)
12131        return NULL;
12132    if (PyUnicode_READY(substring) == -1)
12133        return NULL;
12134
12135    result = any_find_slice(-1, self, substring, start, end);
12136
12137    Py_DECREF(substring);
12138
12139    if (result == -2)
12140        return NULL;
12141
12142    if (result < 0) {
12143        PyErr_SetString(PyExc_ValueError, "substring not found");
12144        return NULL;
12145    }
12146
12147    return PyLong_FromSsize_t(result);
12148}
12149
12150PyDoc_STRVAR(rjust__doc__,
12151             "S.rjust(width[, fillchar]) -> str\n\
12152\n\
12153Return S right-justified in a string of length width. Padding is\n\
12154done using the specified fill character (default is a space).");
12155
12156static PyObject *
12157unicode_rjust(PyObject *self, PyObject *args)
12158{
12159    Py_ssize_t width;
12160    Py_UCS4 fillchar = ' ';
12161
12162    if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
12163        return NULL;
12164
12165    if (PyUnicode_READY(self) == -1)
12166        return NULL;
12167
12168    if (PyUnicode_GET_LENGTH(self) >= width)
12169        return unicode_result_unchanged(self);
12170
12171    return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12172}
12173
12174PyObject *
12175PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12176{
12177    PyObject *result;
12178
12179    s = PyUnicode_FromObject(s);
12180    if (s == NULL)
12181        return NULL;
12182    if (sep != NULL) {
12183        sep = PyUnicode_FromObject(sep);
12184        if (sep == NULL) {
12185            Py_DECREF(s);
12186            return NULL;
12187        }
12188    }
12189
12190    result = split(s, sep, maxsplit);
12191
12192    Py_DECREF(s);
12193    Py_XDECREF(sep);
12194    return result;
12195}
12196
12197PyDoc_STRVAR(split__doc__,
12198             "S.split(sep=None, maxsplit=-1) -> list of strings\n\
12199\n\
12200Return a list of the words in S, using sep as the\n\
12201delimiter string.  If maxsplit is given, at most maxsplit\n\
12202splits are done. If sep is not specified or is None, any\n\
12203whitespace string is a separator and empty strings are\n\
12204removed from the result.");
12205
12206static PyObject*
12207unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
12208{
12209    static char *kwlist[] = {"sep", "maxsplit", 0};
12210    PyObject *substring = Py_None;
12211    Py_ssize_t maxcount = -1;
12212
12213    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12214                                     kwlist, &substring, &maxcount))
12215        return NULL;
12216
12217    if (substring == Py_None)
12218        return split(self, NULL, maxcount);
12219    else if (PyUnicode_Check(substring))
12220        return split(self, substring, maxcount);
12221    else
12222        return PyUnicode_Split(self, substring, maxcount);
12223}
12224
12225PyObject *
12226PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12227{
12228    PyObject* str_obj;
12229    PyObject* sep_obj;
12230    PyObject* out;
12231    int kind1, kind2, kind;
12232    void *buf1 = NULL, *buf2 = NULL;
12233    Py_ssize_t len1, len2;
12234
12235    str_obj = PyUnicode_FromObject(str_in);
12236    if (!str_obj)
12237        return NULL;
12238    sep_obj = PyUnicode_FromObject(sep_in);
12239    if (!sep_obj) {
12240        Py_DECREF(str_obj);
12241        return NULL;
12242    }
12243    if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12244        Py_DECREF(sep_obj);
12245        Py_DECREF(str_obj);
12246        return NULL;
12247    }
12248
12249    kind1 = PyUnicode_KIND(str_obj);
12250    kind2 = PyUnicode_KIND(sep_obj);
12251    kind = Py_MAX(kind1, kind2);
12252    buf1 = PyUnicode_DATA(str_obj);
12253    if (kind1 != kind)
12254        buf1 = _PyUnicode_AsKind(str_obj, kind);
12255    if (!buf1)
12256        goto onError;
12257    buf2 = PyUnicode_DATA(sep_obj);
12258    if (kind2 != kind)
12259        buf2 = _PyUnicode_AsKind(sep_obj, kind);
12260    if (!buf2)
12261        goto onError;
12262    len1 = PyUnicode_GET_LENGTH(str_obj);
12263    len2 = PyUnicode_GET_LENGTH(sep_obj);
12264
12265    switch (PyUnicode_KIND(str_obj)) {
12266    case PyUnicode_1BYTE_KIND:
12267        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12268            out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12269        else
12270            out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12271        break;
12272    case PyUnicode_2BYTE_KIND:
12273        out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12274        break;
12275    case PyUnicode_4BYTE_KIND:
12276        out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12277        break;
12278    default:
12279        assert(0);
12280        out = 0;
12281    }
12282
12283    Py_DECREF(sep_obj);
12284    Py_DECREF(str_obj);
12285    if (kind1 != kind)
12286        PyMem_Free(buf1);
12287    if (kind2 != kind)
12288        PyMem_Free(buf2);
12289
12290    return out;
12291  onError:
12292    Py_DECREF(sep_obj);
12293    Py_DECREF(str_obj);
12294    if (kind1 != kind && buf1)
12295        PyMem_Free(buf1);
12296    if (kind2 != kind && buf2)
12297        PyMem_Free(buf2);
12298    return NULL;
12299}
12300
12301
12302PyObject *
12303PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12304{
12305    PyObject* str_obj;
12306    PyObject* sep_obj;
12307    PyObject* out;
12308    int kind1, kind2, kind;
12309    void *buf1 = NULL, *buf2 = NULL;
12310    Py_ssize_t len1, len2;
12311
12312    str_obj = PyUnicode_FromObject(str_in);
12313    if (!str_obj)
12314        return NULL;
12315    sep_obj = PyUnicode_FromObject(sep_in);
12316    if (!sep_obj) {
12317        Py_DECREF(str_obj);
12318        return NULL;
12319    }
12320
12321    kind1 = PyUnicode_KIND(str_in);
12322    kind2 = PyUnicode_KIND(sep_obj);
12323    kind = Py_MAX(kind1, kind2);
12324    buf1 = PyUnicode_DATA(str_in);
12325    if (kind1 != kind)
12326        buf1 = _PyUnicode_AsKind(str_in, kind);
12327    if (!buf1)
12328        goto onError;
12329    buf2 = PyUnicode_DATA(sep_obj);
12330    if (kind2 != kind)
12331        buf2 = _PyUnicode_AsKind(sep_obj, kind);
12332    if (!buf2)
12333        goto onError;
12334    len1 = PyUnicode_GET_LENGTH(str_obj);
12335    len2 = PyUnicode_GET_LENGTH(sep_obj);
12336
12337    switch (PyUnicode_KIND(str_in)) {
12338    case PyUnicode_1BYTE_KIND:
12339        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12340            out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12341        else
12342            out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12343        break;
12344    case PyUnicode_2BYTE_KIND:
12345        out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12346        break;
12347    case PyUnicode_4BYTE_KIND:
12348        out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12349        break;
12350    default:
12351        assert(0);
12352        out = 0;
12353    }
12354
12355    Py_DECREF(sep_obj);
12356    Py_DECREF(str_obj);
12357    if (kind1 != kind)
12358        PyMem_Free(buf1);
12359    if (kind2 != kind)
12360        PyMem_Free(buf2);
12361
12362    return out;
12363  onError:
12364    Py_DECREF(sep_obj);
12365    Py_DECREF(str_obj);
12366    if (kind1 != kind && buf1)
12367        PyMem_Free(buf1);
12368    if (kind2 != kind && buf2)
12369        PyMem_Free(buf2);
12370    return NULL;
12371}
12372
12373PyDoc_STRVAR(partition__doc__,
12374             "S.partition(sep) -> (head, sep, tail)\n\
12375\n\
12376Search for the separator sep in S, and return the part before it,\n\
12377the separator itself, and the part after it.  If the separator is not\n\
12378found, return S and two empty strings.");
12379
12380static PyObject*
12381unicode_partition(PyObject *self, PyObject *separator)
12382{
12383    return PyUnicode_Partition(self, separator);
12384}
12385
12386PyDoc_STRVAR(rpartition__doc__,
12387             "S.rpartition(sep) -> (head, sep, tail)\n\
12388\n\
12389Search for the separator sep in S, starting at the end of S, and return\n\
12390the part before it, the separator itself, and the part after it.  If the\n\
12391separator is not found, return two empty strings and S.");
12392
12393static PyObject*
12394unicode_rpartition(PyObject *self, PyObject *separator)
12395{
12396    return PyUnicode_RPartition(self, separator);
12397}
12398
12399PyObject *
12400PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12401{
12402    PyObject *result;
12403
12404    s = PyUnicode_FromObject(s);
12405    if (s == NULL)
12406        return NULL;
12407    if (sep != NULL) {
12408        sep = PyUnicode_FromObject(sep);
12409        if (sep == NULL) {
12410            Py_DECREF(s);
12411            return NULL;
12412        }
12413    }
12414
12415    result = rsplit(s, sep, maxsplit);
12416
12417    Py_DECREF(s);
12418    Py_XDECREF(sep);
12419    return result;
12420}
12421
12422PyDoc_STRVAR(rsplit__doc__,
12423             "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
12424\n\
12425Return a list of the words in S, using sep as the\n\
12426delimiter string, starting at the end of the string and\n\
12427working to the front.  If maxsplit is given, at most maxsplit\n\
12428splits are done. If sep is not specified, any whitespace string\n\
12429is a separator.");
12430
12431static PyObject*
12432unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
12433{
12434    static char *kwlist[] = {"sep", "maxsplit", 0};
12435    PyObject *substring = Py_None;
12436    Py_ssize_t maxcount = -1;
12437
12438    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12439                                     kwlist, &substring, &maxcount))
12440        return NULL;
12441
12442    if (substring == Py_None)
12443        return rsplit(self, NULL, maxcount);
12444    else if (PyUnicode_Check(substring))
12445        return rsplit(self, substring, maxcount);
12446    else
12447        return PyUnicode_RSplit(self, substring, maxcount);
12448}
12449
12450PyDoc_STRVAR(splitlines__doc__,
12451             "S.splitlines([keepends]) -> list of strings\n\
12452\n\
12453Return a list of the lines in S, breaking at line boundaries.\n\
12454Line breaks are not included in the resulting list unless keepends\n\
12455is given and true.");
12456
12457static PyObject*
12458unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
12459{
12460    static char *kwlist[] = {"keepends", 0};
12461    int keepends = 0;
12462
12463    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12464                                     kwlist, &keepends))
12465        return NULL;
12466
12467    return PyUnicode_Splitlines(self, keepends);
12468}
12469
12470static
12471PyObject *unicode_str(PyObject *self)
12472{
12473    return unicode_result_unchanged(self);
12474}
12475
12476PyDoc_STRVAR(swapcase__doc__,
12477             "S.swapcase() -> str\n\
12478\n\
12479Return a copy of S with uppercase characters converted to lowercase\n\
12480and vice versa.");
12481
12482static PyObject*
12483unicode_swapcase(PyObject *self)
12484{
12485    if (PyUnicode_READY(self) == -1)
12486        return NULL;
12487    return case_operation(self, do_swapcase);
12488}
12489
12490PyDoc_STRVAR(maketrans__doc__,
12491             "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
12492\n\
12493Return a translation table usable for str.translate().\n\
12494If there is only one argument, it must be a dictionary mapping Unicode\n\
12495ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
12496Character keys will be then converted to ordinals.\n\
12497If there are two arguments, they must be strings of equal length, and\n\
12498in the resulting dictionary, each character in x will be mapped to the\n\
12499character at the same position in y. If there is a third argument, it\n\
12500must be a string, whose characters will be mapped to None in the result.");
12501
12502static PyObject*
12503unicode_maketrans(PyObject *null, PyObject *args)
12504{
12505    PyObject *x, *y = NULL, *z = NULL;
12506    PyObject *new = NULL, *key, *value;
12507    Py_ssize_t i = 0;
12508    int res;
12509
12510    if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12511        return NULL;
12512    new = PyDict_New();
12513    if (!new)
12514        return NULL;
12515    if (y != NULL) {
12516        int x_kind, y_kind, z_kind;
12517        void *x_data, *y_data, *z_data;
12518
12519        /* x must be a string too, of equal length */
12520        if (!PyUnicode_Check(x)) {
12521            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12522                            "be a string if there is a second argument");
12523            goto err;
12524        }
12525        if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
12526            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12527                            "arguments must have equal length");
12528            goto err;
12529        }
12530        /* create entries for translating chars in x to those in y */
12531        x_kind = PyUnicode_KIND(x);
12532        y_kind = PyUnicode_KIND(y);
12533        x_data = PyUnicode_DATA(x);
12534        y_data = PyUnicode_DATA(y);
12535        for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12536            key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12537            if (!key)
12538                goto err;
12539            value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
12540            if (!value) {
12541                Py_DECREF(key);
12542                goto err;
12543            }
12544            res = PyDict_SetItem(new, key, value);
12545            Py_DECREF(key);
12546            Py_DECREF(value);
12547            if (res < 0)
12548                goto err;
12549        }
12550        /* create entries for deleting chars in z */
12551        if (z != NULL) {
12552            z_kind = PyUnicode_KIND(z);
12553            z_data = PyUnicode_DATA(z);
12554            for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
12555                key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
12556                if (!key)
12557                    goto err;
12558                res = PyDict_SetItem(new, key, Py_None);
12559                Py_DECREF(key);
12560                if (res < 0)
12561                    goto err;
12562            }
12563        }
12564    } else {
12565        int kind;
12566        void *data;
12567
12568        /* x must be a dict */
12569        if (!PyDict_CheckExact(x)) {
12570            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12571                            "to maketrans it must be a dict");
12572            goto err;
12573        }
12574        /* copy entries into the new dict, converting string keys to int keys */
12575        while (PyDict_Next(x, &i, &key, &value)) {
12576            if (PyUnicode_Check(key)) {
12577                /* convert string keys to integer keys */
12578                PyObject *newkey;
12579                if (PyUnicode_GET_LENGTH(key) != 1) {
12580                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
12581                                    "table must be of length 1");
12582                    goto err;
12583                }
12584                kind = PyUnicode_KIND(key);
12585                data = PyUnicode_DATA(key);
12586                newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
12587                if (!newkey)
12588                    goto err;
12589                res = PyDict_SetItem(new, newkey, value);
12590                Py_DECREF(newkey);
12591                if (res < 0)
12592                    goto err;
12593            } else if (PyLong_Check(key)) {
12594                /* just keep integer keys */
12595                if (PyDict_SetItem(new, key, value) < 0)
12596                    goto err;
12597            } else {
12598                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12599                                "be strings or integers");
12600                goto err;
12601            }
12602        }
12603    }
12604    return new;
12605  err:
12606    Py_DECREF(new);
12607    return NULL;
12608}
12609
12610PyDoc_STRVAR(translate__doc__,
12611             "S.translate(table) -> str\n\
12612\n\
12613Return a copy of the string S, where all characters have been mapped\n\
12614through the given translation table, which must be a mapping of\n\
12615Unicode ordinals to Unicode ordinals, strings, or None.\n\
12616Unmapped characters are left untouched. Characters mapped to None\n\
12617are deleted.");
12618
12619static PyObject*
12620unicode_translate(PyObject *self, PyObject *table)
12621{
12622    return _PyUnicode_TranslateCharmap(self, table, "ignore");
12623}
12624
12625PyDoc_STRVAR(upper__doc__,
12626             "S.upper() -> str\n\
12627\n\
12628Return a copy of S converted to uppercase.");
12629
12630static PyObject*
12631unicode_upper(PyObject *self)
12632{
12633    if (PyUnicode_READY(self) == -1)
12634        return NULL;
12635    if (PyUnicode_IS_ASCII(self))
12636        return ascii_upper_or_lower(self, 0);
12637    return case_operation(self, do_upper);
12638}
12639
12640PyDoc_STRVAR(zfill__doc__,
12641             "S.zfill(width) -> str\n\
12642\n\
12643Pad a numeric string S with zeros on the left, to fill a field\n\
12644of the specified width. The string S is never truncated.");
12645
12646static PyObject *
12647unicode_zfill(PyObject *self, PyObject *args)
12648{
12649    Py_ssize_t fill;
12650    PyObject *u;
12651    Py_ssize_t width;
12652    int kind;
12653    void *data;
12654    Py_UCS4 chr;
12655
12656    if (!PyArg_ParseTuple(args, "n:zfill", &width))
12657        return NULL;
12658
12659    if (PyUnicode_READY(self) == -1)
12660        return NULL;
12661
12662    if (PyUnicode_GET_LENGTH(self) >= width)
12663        return unicode_result_unchanged(self);
12664
12665    fill = width - PyUnicode_GET_LENGTH(self);
12666
12667    u = pad(self, fill, 0, '0');
12668
12669    if (u == NULL)
12670        return NULL;
12671
12672    kind = PyUnicode_KIND(u);
12673    data = PyUnicode_DATA(u);
12674    chr = PyUnicode_READ(kind, data, fill);
12675
12676    if (chr == '+' || chr == '-') {
12677        /* move sign to beginning of string */
12678        PyUnicode_WRITE(kind, data, 0, chr);
12679        PyUnicode_WRITE(kind, data, fill, '0');
12680    }
12681
12682    assert(_PyUnicode_CheckConsistency(u, 1));
12683    return u;
12684}
12685
#if 0
/* Compiled out.  Wrapper that would return the result of
   PyUnicode_TransformDecimalAndSpaceToASCII(self). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted;

    converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
12693
12694PyDoc_STRVAR(startswith__doc__,
12695             "S.startswith(prefix[, start[, end]]) -> bool\n\
12696\n\
12697Return True if S starts with the specified prefix, False otherwise.\n\
12698With optional start, test S beginning at that position.\n\
12699With optional end, stop comparing S at that position.\n\
12700prefix can also be a tuple of strings to try.");
12701
12702static PyObject *
12703unicode_startswith(PyObject *self,
12704                   PyObject *args)
12705{
12706    PyObject *subobj;
12707    PyObject *substring;
12708    Py_ssize_t start = 0;
12709    Py_ssize_t end = PY_SSIZE_T_MAX;
12710    int result;
12711
12712    if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
12713        return NULL;
12714    if (PyTuple_Check(subobj)) {
12715        Py_ssize_t i;
12716        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12717            substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
12718            if (substring == NULL)
12719                return NULL;
12720            result = tailmatch(self, substring, start, end, -1);
12721            Py_DECREF(substring);
12722            if (result) {
12723                Py_RETURN_TRUE;
12724            }
12725        }
12726        /* nothing matched */
12727        Py_RETURN_FALSE;
12728    }
12729    substring = PyUnicode_FromObject(subobj);
12730    if (substring == NULL) {
12731        if (PyErr_ExceptionMatches(PyExc_TypeError))
12732            PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12733                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
12734        return NULL;
12735    }
12736    result = tailmatch(self, substring, start, end, -1);
12737    Py_DECREF(substring);
12738    return PyBool_FromLong(result);
12739}
12740
12741
12742PyDoc_STRVAR(endswith__doc__,
12743             "S.endswith(suffix[, start[, end]]) -> bool\n\
12744\n\
12745Return True if S ends with the specified suffix, False otherwise.\n\
12746With optional start, test S beginning at that position.\n\
12747With optional end, stop comparing S at that position.\n\
12748suffix can also be a tuple of strings to try.");
12749
12750static PyObject *
12751unicode_endswith(PyObject *self,
12752                 PyObject *args)
12753{
12754    PyObject *subobj;
12755    PyObject *substring;
12756    Py_ssize_t start = 0;
12757    Py_ssize_t end = PY_SSIZE_T_MAX;
12758    int result;
12759
12760    if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
12761        return NULL;
12762    if (PyTuple_Check(subobj)) {
12763        Py_ssize_t i;
12764        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12765            substring = PyUnicode_FromObject(
12766                PyTuple_GET_ITEM(subobj, i));
12767            if (substring == NULL)
12768                return NULL;
12769            result = tailmatch(self, substring, start, end, +1);
12770            Py_DECREF(substring);
12771            if (result) {
12772                Py_RETURN_TRUE;
12773            }
12774        }
12775        Py_RETURN_FALSE;
12776    }
12777    substring = PyUnicode_FromObject(subobj);
12778    if (substring == NULL) {
12779        if (PyErr_ExceptionMatches(PyExc_TypeError))
12780            PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12781                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
12782        return NULL;
12783    }
12784    result = tailmatch(self, substring, start, end, +1);
12785    Py_DECREF(substring);
12786    return PyBool_FromLong(result);
12787}
12788
12789Py_LOCAL_INLINE(void)
12790_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
12791{
12792    writer->size = PyUnicode_GET_LENGTH(writer->buffer);
12793    writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
12794    writer->data = PyUnicode_DATA(writer->buffer);
12795    writer->kind = PyUnicode_KIND(writer->buffer);
12796}
12797
12798void
12799_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length)
12800{
12801    memset(writer, 0, sizeof(*writer));
12802#ifdef Py_DEBUG
12803    writer->kind = 5;    /* invalid kind */
12804#endif
12805    writer->min_length = Py_MAX(min_length, 100);
12806    writer->overallocate = (min_length > 0);
12807}
12808
12809int
12810_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
12811                                 Py_ssize_t length, Py_UCS4 maxchar)
12812{
12813    Py_ssize_t newlen;
12814    PyObject *newbuffer;
12815
12816    assert(length > 0);
12817
12818    if (length > PY_SSIZE_T_MAX - writer->pos) {
12819        PyErr_NoMemory();
12820        return -1;
12821    }
12822    newlen = writer->pos + length;
12823
12824    if (writer->buffer == NULL) {
12825        if (writer->overallocate) {
12826            /* overallocate 25% to limit the number of resize */
12827            if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12828                newlen += newlen / 4;
12829            if (newlen < writer->min_length)
12830                newlen = writer->min_length;
12831        }
12832        writer->buffer = PyUnicode_New(newlen, maxchar);
12833        if (writer->buffer == NULL)
12834            return -1;
12835        _PyUnicodeWriter_Update(writer);
12836        return 0;
12837    }
12838
12839    if (newlen > writer->size) {
12840        if (writer->overallocate) {
12841            /* overallocate 25% to limit the number of resize */
12842            if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12843                newlen += newlen / 4;
12844            if (newlen < writer->min_length)
12845                newlen = writer->min_length;
12846        }
12847
12848        if (maxchar > writer->maxchar || writer->readonly) {
12849            /* resize + widen */
12850            newbuffer = PyUnicode_New(newlen, maxchar);
12851            if (newbuffer == NULL)
12852                return -1;
12853            _PyUnicode_FastCopyCharacters(newbuffer, 0,
12854                                          writer->buffer, 0, writer->pos);
12855            Py_DECREF(writer->buffer);
12856            writer->readonly = 0;
12857        }
12858        else {
12859            newbuffer = resize_compact(writer->buffer, newlen);
12860            if (newbuffer == NULL)
12861                return -1;
12862        }
12863        writer->buffer = newbuffer;
12864        _PyUnicodeWriter_Update(writer);
12865    }
12866    else if (maxchar > writer->maxchar) {
12867        assert(!writer->readonly);
12868        newbuffer = PyUnicode_New(writer->size, maxchar);
12869        if (newbuffer == NULL)
12870            return -1;
12871        _PyUnicode_FastCopyCharacters(newbuffer, 0,
12872                                      writer->buffer, 0, writer->pos);
12873        Py_DECREF(writer->buffer);
12874        writer->buffer = newbuffer;
12875        _PyUnicodeWriter_Update(writer);
12876    }
12877    return 0;
12878}
12879
12880int
12881_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
12882{
12883    Py_UCS4 maxchar;
12884    Py_ssize_t len;
12885
12886    if (PyUnicode_READY(str) == -1)
12887        return -1;
12888    len = PyUnicode_GET_LENGTH(str);
12889    if (len == 0)
12890        return 0;
12891    maxchar = PyUnicode_MAX_CHAR_VALUE(str);
12892    if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
12893        if (writer->buffer == NULL && !writer->overallocate) {
12894            Py_INCREF(str);
12895            writer->buffer = str;
12896            _PyUnicodeWriter_Update(writer);
12897            writer->readonly = 1;
12898            writer->size = 0;
12899            writer->pos += len;
12900            return 0;
12901        }
12902        if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
12903            return -1;
12904    }
12905    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
12906                                  str, 0, len);
12907    writer->pos += len;
12908    return 0;
12909}
12910
12911PyObject *
12912_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
12913{
12914    if (writer->pos == 0) {
12915        Py_XDECREF(writer->buffer);
12916        Py_INCREF(unicode_empty);
12917        return unicode_empty;
12918    }
12919    if (writer->readonly) {
12920        assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos);
12921        return writer->buffer;
12922    }
12923    if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
12924        PyObject *newbuffer;
12925        newbuffer = resize_compact(writer->buffer, writer->pos);
12926        if (newbuffer == NULL) {
12927            Py_DECREF(writer->buffer);
12928            return NULL;
12929        }
12930        writer->buffer = newbuffer;
12931    }
12932    assert(_PyUnicode_CheckConsistency(writer->buffer, 1));
12933    return writer->buffer;
12934}
12935
12936void
12937_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
12938{
12939    Py_CLEAR(writer->buffer);
12940}
12941
12942#include "stringlib/unicode_format.h"
12943
/* Docstring text for S.format(*args, **kwargs). */
PyDoc_STRVAR(format__doc__,
             "S.format(*args, **kwargs) -> str\n\
\n\
Return a formatted version of S, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");

/* Docstring text for S.format_map(mapping). */
PyDoc_STRVAR(format_map__doc__,
             "S.format_map(mapping) -> str\n\
\n\
Return a formatted version of S, using substitutions from mapping.\n\
The substitutions are identified by braces ('{' and '}').");
12955
12956static PyObject *
12957unicode__format__(PyObject* self, PyObject* args)
12958{
12959    PyObject *format_spec;
12960    _PyUnicodeWriter writer;
12961    int ret;
12962
12963    if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12964        return NULL;
12965
12966    if (PyUnicode_READY(self) == -1)
12967        return NULL;
12968    _PyUnicodeWriter_Init(&writer, 0);
12969    ret = _PyUnicode_FormatAdvancedWriter(&writer,
12970                                          self, format_spec, 0,
12971                                          PyUnicode_GET_LENGTH(format_spec));
12972    if (ret == -1) {
12973        _PyUnicodeWriter_Dealloc(&writer);
12974        return NULL;
12975    }
12976    return _PyUnicodeWriter_Finish(&writer);
12977}
12978
12979PyDoc_STRVAR(p_format__doc__,
12980             "S.__format__(format_spec) -> str\n\
12981\n\
12982Return a formatted version of S as described by format_spec.");
12983
12984static PyObject *
12985unicode__sizeof__(PyObject *v)
12986{
12987    Py_ssize_t size;
12988
12989    /* If it's a compact object, account for base structure +
12990       character data. */
12991    if (PyUnicode_IS_COMPACT_ASCII(v))
12992        size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12993    else if (PyUnicode_IS_COMPACT(v))
12994        size = sizeof(PyCompactUnicodeObject) +
12995            (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
12996    else {
12997        /* If it is a two-block object, account for base object, and
12998           for character block if present. */
12999        size = sizeof(PyUnicodeObject);
13000        if (_PyUnicode_DATA_ANY(v))
13001            size += (PyUnicode_GET_LENGTH(v) + 1) *
13002                PyUnicode_KIND(v);
13003    }
13004    /* If the wstr pointer is present, account for it unless it is shared
13005       with the data pointer. Check if the data is not shared. */
13006    if (_PyUnicode_HAS_WSTR_MEMORY(v))
13007        size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
13008    if (_PyUnicode_HAS_UTF8_MEMORY(v))
13009        size += PyUnicode_UTF8_LENGTH(v) + 1;
13010
13011    return PyLong_FromSsize_t(size);
13012}
13013
13014PyDoc_STRVAR(sizeof__doc__,
13015             "S.__sizeof__() -> size of S in memory, in bytes");
13016
13017static PyObject *
13018unicode_getnewargs(PyObject *v)
13019{
13020    PyObject *copy = _PyUnicode_Copy(v);
13021    if (!copy)
13022        return NULL;
13023    return Py_BuildValue("(N)", copy);
13024}
13025
13026static PyMethodDef unicode_methods[] = {
13027    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
13028    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
13029    {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13030    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
13031    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13032    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
13033    {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
13034    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13035    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13036    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13037    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
13038    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
13039    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
13040    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13041    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13042    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
13043    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
13044    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13045    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13046    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
13047    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
13048    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
13049    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
13050    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
13051    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13052    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13053    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13054    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13055    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13056    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13057    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13058    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13059    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13060    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13061    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13062    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13063    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13064    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
13065    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
13066    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
13067    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
13068    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
13069    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
13070    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
13071    {"maketrans", (PyCFunction) unicode_maketrans,
13072     METH_VARARGS | METH_STATIC, maketrans__doc__},
13073    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
13074#if 0
13075    /* These methods are just used for debugging the implementation. */
13076    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
13077#endif
13078
13079    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
13080    {NULL, NULL}
13081};
13082
13083static PyObject *
13084unicode_mod(PyObject *v, PyObject *w)
13085{
13086    if (!PyUnicode_Check(v))
13087        Py_RETURN_NOTIMPLEMENTED;
13088    return PyUnicode_Format(v, w);
13089}
13090
13091static PyNumberMethods unicode_as_number = {
13092    0,              /*nb_add*/
13093    0,              /*nb_subtract*/
13094    0,              /*nb_multiply*/
13095    unicode_mod,            /*nb_remainder*/
13096};
13097
13098static PySequenceMethods unicode_as_sequence = {
13099    (lenfunc) unicode_length,       /* sq_length */
13100    PyUnicode_Concat,           /* sq_concat */
13101    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
13102    (ssizeargfunc) unicode_getitem,     /* sq_item */
13103    0,                  /* sq_slice */
13104    0,                  /* sq_ass_item */
13105    0,                  /* sq_ass_slice */
13106    PyUnicode_Contains,         /* sq_contains */
13107};
13108
13109static PyObject*
13110unicode_subscript(PyObject* self, PyObject* item)
13111{
13112    if (PyUnicode_READY(self) == -1)
13113        return NULL;
13114
13115    if (PyIndex_Check(item)) {
13116        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
13117        if (i == -1 && PyErr_Occurred())
13118            return NULL;
13119        if (i < 0)
13120            i += PyUnicode_GET_LENGTH(self);
13121        return unicode_getitem(self, i);
13122    } else if (PySlice_Check(item)) {
13123        Py_ssize_t start, stop, step, slicelength, cur, i;
13124        PyObject *result;
13125        void *src_data, *dest_data;
13126        int src_kind, dest_kind;
13127        Py_UCS4 ch, max_char, kind_limit;
13128
13129        if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
13130                                 &start, &stop, &step, &slicelength) < 0) {
13131            return NULL;
13132        }
13133
13134        if (slicelength <= 0) {
13135            Py_INCREF(unicode_empty);
13136            return unicode_empty;
13137        } else if (start == 0 && step == 1 &&
13138                   slicelength == PyUnicode_GET_LENGTH(self)) {
13139            return unicode_result_unchanged(self);
13140        } else if (step == 1) {
13141            return PyUnicode_Substring(self,
13142                                       start, start + slicelength);
13143        }
13144        /* General case */
13145        src_kind = PyUnicode_KIND(self);
13146        src_data = PyUnicode_DATA(self);
13147        if (!PyUnicode_IS_ASCII(self)) {
13148            kind_limit = kind_maxchar_limit(src_kind);
13149            max_char = 0;
13150            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13151                ch = PyUnicode_READ(src_kind, src_data, cur);
13152                if (ch > max_char) {
13153                    max_char = ch;
13154                    if (max_char >= kind_limit)
13155                        break;
13156                }
13157            }
13158        }
13159        else
13160            max_char = 127;
13161        result = PyUnicode_New(slicelength, max_char);
13162        if (result == NULL)
13163            return NULL;
13164        dest_kind = PyUnicode_KIND(result);
13165        dest_data = PyUnicode_DATA(result);
13166
13167        for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13168            Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13169            PyUnicode_WRITE(dest_kind, dest_data, i, ch);
13170        }
13171        assert(_PyUnicode_CheckConsistency(result, 1));
13172        return result;
13173    } else {
13174        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13175        return NULL;
13176    }
13177}
13178
13179static PyMappingMethods unicode_as_mapping = {
13180    (lenfunc)unicode_length,        /* mp_length */
13181    (binaryfunc)unicode_subscript,  /* mp_subscript */
13182    (objobjargproc)0,           /* mp_ass_subscript */
13183};
13184
13185
13186/* Helpers for PyUnicode_Format() */
13187
13188static PyObject *
13189getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
13190{
13191    Py_ssize_t argidx = *p_argidx;
13192    if (argidx < arglen) {
13193        (*p_argidx)++;
13194        if (arglen < 0)
13195            return args;
13196        else
13197            return PyTuple_GetItem(args, argidx);
13198    }
13199    PyErr_SetString(PyExc_TypeError,
13200                    "not enough arguments for format string");
13201    return NULL;
13202}
13203
13204/* Returns a new reference to a PyUnicode object, or NULL on failure. */
13205
13206static int
13207formatfloat(PyObject *v, int flags, int prec, int type,
13208            PyObject **p_output, _PyUnicodeWriter *writer)
13209{
13210    char *p;
13211    double x;
13212    Py_ssize_t len;
13213
13214    x = PyFloat_AsDouble(v);
13215    if (x == -1.0 && PyErr_Occurred())
13216        return -1;
13217
13218    if (prec < 0)
13219        prec = 6;
13220
13221    p = PyOS_double_to_string(x, type, prec,
13222                              (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
13223    if (p == NULL)
13224        return -1;
13225    len = strlen(p);
13226    if (writer) {
13227        if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) {
13228            PyMem_Free(p);
13229            return -1;
13230        }
13231        unicode_write_cstr(writer->buffer, writer->pos, p, len);
13232        writer->pos += len;
13233    }
13234    else
13235        *p_output = _PyUnicode_FromASCII(p, len);
13236    PyMem_Free(p);
13237    return 0;
13238}
13239
13240/* formatlong() emulates the format codes d, u, o, x and X, and
13241 * the F_ALT flag, for Python's long (unbounded) ints.  It's not used for
13242 * Python's regular ints.
13243 * Return value:  a new PyUnicodeObject*, or NULL if error.
13244 *     The output string is of the form
13245 *         "-"? ("0x" | "0X")? digit+
13246 *     "0x"/"0X" are present only for x and X conversions, with F_ALT
13247 *         set in flags.  The case of hex digits will be correct,
13248 *     There will be at least prec digits, zero-filled on the left if
13249 *         necessary to get that many.
13250 * val          object to be converted
13251 * flags        bitmask of format flags; only F_ALT is looked at
13252 * prec         minimum number of digits; 0-fill on left if needed
13253 * type         a character in [duoxX]; u acts the same as d
13254 *
13255 * CAUTION:  o, x and X conversions on regular ints can never
13256 * produce a '-' sign, but can for Python's unbounded ints.
13257 */
13258static PyObject*
13259formatlong(PyObject *val, int flags, int prec, int type)
13260{
13261    PyObject *result = NULL;
13262    char *buf;
13263    Py_ssize_t i;
13264    int sign;           /* 1 if '-', else 0 */
13265    int len;            /* number of characters */
13266    Py_ssize_t llen;
13267    int numdigits;      /* len == numnondigits + numdigits */
13268    int numnondigits = 0;
13269
13270    /* Avoid exceeding SSIZE_T_MAX */
13271    if (prec > INT_MAX-3) {
13272        PyErr_SetString(PyExc_OverflowError,
13273                        "precision too large");
13274        return NULL;
13275    }
13276
13277    assert(PyLong_Check(val));
13278
13279    switch (type) {
13280    case 'd':
13281    case 'u':
13282        /* Special-case boolean: we want 0/1 */
13283        if (PyBool_Check(val))
13284            result = PyNumber_ToBase(val, 10);
13285        else
13286            result = Py_TYPE(val)->tp_str(val);
13287        break;
13288    case 'o':
13289        numnondigits = 2;
13290        result = PyNumber_ToBase(val, 8);
13291        break;
13292    case 'x':
13293    case 'X':
13294        numnondigits = 2;
13295        result = PyNumber_ToBase(val, 16);
13296        break;
13297    default:
13298        assert(!"'type' not in [duoxX]");
13299    }
13300    if (!result)
13301        return NULL;
13302
13303    assert(unicode_modifiable(result));
13304    assert(PyUnicode_IS_READY(result));
13305    assert(PyUnicode_IS_ASCII(result));
13306
13307    /* To modify the string in-place, there can only be one reference. */
13308    if (Py_REFCNT(result) != 1) {
13309        PyErr_BadInternalCall();
13310        return NULL;
13311    }
13312    buf = PyUnicode_DATA(result);
13313    llen = PyUnicode_GET_LENGTH(result);
13314    if (llen > INT_MAX) {
13315        PyErr_SetString(PyExc_ValueError,
13316                        "string too large in _PyBytes_FormatLong");
13317        return NULL;
13318    }
13319    len = (int)llen;
13320    sign = buf[0] == '-';
13321    numnondigits += sign;
13322    numdigits = len - numnondigits;
13323    assert(numdigits > 0);
13324
13325    /* Get rid of base marker unless F_ALT */
13326    if (((flags & F_ALT) == 0 &&
13327        (type == 'o' || type == 'x' || type == 'X'))) {
13328        assert(buf[sign] == '0');
13329        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13330               buf[sign+1] == 'o');
13331        numnondigits -= 2;
13332        buf += 2;
13333        len -= 2;
13334        if (sign)
13335            buf[0] = '-';
13336        assert(len == numnondigits + numdigits);
13337        assert(numdigits > 0);
13338    }
13339
13340    /* Fill with leading zeroes to meet minimum width. */
13341    if (prec > numdigits) {
13342        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13343                                numnondigits + prec);
13344        char *b1;
13345        if (!r1) {
13346            Py_DECREF(result);
13347            return NULL;
13348        }
13349        b1 = PyBytes_AS_STRING(r1);
13350        for (i = 0; i < numnondigits; ++i)
13351            *b1++ = *buf++;
13352        for (i = 0; i < prec - numdigits; i++)
13353            *b1++ = '0';
13354        for (i = 0; i < numdigits; i++)
13355            *b1++ = *buf++;
13356        *b1 = '\0';
13357        Py_DECREF(result);
13358        result = r1;
13359        buf = PyBytes_AS_STRING(result);
13360        len = numnondigits + prec;
13361    }
13362
13363    /* Fix up case for hex conversions. */
13364    if (type == 'X') {
13365        /* Need to convert all lower case letters to upper case.
13366           and need to convert 0x to 0X (and -0x to -0X). */
13367        for (i = 0; i < len; i++)
13368            if (buf[i] >= 'a' && buf[i] <= 'x')
13369                buf[i] -= 'a'-'A';
13370    }
13371    if (!PyUnicode_Check(result) || len != PyUnicode_GET_LENGTH(result)) {
13372        PyObject *unicode;
13373        unicode = _PyUnicode_FromASCII(buf, len);
13374        Py_DECREF(result);
13375        result = unicode;
13376    }
13377    return result;
13378}
13379
13380static Py_UCS4
13381formatchar(PyObject *v)
13382{
13383    /* presume that the buffer is at least 3 characters long */
13384    if (PyUnicode_Check(v)) {
13385        if (PyUnicode_GET_LENGTH(v) == 1) {
13386            return PyUnicode_READ_CHAR(v, 0);
13387        }
13388        goto onError;
13389    }
13390    else {
13391        /* Integer input truncated to a character */
13392        long x;
13393        x = PyLong_AsLong(v);
13394        if (x == -1 && PyErr_Occurred())
13395            goto onError;
13396
13397        if (x < 0 || x > MAX_UNICODE) {
13398            PyErr_SetString(PyExc_OverflowError,
13399                            "%c arg not in range(0x110000)");
13400            return (Py_UCS4) -1;
13401        }
13402
13403        return (Py_UCS4) x;
13404    }
13405
13406  onError:
13407    PyErr_SetString(PyExc_TypeError,
13408                    "%c requires int or char");
13409    return (Py_UCS4) -1;
13410}
13411
13412PyObject *
13413PyUnicode_Format(PyObject *format, PyObject *args)
13414{
13415    Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
13416    int args_owned = 0;
13417    PyObject *dict = NULL;
13418    PyObject *temp = NULL;
13419    PyObject *second = NULL;
13420    PyObject *uformat;
13421    void *fmt;
13422    enum PyUnicode_Kind kind, fmtkind;
13423    _PyUnicodeWriter writer;
13424    Py_ssize_t sublen;
13425    Py_UCS4 maxchar;
13426
13427    if (format == NULL || args == NULL) {
13428        PyErr_BadInternalCall();
13429        return NULL;
13430    }
13431    uformat = PyUnicode_FromObject(format);
13432    if (uformat == NULL)
13433        return NULL;
13434    if (PyUnicode_READY(uformat) == -1) {
13435        Py_DECREF(uformat);
13436        return NULL;
13437    }
13438
13439    fmt = PyUnicode_DATA(uformat);
13440    fmtkind = PyUnicode_KIND(uformat);
13441    fmtcnt = PyUnicode_GET_LENGTH(uformat);
13442    fmtpos = 0;
13443
13444    _PyUnicodeWriter_Init(&writer, fmtcnt + 100);
13445
13446    if (PyTuple_Check(args)) {
13447        arglen = PyTuple_Size(args);
13448        argidx = 0;
13449    }
13450    else {
13451        arglen = -1;
13452        argidx = -2;
13453    }
13454    if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
13455        dict = args;
13456
13457    while (--fmtcnt >= 0) {
13458        if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13459            Py_ssize_t nonfmtpos;
13460            nonfmtpos = fmtpos++;
13461            while (fmtcnt >= 0 &&
13462                   PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13463                fmtpos++;
13464                fmtcnt--;
13465            }
13466            if (fmtcnt < 0)
13467                fmtpos--;
13468            sublen = fmtpos - nonfmtpos;
13469            maxchar = _PyUnicode_FindMaxChar(uformat,
13470                                             nonfmtpos, nonfmtpos + sublen);
13471            if (_PyUnicodeWriter_Prepare(&writer, sublen, maxchar) == -1)
13472                goto onError;
13473
13474            _PyUnicode_FastCopyCharacters(writer.buffer, writer.pos,
13475                                          uformat, nonfmtpos, sublen);
13476            writer.pos += sublen;
13477        }
13478        else {
13479            /* Got a format specifier */
13480            int flags = 0;
13481            Py_ssize_t width = -1;
13482            int prec = -1;
13483            Py_UCS4 c = '\0';
13484            Py_UCS4 fill;
13485            int sign;
13486            Py_UCS4 signchar;
13487            int isnumok;
13488            PyObject *v = NULL;
13489            void *pbuf = NULL;
13490            Py_ssize_t pindex, len;
13491            Py_UCS4 bufmaxchar;
13492            Py_ssize_t buflen;
13493
13494            fmtpos++;
13495            c = PyUnicode_READ(fmtkind, fmt, fmtpos);
13496            if (c == '(') {
13497                Py_ssize_t keystart;
13498                Py_ssize_t keylen;
13499                PyObject *key;
13500                int pcount = 1;
13501
13502                if (dict == NULL) {
13503                    PyErr_SetString(PyExc_TypeError,
13504                                    "format requires a mapping");
13505                    goto onError;
13506                }
13507                ++fmtpos;
13508                --fmtcnt;
13509                keystart = fmtpos;
13510                /* Skip over balanced parentheses */
13511                while (pcount > 0 && --fmtcnt >= 0) {
13512                    c = PyUnicode_READ(fmtkind, fmt, fmtpos);
13513                    if (c == ')')
13514                        --pcount;
13515                    else if (c == '(')
13516                        ++pcount;
13517                    fmtpos++;
13518                }
13519                keylen = fmtpos - keystart - 1;
13520                if (fmtcnt < 0 || pcount > 0) {
13521                    PyErr_SetString(PyExc_ValueError,
13522                                    "incomplete format key");
13523                    goto onError;
13524                }
13525                key = PyUnicode_Substring(uformat,
13526                                          keystart, keystart + keylen);
13527                if (key == NULL)
13528                    goto onError;
13529                if (args_owned) {
13530                    Py_DECREF(args);
13531                    args_owned = 0;
13532                }
13533                args = PyObject_GetItem(dict, key);
13534                Py_DECREF(key);
13535                if (args == NULL) {
13536                    goto onError;
13537                }
13538                args_owned = 1;
13539                arglen = -1;
13540                argidx = -2;
13541            }
13542            while (--fmtcnt >= 0) {
13543                c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13544                switch (c) {
13545                case '-': flags |= F_LJUST; continue;
13546                case '+': flags |= F_SIGN; continue;
13547                case ' ': flags |= F_BLANK; continue;
13548                case '#': flags |= F_ALT; continue;
13549                case '0': flags |= F_ZERO; continue;
13550                }
13551                break;
13552            }
13553            if (c == '*') {
13554                v = getnextarg(args, arglen, &argidx);
13555                if (v == NULL)
13556                    goto onError;
13557                if (!PyLong_Check(v)) {
13558                    PyErr_SetString(PyExc_TypeError,
13559                                    "* wants int");
13560                    goto onError;
13561                }
13562                width = PyLong_AsLong(v);
13563                if (width == -1 && PyErr_Occurred())
13564                    goto onError;
13565                if (width < 0) {
13566                    flags |= F_LJUST;
13567                    width = -width;
13568                }
13569                if (--fmtcnt >= 0)
13570                    c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13571            }
13572            else if (c >= '0' && c <= '9') {
13573                width = c - '0';
13574                while (--fmtcnt >= 0) {
13575                    c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13576                    if (c < '0' || c > '9')
13577                        break;
13578                    /* Since c is unsigned, the RHS would end up as unsigned,
13579                       mixing signed and unsigned comparison. Since c is between
13580                       '0' and '9', casting to int is safe. */
13581                    if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
13582                        PyErr_SetString(PyExc_ValueError,
13583                                        "width too big");
13584                        goto onError;
13585                    }
13586                    width = width*10 + (c - '0');
13587                }
13588            }
13589            if (c == '.') {
13590                prec = 0;
13591                if (--fmtcnt >= 0)
13592                    c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13593                if (c == '*') {
13594                    v = getnextarg(args, arglen, &argidx);
13595                    if (v == NULL)
13596                        goto onError;
13597                    if (!PyLong_Check(v)) {
13598                        PyErr_SetString(PyExc_TypeError,
13599                                        "* wants int");
13600                        goto onError;
13601                    }
13602                    prec = PyLong_AsLong(v);
13603                    if (prec == -1 && PyErr_Occurred())
13604                        goto onError;
13605                    if (prec < 0)
13606                        prec = 0;
13607                    if (--fmtcnt >= 0)
13608                        c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13609                }
13610                else if (c >= '0' && c <= '9') {
13611                    prec = c - '0';
13612                    while (--fmtcnt >= 0) {
13613                        c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13614                        if (c < '0' || c > '9')
13615                            break;
13616                        if (prec > (INT_MAX - ((int)c - '0')) / 10) {
13617                            PyErr_SetString(PyExc_ValueError,
13618                                            "prec too big");
13619                            goto onError;
13620                        }
13621                        prec = prec*10 + (c - '0');
13622                    }
13623                }
13624            } /* prec */
13625            if (fmtcnt >= 0) {
13626                if (c == 'h' || c == 'l' || c == 'L') {
13627                    if (--fmtcnt >= 0)
13628                        c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13629                }
13630            }
13631            if (fmtcnt < 0) {
13632                PyErr_SetString(PyExc_ValueError,
13633                                "incomplete format");
13634                goto onError;
13635            }
13636            if (fmtcnt == 0)
13637                writer.overallocate = 0;
13638
13639            if (c == '%') {
13640                if (_PyUnicodeWriter_Prepare(&writer, 1, '%') == -1)
13641                    goto onError;
13642                PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '%');
13643                writer.pos += 1;
13644                continue;
13645            }
13646
13647            v = getnextarg(args, arglen, &argidx);
13648            if (v == NULL)
13649                goto onError;
13650
13651            sign = 0;
13652            signchar = '\0';
13653            fill = ' ';
13654            switch (c) {
13655
13656            case 's':
13657            case 'r':
13658            case 'a':
13659                if (PyLong_CheckExact(v) && width == -1 && prec == -1) {
13660                    /* Fast path */
13661                    if (_PyLong_FormatWriter(&writer, v, 10, flags & F_ALT) == -1)
13662                        goto onError;
13663                    goto nextarg;
13664                }
13665
13666                if (PyUnicode_CheckExact(v) && c == 's') {
13667                    temp = v;
13668                    Py_INCREF(temp);
13669                }
13670                else {
13671                    if (c == 's')
13672                        temp = PyObject_Str(v);
13673                    else if (c == 'r')
13674                        temp = PyObject_Repr(v);
13675                    else
13676                        temp = PyObject_ASCII(v);
13677                }
13678                break;
13679
13680            case 'i':
13681            case 'd':
13682            case 'u':
13683            case 'o':
13684            case 'x':
13685            case 'X':
13686                if (PyLong_CheckExact(v)
13687                    && width == -1 && prec == -1
13688                    && !(flags & (F_SIGN | F_BLANK)))
13689                {
13690                    /* Fast path */
13691                    switch(c)
13692                    {
13693                    case 'd':
13694                    case 'i':
13695                    case 'u':
13696                        if (_PyLong_FormatWriter(&writer, v, 10, flags & F_ALT) == -1)
13697                            goto onError;
13698                        goto nextarg;
13699                    case 'x':
13700                        if (_PyLong_FormatWriter(&writer, v, 16, flags & F_ALT) == -1)
13701                            goto onError;
13702                        goto nextarg;
13703                    case 'o':
13704                        if (_PyLong_FormatWriter(&writer, v, 8, flags & F_ALT) == -1)
13705                            goto onError;
13706                        goto nextarg;
13707                    default:
13708                        break;
13709                    }
13710                }
13711
13712                isnumok = 0;
13713                if (PyNumber_Check(v)) {
13714                    PyObject *iobj=NULL;
13715
13716                    if (PyLong_Check(v)) {
13717                        iobj = v;
13718                        Py_INCREF(iobj);
13719                    }
13720                    else {
13721                        iobj = PyNumber_Long(v);
13722                    }
13723                    if (iobj!=NULL) {
13724                        if (PyLong_Check(iobj)) {
13725                            isnumok = 1;
13726                            sign = 1;
13727                            temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
13728                            Py_DECREF(iobj);
13729                        }
13730                        else {
13731                            Py_DECREF(iobj);
13732                        }
13733                    }
13734                }
13735                if (!isnumok) {
13736                    PyErr_Format(PyExc_TypeError,
13737                                 "%%%c format: a number is required, "
13738                                 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13739                    goto onError;
13740                }
13741                if (flags & F_ZERO)
13742                    fill = '0';
13743                break;
13744
13745            case 'e':
13746            case 'E':
13747            case 'f':
13748            case 'F':
13749            case 'g':
13750            case 'G':
13751                if (width == -1 && prec == -1
13752                    && !(flags & (F_SIGN | F_BLANK)))
13753                {
13754                    /* Fast path */
13755                    if (formatfloat(v, flags, prec, c, NULL, &writer) == -1)
13756                        goto onError;
13757                    goto nextarg;
13758                }
13759
13760                sign = 1;
13761                if (flags & F_ZERO)
13762                    fill = '0';
13763                if (formatfloat(v, flags, prec, c, &temp, NULL) == -1)
13764                    temp = NULL;
13765                break;
13766
13767            case 'c':
13768            {
13769                Py_UCS4 ch = formatchar(v);
13770                if (ch == (Py_UCS4) -1)
13771                    goto onError;
13772                if (width == -1 && prec == -1) {
13773                    /* Fast path */
13774                    if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
13775                        goto onError;
13776                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
13777                    writer.pos += 1;
13778                    goto nextarg;
13779                }
13780                temp = PyUnicode_FromOrdinal(ch);
13781                break;
13782            }
13783
13784            default:
13785                PyErr_Format(PyExc_ValueError,
13786                             "unsupported format character '%c' (0x%x) "
13787                             "at index %zd",
13788                             (31<=c && c<=126) ? (char)c : '?',
13789                             (int)c,
13790                             fmtpos - 1);
13791                goto onError;
13792            }
13793            if (temp == NULL)
13794                goto onError;
13795            assert (PyUnicode_Check(temp));
13796
13797            if (width == -1 && prec == -1
13798                && !(flags & (F_SIGN | F_BLANK)))
13799            {
13800                /* Fast path */
13801                if (_PyUnicodeWriter_WriteStr(&writer, temp) == -1)
13802                    goto onError;
13803                goto nextarg;
13804            }
13805
13806            if (PyUnicode_READY(temp) == -1) {
13807                Py_CLEAR(temp);
13808                goto onError;
13809            }
13810            kind = PyUnicode_KIND(temp);
13811            pbuf = PyUnicode_DATA(temp);
13812            len = PyUnicode_GET_LENGTH(temp);
13813
13814            if (c == 's' || c == 'r' || c == 'a') {
13815                if (prec >= 0 && len > prec)
13816                    len = prec;
13817            }
13818
13819            /* pbuf is initialized here. */
13820            pindex = 0;
13821            if (sign) {
13822                Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
13823                if (ch == '-' || ch == '+') {
13824                    signchar = ch;
13825                    len--;
13826                    pindex++;
13827                }
13828                else if (flags & F_SIGN)
13829                    signchar = '+';
13830                else if (flags & F_BLANK)
13831                    signchar = ' ';
13832                else
13833                    sign = 0;
13834            }
13835            if (width < len)
13836                width = len;
13837
13838            /* Compute the length and maximum character of the
13839               written characters */
13840            bufmaxchar = 127;
13841            if (!(flags & F_LJUST)) {
13842                if (sign) {
13843                    if ((width-1) > len)
13844                        bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
13845                }
13846                else {
13847                    if (width > len)
13848                        bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
13849                }
13850            }
13851            maxchar = _PyUnicode_FindMaxChar(temp, 0, pindex+len);
13852            bufmaxchar = MAX_MAXCHAR(bufmaxchar, maxchar);
13853
13854            buflen = width;
13855            if (sign && len == width)
13856                buflen++;
13857
13858            if (_PyUnicodeWriter_Prepare(&writer, buflen, bufmaxchar) == -1)
13859                goto onError;
13860
13861            /* Write characters */
13862            if (sign) {
13863                if (fill != ' ') {
13864                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos, signchar);
13865                    writer.pos += 1;
13866                }
13867                if (width > len)
13868                    width--;
13869            }
13870            if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
13871                assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13872                assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
13873                if (fill != ' ') {
13874                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
13875                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
13876                    writer.pos += 2;
13877                    pindex += 2;
13878                }
13879                width -= 2;
13880                if (width < 0)
13881                    width = 0;
13882                len -= 2;
13883            }
13884            if (width > len && !(flags & F_LJUST)) {
13885                sublen = width - len;
13886                FILL(writer.kind, writer.data, fill, writer.pos, sublen);
13887                writer.pos += sublen;
13888                width = len;
13889            }
13890            if (fill == ' ') {
13891                if (sign) {
13892                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos, signchar);
13893                    writer.pos += 1;
13894                }
13895                if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
13896                    assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13897                    assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
13898                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
13899                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
13900                    writer.pos += 2;
13901                    pindex += 2;
13902                }
13903            }
13904
13905            if (len) {
13906                _PyUnicode_FastCopyCharacters(writer.buffer, writer.pos,
13907                                              temp, pindex, len);
13908                writer.pos += len;
13909            }
13910            if (width > len) {
13911                sublen = width - len;
13912                FILL(writer.kind, writer.data, ' ', writer.pos, sublen);
13913                writer.pos += sublen;
13914            }
13915
13916nextarg:
13917            if (dict && (argidx < arglen) && c != '%') {
13918                PyErr_SetString(PyExc_TypeError,
13919                                "not all arguments converted during string formatting");
13920                goto onError;
13921            }
13922            Py_CLEAR(temp);
13923        } /* '%' */
13924    } /* until end */
13925    if (argidx < arglen && !dict) {
13926        PyErr_SetString(PyExc_TypeError,
13927                        "not all arguments converted during string formatting");
13928        goto onError;
13929    }
13930
13931    if (args_owned) {
13932        Py_DECREF(args);
13933    }
13934    Py_DECREF(uformat);
13935    Py_XDECREF(temp);
13936    Py_XDECREF(second);
13937    return _PyUnicodeWriter_Finish(&writer);
13938
13939  onError:
13940    Py_DECREF(uformat);
13941    Py_XDECREF(temp);
13942    Py_XDECREF(second);
13943    _PyUnicodeWriter_Dealloc(&writer);
13944    if (args_owned) {
13945        Py_DECREF(args);
13946    }
13947    return NULL;
13948}
13949
13950static PyObject *
13951unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13952
13953static PyObject *
13954unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13955{
13956    PyObject *x = NULL;
13957    static char *kwlist[] = {"object", "encoding", "errors", 0};
13958    char *encoding = NULL;
13959    char *errors = NULL;
13960
13961    if (type != &PyUnicode_Type)
13962        return unicode_subtype_new(type, args, kwds);
13963    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
13964                                     kwlist, &x, &encoding, &errors))
13965        return NULL;
13966    if (x == NULL) {
13967        Py_INCREF(unicode_empty);
13968        return unicode_empty;
13969    }
13970    if (encoding == NULL && errors == NULL)
13971        return PyObject_Str(x);
13972    else
13973        return PyUnicode_FromEncodedObject(x, encoding, errors);
13974}
13975
13976static PyObject *
13977unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13978{
13979    PyObject *unicode, *self;
13980    Py_ssize_t length, char_size;
13981    int share_wstr, share_utf8;
13982    unsigned int kind;
13983    void *data;
13984
13985    assert(PyType_IsSubtype(type, &PyUnicode_Type));
13986
13987    unicode = unicode_new(&PyUnicode_Type, args, kwds);
13988    if (unicode == NULL)
13989        return NULL;
13990    assert(_PyUnicode_CHECK(unicode));
13991    if (PyUnicode_READY(unicode) == -1) {
13992        Py_DECREF(unicode);
13993        return NULL;
13994    }
13995
13996    self = type->tp_alloc(type, 0);
13997    if (self == NULL) {
13998        Py_DECREF(unicode);
13999        return NULL;
14000    }
14001    kind = PyUnicode_KIND(unicode);
14002    length = PyUnicode_GET_LENGTH(unicode);
14003
14004    _PyUnicode_LENGTH(self) = length;
14005#ifdef Py_DEBUG
14006    _PyUnicode_HASH(self) = -1;
14007#else
14008    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14009#endif
14010    _PyUnicode_STATE(self).interned = 0;
14011    _PyUnicode_STATE(self).kind = kind;
14012    _PyUnicode_STATE(self).compact = 0;
14013    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
14014    _PyUnicode_STATE(self).ready = 1;
14015    _PyUnicode_WSTR(self) = NULL;
14016    _PyUnicode_UTF8_LENGTH(self) = 0;
14017    _PyUnicode_UTF8(self) = NULL;
14018    _PyUnicode_WSTR_LENGTH(self) = 0;
14019    _PyUnicode_DATA_ANY(self) = NULL;
14020
14021    share_utf8 = 0;
14022    share_wstr = 0;
14023    if (kind == PyUnicode_1BYTE_KIND) {
14024        char_size = 1;
14025        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14026            share_utf8 = 1;
14027    }
14028    else if (kind == PyUnicode_2BYTE_KIND) {
14029        char_size = 2;
14030        if (sizeof(wchar_t) == 2)
14031            share_wstr = 1;
14032    }
14033    else {
14034        assert(kind == PyUnicode_4BYTE_KIND);
14035        char_size = 4;
14036        if (sizeof(wchar_t) == 4)
14037            share_wstr = 1;
14038    }
14039
14040    /* Ensure we won't overflow the length. */
14041    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14042        PyErr_NoMemory();
14043        goto onError;
14044    }
14045    data = PyObject_MALLOC((length + 1) * char_size);
14046    if (data == NULL) {
14047        PyErr_NoMemory();
14048        goto onError;
14049    }
14050
14051    _PyUnicode_DATA_ANY(self) = data;
14052    if (share_utf8) {
14053        _PyUnicode_UTF8_LENGTH(self) = length;
14054        _PyUnicode_UTF8(self) = data;
14055    }
14056    if (share_wstr) {
14057        _PyUnicode_WSTR_LENGTH(self) = length;
14058        _PyUnicode_WSTR(self) = (wchar_t *)data;
14059    }
14060
14061    Py_MEMCPY(data, PyUnicode_DATA(unicode),
14062              kind * (length + 1));
14063    assert(_PyUnicode_CheckConsistency(self, 1));
14064#ifdef Py_DEBUG
14065    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14066#endif
14067    Py_DECREF(unicode);
14068    return self;
14069
14070onError:
14071    Py_DECREF(unicode);
14072    Py_DECREF(self);
14073    return NULL;
14074}
14075
14076PyDoc_STRVAR(unicode_doc,
14077"str(object='') -> str\n\
14078str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
14079\n\
14080Create a new string object from the given object. If encoding or\n\
14081errors is specified, then the object must expose a data buffer\n\
14082that will be decoded using the given encoding and error handler.\n\
14083Otherwise, returns the result of object.__str__() (if defined)\n\
14084or repr(object).\n\
14085encoding defaults to sys.getdefaultencoding().\n\
14086errors defaults to 'strict'.");
14087
14088static PyObject *unicode_iter(PyObject *seq);
14089
14090PyTypeObject PyUnicode_Type = {
14091    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14092    "str",              /* tp_name */
14093    sizeof(PyUnicodeObject),        /* tp_size */
14094    0,                  /* tp_itemsize */
14095    /* Slots */
14096    (destructor)unicode_dealloc,    /* tp_dealloc */
14097    0,                  /* tp_print */
14098    0,                  /* tp_getattr */
14099    0,                  /* tp_setattr */
14100    0,                  /* tp_reserved */
14101    unicode_repr,           /* tp_repr */
14102    &unicode_as_number,         /* tp_as_number */
14103    &unicode_as_sequence,       /* tp_as_sequence */
14104    &unicode_as_mapping,        /* tp_as_mapping */
14105    (hashfunc) unicode_hash,        /* tp_hash*/
14106    0,                  /* tp_call*/
14107    (reprfunc) unicode_str,     /* tp_str */
14108    PyObject_GenericGetAttr,        /* tp_getattro */
14109    0,                  /* tp_setattro */
14110    0,                  /* tp_as_buffer */
14111    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
14112    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
14113    unicode_doc,            /* tp_doc */
14114    0,                  /* tp_traverse */
14115    0,                  /* tp_clear */
14116    PyUnicode_RichCompare,      /* tp_richcompare */
14117    0,                  /* tp_weaklistoffset */
14118    unicode_iter,           /* tp_iter */
14119    0,                  /* tp_iternext */
14120    unicode_methods,            /* tp_methods */
14121    0,                  /* tp_members */
14122    0,                  /* tp_getset */
14123    &PyBaseObject_Type,         /* tp_base */
14124    0,                  /* tp_dict */
14125    0,                  /* tp_descr_get */
14126    0,                  /* tp_descr_set */
14127    0,                  /* tp_dictoffset */
14128    0,                  /* tp_init */
14129    0,                  /* tp_alloc */
14130    unicode_new,            /* tp_new */
14131    PyObject_Del,           /* tp_free */
14132};
14133
14134/* Initialize the Unicode implementation */
14135
14136int _PyUnicode_Init(void)
14137{
14138    int i;
14139
14140    /* XXX - move this array to unicodectype.c ? */
14141    Py_UCS2 linebreak[] = {
14142        0x000A, /* LINE FEED */
14143        0x000D, /* CARRIAGE RETURN */
14144        0x001C, /* FILE SEPARATOR */
14145        0x001D, /* GROUP SEPARATOR */
14146        0x001E, /* RECORD SEPARATOR */
14147        0x0085, /* NEXT LINE */
14148        0x2028, /* LINE SEPARATOR */
14149        0x2029, /* PARAGRAPH SEPARATOR */
14150    };
14151
14152    /* Init the implementation */
14153    unicode_empty = PyUnicode_New(0, 0);
14154    if (!unicode_empty)
14155        Py_FatalError("Can't create empty string");
14156    assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
14157
14158    for (i = 0; i < 256; i++)
14159        unicode_latin1[i] = NULL;
14160    if (PyType_Ready(&PyUnicode_Type) < 0)
14161        Py_FatalError("Can't initialize 'unicode'");
14162
14163    /* initialize the linebreak bloom filter */
14164    bloom_linebreak = make_bloom_mask(
14165        PyUnicode_2BYTE_KIND, linebreak,
14166        Py_ARRAY_LENGTH(linebreak));
14167
14168    PyType_Ready(&EncodingMapType);
14169
14170    if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14171        Py_FatalError("Can't initialize field name iterator type");
14172
14173    if (PyType_Ready(&PyFormatterIter_Type) < 0)
14174        Py_FatalError("Can't initialize formatter iter type");
14175
14176#ifdef HAVE_MBCS
14177    winver.dwOSVersionInfoSize = sizeof(winver);
14178    if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14179        PyErr_SetFromWindowsErr(0);
14180        return -1;
14181    }
14182#endif
14183    return 0;
14184}
14185
14186/* Finalize the Unicode implementation */
14187
/* Nothing is released here; the function always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
14193
14194void
14195_PyUnicode_Fini(void)
14196{
14197    int i;
14198
14199    Py_XDECREF(unicode_empty);
14200    unicode_empty = NULL;
14201
14202    for (i = 0; i < 256; i++) {
14203        if (unicode_latin1[i]) {
14204            Py_DECREF(unicode_latin1[i]);
14205            unicode_latin1[i] = NULL;
14206        }
14207    }
14208    _PyUnicode_ClearStaticStrings();
14209    (void)PyUnicode_ClearFreeList();
14210}
14211
14212void
14213PyUnicode_InternInPlace(PyObject **p)
14214{
14215    register PyObject *s = *p;
14216    PyObject *t;
14217#ifdef Py_DEBUG
14218    assert(s != NULL);
14219    assert(_PyUnicode_CHECK(s));
14220#else
14221    if (s == NULL || !PyUnicode_Check(s))
14222        return;
14223#endif
14224    /* If it's a subclass, we don't really know what putting
14225       it in the interned dict might do. */
14226    if (!PyUnicode_CheckExact(s))
14227        return;
14228    if (PyUnicode_CHECK_INTERNED(s))
14229        return;
14230    if (interned == NULL) {
14231        interned = PyDict_New();
14232        if (interned == NULL) {
14233            PyErr_Clear(); /* Don't leave an exception */
14234            return;
14235        }
14236    }
14237    /* It might be that the GetItem call fails even
14238       though the key is present in the dictionary,
14239       namely when this happens during a stack overflow. */
14240    Py_ALLOW_RECURSION
14241    t = PyDict_GetItem(interned, s);
14242    Py_END_ALLOW_RECURSION
14243
14244        if (t) {
14245            Py_INCREF(t);
14246            Py_DECREF(*p);
14247            *p = t;
14248            return;
14249        }
14250
14251    PyThreadState_GET()->recursion_critical = 1;
14252    if (PyDict_SetItem(interned, s, s) < 0) {
14253        PyErr_Clear();
14254        PyThreadState_GET()->recursion_critical = 0;
14255        return;
14256    }
14257    PyThreadState_GET()->recursion_critical = 0;
14258    /* The two references in interned are not counted by refcnt.
14259       The deallocator will take care of this */
14260    Py_REFCNT(s) -= 2;
14261    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
14262}
14263
14264void
14265PyUnicode_InternImmortal(PyObject **p)
14266{
14267    PyUnicode_InternInPlace(p);
14268    if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
14269        _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
14270        Py_INCREF(*p);
14271    }
14272}
14273
14274PyObject *
14275PyUnicode_InternFromString(const char *cp)
14276{
14277    PyObject *s = PyUnicode_FromString(cp);
14278    if (s == NULL)
14279        return NULL;
14280    PyUnicode_InternInPlace(&s);
14281    return s;
14282}
14283
14284void
14285_Py_ReleaseInternedUnicodeStrings(void)
14286{
14287    PyObject *keys;
14288    PyObject *s;
14289    Py_ssize_t i, n;
14290    Py_ssize_t immortal_size = 0, mortal_size = 0;
14291
14292    if (interned == NULL || !PyDict_Check(interned))
14293        return;
14294    keys = PyDict_Keys(interned);
14295    if (keys == NULL || !PyList_Check(keys)) {
14296        PyErr_Clear();
14297        return;
14298    }
14299
14300    /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14301       detector, interned unicode strings are not forcibly deallocated;
14302       rather, we give them their stolen references back, and then clear
14303       and DECREF the interned dict. */
14304
14305    n = PyList_GET_SIZE(keys);
14306    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
14307            n);
14308    for (i = 0; i < n; i++) {
14309        s = PyList_GET_ITEM(keys, i);
14310        if (PyUnicode_READY(s) == -1) {
14311            assert(0 && "could not ready string");
14312            fprintf(stderr, "could not ready string\n");
14313        }
14314        switch (PyUnicode_CHECK_INTERNED(s)) {
14315        case SSTATE_NOT_INTERNED:
14316            /* XXX Shouldn't happen */
14317            break;
14318        case SSTATE_INTERNED_IMMORTAL:
14319            Py_REFCNT(s) += 1;
14320            immortal_size += PyUnicode_GET_LENGTH(s);
14321            break;
14322        case SSTATE_INTERNED_MORTAL:
14323            Py_REFCNT(s) += 2;
14324            mortal_size += PyUnicode_GET_LENGTH(s);
14325            break;
14326        default:
14327            Py_FatalError("Inconsistent interned string state.");
14328        }
14329        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
14330    }
14331    fprintf(stderr, "total size of all interned strings: "
14332            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14333            "mortal/immortal\n", mortal_size, immortal_size);
14334    Py_DECREF(keys);
14335    PyDict_Clear(interned);
14336    Py_DECREF(interned);
14337    interned = NULL;
14338}
14339
14340
14341/********************* Unicode Iterator **************************/
14342
14343typedef struct {
14344    PyObject_HEAD
14345    Py_ssize_t it_index;
14346    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
14347} unicodeiterobject;
14348
14349static void
14350unicodeiter_dealloc(unicodeiterobject *it)
14351{
14352    _PyObject_GC_UNTRACK(it);
14353    Py_XDECREF(it->it_seq);
14354    PyObject_GC_Del(it);
14355}
14356
14357static int
14358unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14359{
14360    Py_VISIT(it->it_seq);
14361    return 0;
14362}
14363
14364static PyObject *
14365unicodeiter_next(unicodeiterobject *it)
14366{
14367    PyObject *seq, *item;
14368
14369    assert(it != NULL);
14370    seq = it->it_seq;
14371    if (seq == NULL)
14372        return NULL;
14373    assert(_PyUnicode_CHECK(seq));
14374
14375    if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14376        int kind = PyUnicode_KIND(seq);
14377        void *data = PyUnicode_DATA(seq);
14378        Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14379        item = PyUnicode_FromOrdinal(chr);
14380        if (item != NULL)
14381            ++it->it_index;
14382        return item;
14383    }
14384
14385    Py_DECREF(seq);
14386    it->it_seq = NULL;
14387    return NULL;
14388}
14389
14390static PyObject *
14391unicodeiter_len(unicodeiterobject *it)
14392{
14393    Py_ssize_t len = 0;
14394    if (it->it_seq)
14395        len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
14396    return PyLong_FromSsize_t(len);
14397}
14398
14399PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14400
14401static PyObject *
14402unicodeiter_reduce(unicodeiterobject *it)
14403{
14404    if (it->it_seq != NULL) {
14405        return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
14406                             it->it_seq, it->it_index);
14407    } else {
14408        PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14409        if (u == NULL)
14410            return NULL;
14411        return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
14412    }
14413}
14414
14415PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14416
14417static PyObject *
14418unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14419{
14420    Py_ssize_t index = PyLong_AsSsize_t(state);
14421    if (index == -1 && PyErr_Occurred())
14422        return NULL;
14423    if (index < 0)
14424        index = 0;
14425    it->it_index = index;
14426    Py_RETURN_NONE;
14427}
14428
14429PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14430
14431static PyMethodDef unicodeiter_methods[] = {
14432    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
14433     length_hint_doc},
14434    {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14435     reduce_doc},
14436    {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
14437     setstate_doc},
14438    {NULL,      NULL}       /* sentinel */
14439};
14440
14441PyTypeObject PyUnicodeIter_Type = {
14442    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14443    "str_iterator",         /* tp_name */
14444    sizeof(unicodeiterobject),      /* tp_basicsize */
14445    0,                  /* tp_itemsize */
14446    /* methods */
14447    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
14448    0,                  /* tp_print */
14449    0,                  /* tp_getattr */
14450    0,                  /* tp_setattr */
14451    0,                  /* tp_reserved */
14452    0,                  /* tp_repr */
14453    0,                  /* tp_as_number */
14454    0,                  /* tp_as_sequence */
14455    0,                  /* tp_as_mapping */
14456    0,                  /* tp_hash */
14457    0,                  /* tp_call */
14458    0,                  /* tp_str */
14459    PyObject_GenericGetAttr,        /* tp_getattro */
14460    0,                  /* tp_setattro */
14461    0,                  /* tp_as_buffer */
14462    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14463    0,                  /* tp_doc */
14464    (traverseproc)unicodeiter_traverse, /* tp_traverse */
14465    0,                  /* tp_clear */
14466    0,                  /* tp_richcompare */
14467    0,                  /* tp_weaklistoffset */
14468    PyObject_SelfIter,          /* tp_iter */
14469    (iternextfunc)unicodeiter_next,     /* tp_iternext */
14470    unicodeiter_methods,            /* tp_methods */
14471    0,
14472};
14473
14474static PyObject *
14475unicode_iter(PyObject *seq)
14476{
14477    unicodeiterobject *it;
14478
14479    if (!PyUnicode_Check(seq)) {
14480        PyErr_BadInternalCall();
14481        return NULL;
14482    }
14483    if (PyUnicode_READY(seq) == -1)
14484        return NULL;
14485    it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14486    if (it == NULL)
14487        return NULL;
14488    it->it_index = 0;
14489    Py_INCREF(seq);
14490    it->it_seq = seq;
14491    _PyObject_GC_TRACK(it);
14492    return (PyObject *)it;
14493}
14494
14495
/* Return the number of Py_UNICODE elements in `u` before the terminating
   null element.  The count is kept in a size_t, matching the return type,
   so it cannot overflow or be truncated the way an int counter could on
   very long strings. */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    size_t res = 0;
    while (*u++)
        res++;
    return res;
}
14504
/* Copy `s2`, including its terminating null element, into `s1`.
   Returns `s1`. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;

    for (;;) {
        Py_UNICODE ch = *s2++;
        *dst++ = ch;
        if (ch == 0)
            break;
    }
    return s1;
}
14512
/* Copy at most `n` elements of `s2` into `s1` and return `s1`.
 *
 * Copying stops right after the terminating null element has been copied.
 * If `s2` holds `n` or more elements before its terminator, exactly `n`
 * elements are written and the result is not null-terminated.  The rest
 * of `s1` is not padded.
 *
 * The limit is tested before each store, so no more than `n` elements are
 * ever written (and nothing at all for n == 0).
 */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    Py_UNICODE *u = s1;
    for (; n > 0; n--) {
        if ((*u++ = *s2++) == 0)
            break;
    }
    return s1;
}
14522
/* Append `s2`, including its terminating null element, to the end of the
   null-terminated string `s1`.  Returns `s1`. */
Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *tail = s1;

    /* Locate the terminator of s1 ... */
    while (*tail)
        tail++;
    /* ... and copy s2 over it, terminator included. */
    while ((*tail++ = *s2++) != 0)
        ;
    return s1;
}
14531
/* Compare two null-terminated Py_UNICODE strings.
   Returns -1, 0 or +1.  When one string is a proper prefix of the other,
   the shorter one orders first. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    /* Skip the common prefix. */
    for (; *s1 && *s1 == *s2; s1++, s2++)
        ;

    if (*s1 == *s2)
        return 0;       /* both strings ended together */
    if (!*s1)
        return -1;      /* s1 is a proper prefix of s2 */
    if (!*s2)
        return 1;       /* s2 is a proper prefix of s1 */
    return (*s1 < *s2) ? -1 : +1;
}
14545
/* Compare at most `n` elements of two null-terminated Py_UNICODE strings.
   Returns -1, 0 or +1; comparison stops early at a common terminator. */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    while (n != 0) {
        const Py_UNICODE a = *s1++;
        const Py_UNICODE b = *s2++;

        if (a != b)
            return (a < b) ? -1 : +1;
        if (a == '\0')
            break;
        n--;
    }
    return 0;
}
14562
/* Return a pointer to the first occurrence of `c` in `s`, or NULL if it
   does not occur before the terminating null element.  The terminator
   itself is never matched. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    while (*s) {
        if (*s == c)
            return (Py_UNICODE*)s;
        s++;
    }
    return NULL;
}
14572
/* Return a pointer to the last occurrence of `c` in `s`, or NULL if it
   does not occur before the terminating null element. */
Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
    const Py_UNICODE *cursor = s;

    /* Advance to the terminator, then scan backwards. */
    while (*cursor)
        cursor++;
    while (cursor != s) {
        if (*--cursor == c)
            return (Py_UNICODE*)cursor;
    }
    return NULL;
}
14585
14586Py_UNICODE*
14587PyUnicode_AsUnicodeCopy(PyObject *unicode)
14588{
14589    Py_UNICODE *u, *copy;
14590    Py_ssize_t len, size;
14591
14592    if (!PyUnicode_Check(unicode)) {
14593        PyErr_BadArgument();
14594        return NULL;
14595    }
14596    u = PyUnicode_AsUnicodeAndSize(unicode, &len);
14597    if (u == NULL)
14598        return NULL;
14599    /* Ensure we won't overflow the size. */
14600    if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
14601        PyErr_NoMemory();
14602        return NULL;
14603    }
14604    size = len + 1; /* copy the null character */
14605    size *= sizeof(Py_UNICODE);
14606    copy = PyMem_Malloc(size);
14607    if (copy == NULL) {
14608        PyErr_NoMemory();
14609        return NULL;
14610    }
14611    memcpy(copy, u, size);
14612    return copy;
14613}
14614
14615/* A _string module, to export formatter_parser and formatter_field_name_split
14616   to the string.Formatter class implemented in Python. */
14617
14618static PyMethodDef _string_methods[] = {
14619    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14620     METH_O, PyDoc_STR("split the argument as a field name")},
14621    {"formatter_parser", (PyCFunction) formatter_parser,
14622     METH_O, PyDoc_STR("parse the argument as a format string")},
14623    {NULL, NULL}
14624};
14625
14626static struct PyModuleDef _string_module = {
14627    PyModuleDef_HEAD_INIT,
14628    "_string",
14629    PyDoc_STR("string helper module"),
14630    0,
14631    _string_methods,
14632    NULL,
14633    NULL,
14634    NULL,
14635    NULL
14636};
14637
14638PyMODINIT_FUNC
14639PyInit__string(void)
14640{
14641    return PyModule_Create(&_string_module);
14642}
14643
14644
14645#ifdef __cplusplus
14646}
14647#endif
14648