unicodeobject.c revision d6e8de179758bcb7a68a694b9b6086d02bf22c3e
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5Unicode Integration Proposal (see file Misc/unicode.txt).
6
7Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
10Copyright (c) Corporation for National Research Initiatives.
11
12--------------------------------------------------------------------
13The original string type implementation is:
14
15    Copyright (c) 1999 by Secret Labs AB
16    Copyright (c) 1999 by Fredrik Lundh
17
18By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
41
42#define PY_SSIZE_T_CLEAN
43#include "Python.h"
44#include "bytes_methods.h"
45
46#include "unicodeobject.h"
47#include "ucnhash.h"
48
49#ifdef MS_WINDOWS
50#include <windows.h>
51#endif
52
/* Upper bound for the number of objects kept on the Unicode free list */

#define PyUnicode_MAXFREELIST       1024

/* Threshold for the free list "keep alive" optimization.

   An object that is put on the free list keeps its character buffer
   allocated as long as its length is below this limit; buffers of
   longer strings are released right away. This saves malloc() calls
   for small Unicode objects.

   In the worst case this leaves PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of memory idle.

   A limit of 0 effectively switches the feature off.

   Note: This feature is experimental. If Unicode objects cause core
   dumps, try turning it off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Byte order selection; little endian unless WORDS_BIGENDIAN is defined */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
83
84/* --- Globals ------------------------------------------------------------
85
86   The globals are initialized by the _PyUnicode_Init() API and should
87   not be used before calling that API.
88
89*/
90
91
92#ifdef __cplusplus
93extern "C" {
94#endif
95
96/* This dictionary holds all interned unicode strings.  Note that references
97   to strings in this dictionary are *not* counted in the string's ob_refcnt.
98   When the interned string reaches a refcnt of 0 the string deallocation
99   function will delete the reference from this dictionary.
100
101   Another way to look at this is that to say that the actual reference
102   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
103*/
104static PyObject *interned;
105
106/* Free list for Unicode objects */
107static PyUnicodeObject *free_list;
108static int numfree;
109
110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114   shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
117/* Default encoding to use and assume when NULL is passed as encoding
118   parameter; it is fixed to "utf-8".  Always use the
119   PyUnicode_GetDefaultEncoding() API to access this global.
120
121   Don't forget to alter Py_FileSystemDefaultEncoding if you change the
122   hard coded default!
123*/
124static const char unicode_default_encoding[] = "utf-8";
125
/* Lookup table for quickly spotting the most common whitespace
   characters: entry c (0 <= c < 128) is 1 if c is whitespace, else 0.
   Each row below covers 16 character codes. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F:
     *   0x09 HORIZONTAL TABULATION
     *   0x0A LINE FEED
     *   0x0B VERTICAL TABULATION
     *   0x0C FORM FEED
     *   0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F:
     *   0x1C FILE SEPARATOR
     *   0x1D GROUP SEPARATOR
     *   0x1E RECORD SEPARATOR
     *   0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F:
     *   0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
156
/* Lookup table for line break characters: entry c (0 <= c < 128) is 1
   if c is a line break, else 0.  Each row below covers 16 character
   codes. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F:
     *   0x0A LINE FEED
     *   0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 - 0x1F:
     *   0x1C FILE SEPARATOR
     *   0x1D GROUP SEPARATOR
     *   0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
182
183
184Py_UNICODE
185PyUnicode_GetMax(void)
186{
187#ifdef Py_UNICODE_WIDE
188	return 0x10FFFF;
189#else
190	/* This is actually an illegal character, so it should
191	   not be passed to unichr. */
192	return 0xFFFF;
193#endif
194}
195
/* --- Bloom Filters ----------------------------------------------------- */

/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple a single bitmask is used; the bit index of a
   character is given by its 5 least significant bits. */

/* the linebreak mask is set up by Unicode_Init below */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Nonzero if the bit belonging to ch is set in mask.  The shifted
   constant is an unsigned long, like BLOOM_MASK itself: shifting a
   plain (signed) int 1 by 31 bits overflows, and on platforms with a
   64-bit long the result would also match the upper 32 bits of the
   mask. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Nonzero if ch is a line break.  Characters below 128 are looked up
   in ascii_linebreak; for all others the bloom mask is consulted first
   and Py_UNICODE_ISLINEBREAK() is only evaluated if the mask matches. */
#define BLOOM_LINEBREAK(ch) \
    ((ch) < 128U ? ascii_linebreak[(ch)] : \
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
213
214Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
215{
216    /* calculate simple bloom-style bitmask for a given unicode string */
217
218    long mask;
219    Py_ssize_t i;
220
221    mask = 0;
222    for (i = 0; i < len; i++)
223        mask |= (1 << (ptr[i] & 0x1F));
224
225    return mask;
226}
227
228Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
229{
230    Py_ssize_t i;
231
232    for (i = 0; i < setlen; i++)
233        if (set[i] == chr)
234            return 1;
235
236    return 0;
237}
238
/* Nonzero if chr is one of the setlen characters of set.  mask is
   tested first via BLOOM() so that unicode_member() only runs when the
   bit for chr is set; it is expected to be the bloom mask computed for
   set.  The expansion is parenthesized as a whole so that the macro
   can safely be negated or combined with other operators. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
241
242/* --- Unicode Object ----------------------------------------------------- */
243
244static
245int unicode_resize(register PyUnicodeObject *unicode,
246                      Py_ssize_t length)
247{
248    void *oldstr;
249
250    /* Shortcut if there's nothing much to do. */
251    if (unicode->length == length)
252	goto reset;
253
254    /* Resizing shared object (unicode_empty or single character
255       objects) in-place is not allowed. Use PyUnicode_Resize()
256       instead ! */
257
258    if (unicode == unicode_empty ||
259	(unicode->length == 1 &&
260	 unicode->str[0] < 256U &&
261	 unicode_latin1[unicode->str[0]] == unicode)) {
262        PyErr_SetString(PyExc_SystemError,
263                        "can't resize shared str objects");
264        return -1;
265    }
266
267    /* We allocate one more byte to make sure the string is Ux0000 terminated.
268       The overallocation is also used by fastsearch, which assumes that it's
269       safe to look at str[length] (without making any assumptions about what
270       it contains). */
271
272    oldstr = unicode->str;
273    unicode->str = PyObject_REALLOC(unicode->str,
274				    sizeof(Py_UNICODE) * (length + 1));
275    if (!unicode->str) {
276	unicode->str = (Py_UNICODE *)oldstr;
277        PyErr_NoMemory();
278        return -1;
279    }
280    unicode->str[length] = 0;
281    unicode->length = length;
282
283 reset:
284    /* Reset the object caches */
285    if (unicode->defenc) {
286        Py_DECREF(unicode->defenc);
287        unicode->defenc = NULL;
288    }
289    unicode->hash = -1;
290
291    return 0;
292}
293
294/* We allocate one more byte to make sure the string is
295   Ux0000 terminated; some code (e.g. new_identifier)
296   relies on that.
297
298   XXX This allocator could further be enhanced by assuring that the
299       free list never reduces its size below 1.
300
301*/
302
303static
304PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
305{
306    register PyUnicodeObject *unicode;
307
308    /* Optimization for empty strings */
309    if (length == 0 && unicode_empty != NULL) {
310        Py_INCREF(unicode_empty);
311        return unicode_empty;
312    }
313
314    /* Ensure we won't overflow the size. */
315    if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
316        return (PyUnicodeObject *)PyErr_NoMemory();
317    }
318
319    /* Unicode freelist & memory allocation */
320    if (free_list) {
321        unicode = free_list;
322        free_list = *(PyUnicodeObject **)unicode;
323        numfree--;
324	if (unicode->str) {
325	    /* Keep-Alive optimization: we only upsize the buffer,
326	       never downsize it. */
327	    if ((unicode->length < length) &&
328                unicode_resize(unicode, length) < 0) {
329		PyObject_DEL(unicode->str);
330		unicode->str = NULL;
331	    }
332	}
333        else {
334	    size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
335	    unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
336        }
337        PyObject_INIT(unicode, &PyUnicode_Type);
338    }
339    else {
340	size_t new_size;
341        unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
342        if (unicode == NULL)
343            return NULL;
344	new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
345	unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
346    }
347
348    if (!unicode->str) {
349	PyErr_NoMemory();
350	goto onError;
351    }
352    /* Initialize the first element to guard against cases where
353     * the caller fails before initializing str -- unicode_resize()
354     * reads str[0], and the Keep-Alive optimization can keep memory
355     * allocated for str alive across a call to unicode_dealloc(unicode).
356     * We don't want unicode_resize to read uninitialized memory in
357     * that case.
358     */
359    unicode->str[0] = 0;
360    unicode->str[length] = 0;
361    unicode->length = length;
362    unicode->hash = -1;
363    unicode->state = 0;
364    unicode->defenc = NULL;
365    return unicode;
366
367 onError:
368    /* XXX UNREF/NEWREF interface should be more symmetrical */
369    _Py_DEC_REFTOTAL;
370    _Py_ForgetReference((PyObject *)unicode);
371    PyObject_Del(unicode);
372    return NULL;
373}
374
375static
376void unicode_dealloc(register PyUnicodeObject *unicode)
377{
378    switch (PyUnicode_CHECK_INTERNED(unicode)) {
379        case SSTATE_NOT_INTERNED:
380            break;
381
382        case SSTATE_INTERNED_MORTAL:
383            /* revive dead object temporarily for DelItem */
384            Py_REFCNT(unicode) = 3;
385            if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
386                Py_FatalError(
387                    "deletion of interned string failed");
388            break;
389
390        case SSTATE_INTERNED_IMMORTAL:
391            Py_FatalError("Immortal interned string died.");
392
393        default:
394            Py_FatalError("Inconsistent interned string state.");
395    }
396
397    if (PyUnicode_CheckExact(unicode) &&
398	numfree < PyUnicode_MAXFREELIST) {
399        /* Keep-Alive optimization */
400	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
401	    PyObject_DEL(unicode->str);
402	    unicode->str = NULL;
403	    unicode->length = 0;
404	}
405	if (unicode->defenc) {
406	    Py_DECREF(unicode->defenc);
407	    unicode->defenc = NULL;
408	}
409	/* Add to free list */
410        *(PyUnicodeObject **)unicode = free_list;
411        free_list = unicode;
412        numfree++;
413    }
414    else {
415	PyObject_DEL(unicode->str);
416	Py_XDECREF(unicode->defenc);
417	Py_TYPE(unicode)->tp_free((PyObject *)unicode);
418    }
419}
420
421static
422int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
423{
424    register PyUnicodeObject *v;
425
426    /* Argument checks */
427    if (unicode == NULL) {
428	PyErr_BadInternalCall();
429	return -1;
430    }
431    v = *unicode;
432    if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
433	PyErr_BadInternalCall();
434	return -1;
435    }
436
437    /* Resizing unicode_empty and single character objects is not
438       possible since these are being shared. We simply return a fresh
439       copy with the same Unicode content. */
440    if (v->length != length &&
441	(v == unicode_empty || v->length == 1)) {
442	PyUnicodeObject *w = _PyUnicode_New(length);
443	if (w == NULL)
444	    return -1;
445	Py_UNICODE_COPY(w->str, v->str,
446			length < v->length ? length : v->length);
447	Py_DECREF(*unicode);
448	*unicode = w;
449	return 0;
450    }
451
452    /* Note that we don't have to modify *unicode for unshared Unicode
453       objects, since we can modify them in-place. */
454    return unicode_resize(v, length);
455}
456
457int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
458{
459    return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
460}
461
462PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
463				Py_ssize_t size)
464{
465    PyUnicodeObject *unicode;
466
467    /* If the Unicode data is known at construction time, we can apply
468       some optimizations which share commonly used objects. */
469    if (u != NULL) {
470
471	/* Optimization for empty strings */
472	if (size == 0 && unicode_empty != NULL) {
473	    Py_INCREF(unicode_empty);
474	    return (PyObject *)unicode_empty;
475	}
476
477	/* Single character Unicode objects in the Latin-1 range are
478	   shared when using this constructor */
479	if (size == 1 && *u < 256) {
480	    unicode = unicode_latin1[*u];
481	    if (!unicode) {
482		unicode = _PyUnicode_New(1);
483		if (!unicode)
484		    return NULL;
485		unicode->str[0] = *u;
486		unicode_latin1[*u] = unicode;
487	    }
488	    Py_INCREF(unicode);
489	    return (PyObject *)unicode;
490	}
491    }
492
493    unicode = _PyUnicode_New(size);
494    if (!unicode)
495        return NULL;
496
497    /* Copy the Unicode data into the new object */
498    if (u != NULL)
499	Py_UNICODE_COPY(unicode->str, u, size);
500
501    return (PyObject *)unicode;
502}
503
504PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
505{
506    PyUnicodeObject *unicode;
507
508	if (size < 0) {
509		PyErr_SetString(PyExc_SystemError,
510		    "Negative size passed to PyUnicode_FromStringAndSize");
511		return NULL;
512	}
513
514    /* If the Unicode data is known at construction time, we can apply
515       some optimizations which share commonly used objects.
516       Also, this means the input must be UTF-8, so fall back to the
517       UTF-8 decoder at the end. */
518    if (u != NULL) {
519
520	/* Optimization for empty strings */
521	if (size == 0 && unicode_empty != NULL) {
522	    Py_INCREF(unicode_empty);
523	    return (PyObject *)unicode_empty;
524	}
525
526	/* Single characters are shared when using this constructor.
527           Restrict to ASCII, since the input must be UTF-8. */
528	if (size == 1 && Py_CHARMASK(*u) < 128) {
529	    unicode = unicode_latin1[Py_CHARMASK(*u)];
530	    if (!unicode) {
531		unicode = _PyUnicode_New(1);
532		if (!unicode)
533		    return NULL;
534		unicode->str[0] = Py_CHARMASK(*u);
535		unicode_latin1[Py_CHARMASK(*u)] = unicode;
536	    }
537	    Py_INCREF(unicode);
538	    return (PyObject *)unicode;
539	}
540
541        return PyUnicode_DecodeUTF8(u, size, NULL);
542    }
543
544    unicode = _PyUnicode_New(size);
545    if (!unicode)
546        return NULL;
547
548    return (PyObject *)unicode;
549}
550
551PyObject *PyUnicode_FromString(const char *u)
552{
553    size_t size = strlen(u);
554    if (size > PY_SSIZE_T_MAX) {
555        PyErr_SetString(PyExc_OverflowError, "input too long");
556        return NULL;
557    }
558
559    return PyUnicode_FromStringAndSize(u, size);
560}
561
562#ifdef HAVE_WCHAR_H
563
564PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
565				 Py_ssize_t size)
566{
567    PyUnicodeObject *unicode;
568
569    if (w == NULL) {
570        if (size == 0)
571            return PyUnicode_FromStringAndSize(NULL, 0);
572	PyErr_BadInternalCall();
573	return NULL;
574    }
575
576    if (size == -1) {
577        size = wcslen(w);
578    }
579
580    unicode = _PyUnicode_New(size);
581    if (!unicode)
582        return NULL;
583
584    /* Copy the wchar_t data into the new object */
585#ifdef HAVE_USABLE_WCHAR_T
586    memcpy(unicode->str, w, size * sizeof(wchar_t));
587#else
588    {
589	register Py_UNICODE *u;
590	register Py_ssize_t i;
591	u = PyUnicode_AS_UNICODE(unicode);
592	for (i = size; i > 0; i--)
593	    *u++ = *w++;
594    }
595#endif
596
597    return (PyObject *)unicode;
598}
599
600static void
601makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
602{
603	*fmt++ = '%';
604	if (width) {
605		if (zeropad)
606			*fmt++ = '0';
607		fmt += sprintf(fmt, "%d", width);
608	}
609	if (precision)
610		fmt += sprintf(fmt, ".%d", precision);
611	if (longflag)
612		*fmt++ = 'l';
613	else if (size_tflag) {
614		char *f = PY_FORMAT_SIZE_T;
615		while (*f)
616			*fmt++ = *f++;
617	}
618	*fmt++ = c;
619	*fmt = '\0';
620}
621
622#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
623
624PyObject *
625PyUnicode_FromFormatV(const char *format, va_list vargs)
626{
627	va_list count;
628	Py_ssize_t callcount = 0;
629	PyObject **callresults = NULL;
630	PyObject **callresult = NULL;
631	Py_ssize_t n = 0;
632	int width = 0;
633	int precision = 0;
634	int zeropad;
635	const char* f;
636	Py_UNICODE *s;
637	PyObject *string;
638	/* used by sprintf */
639	char buffer[21];
640	/* use abuffer instead of buffer, if we need more space
641	 * (which can happen if there's a format specifier with width). */
642	char *abuffer = NULL;
643	char *realbuffer;
644	Py_ssize_t abuffersize = 0;
645	char fmt[60]; /* should be enough for %0width.precisionld */
646	const char *copy;
647
648#ifdef VA_LIST_IS_ARRAY
649	Py_MEMCPY(count, vargs, sizeof(va_list));
650#else
651#ifdef  __va_copy
652	__va_copy(count, vargs);
653#else
654	count = vargs;
655#endif
656#endif
657	/* step 1: count the number of %S/%R/%A format specifications
658	 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII() for
659	 * these objects once during step 3 and put the result in
660	   an array) */
661	for (f = format; *f; f++) {
662		if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A'))
663			++callcount;
664	}
665	/* step 2: allocate memory for the results of
666	 * PyObject_Str()/PyObject_Repr() calls */
667	if (callcount) {
668		callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
669		if (!callresults) {
670			PyErr_NoMemory();
671			return NULL;
672		}
673		callresult = callresults;
674	}
675	/* step 3: figure out how large a buffer we need */
676	for (f = format; *f; f++) {
677		if (*f == '%') {
678			const char* p = f;
679			width = 0;
680			while (ISDIGIT((unsigned)*f))
681				width = (width*10) + *f++ - '0';
682			while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
683				;
684
685			/* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
686			 * they don't affect the amount of space we reserve.
687			 */
688			if ((*f == 'l' || *f == 'z') &&
689					(f[1] == 'd' || f[1] == 'u'))
690                                ++f;
691
692			switch (*f) {
693			case 'c':
694				(void)va_arg(count, int);
695				/* fall through... */
696			case '%':
697				n++;
698				break;
699			case 'd': case 'u': case 'i': case 'x':
700				(void) va_arg(count, int);
701				/* 20 bytes is enough to hold a 64-bit
702				   integer.  Decimal takes the most space.
703				   This isn't enough for octal.
704				   If a width is specified we need more
705				   (which we allocate later). */
706				if (width < 20)
707					width = 20;
708				n += width;
709				if (abuffersize < width)
710					abuffersize = width;
711				break;
712			case 's':
713			{
714				/* UTF-8 */
715				unsigned char*s;
716				s = va_arg(count, unsigned char*);
717				while (*s) {
718					if (*s < 128) {
719						n++; s++;
720					} else if (*s < 0xc0) {
721						/* invalid UTF-8 */
722						n++; s++;
723					} else if (*s < 0xc0) {
724						n++;
725						s++; if(!*s)break;
726						s++;
727					} else if (*s < 0xe0) {
728						n++;
729						s++; if(!*s)break;
730						s++; if(!*s)break;
731						s++;
732					} else {
733						#ifdef Py_UNICODE_WIDE
734						n++;
735						#else
736						n+=2;
737						#endif
738						s++; if(!*s)break;
739						s++; if(!*s)break;
740						s++; if(!*s)break;
741						s++;
742					}
743				}
744				break;
745			}
746			case 'U':
747			{
748				PyObject *obj = va_arg(count, PyObject *);
749				assert(obj && PyUnicode_Check(obj));
750				n += PyUnicode_GET_SIZE(obj);
751				break;
752			}
753			case 'V':
754			{
755				PyObject *obj = va_arg(count, PyObject *);
756				const char *str = va_arg(count, const char *);
757				assert(obj || str);
758				assert(!obj || PyUnicode_Check(obj));
759				if (obj)
760					n += PyUnicode_GET_SIZE(obj);
761				else
762					n += strlen(str);
763				break;
764			}
765			case 'S':
766			{
767				PyObject *obj = va_arg(count, PyObject *);
768				PyObject *str;
769				assert(obj);
770				str = PyObject_Str(obj);
771				if (!str)
772					goto fail;
773				n += PyUnicode_GET_SIZE(str);
774				/* Remember the str and switch to the next slot */
775				*callresult++ = str;
776				break;
777			}
778			case 'R':
779			{
780				PyObject *obj = va_arg(count, PyObject *);
781				PyObject *repr;
782				assert(obj);
783				repr = PyObject_Repr(obj);
784				if (!repr)
785					goto fail;
786				n += PyUnicode_GET_SIZE(repr);
787				/* Remember the repr and switch to the next slot */
788				*callresult++ = repr;
789				break;
790			}
791			case 'A':
792			{
793				PyObject *obj = va_arg(count, PyObject *);
794				PyObject *ascii;
795				assert(obj);
796				ascii = PyObject_ASCII(obj);
797				if (!ascii)
798					goto fail;
799				n += PyUnicode_GET_SIZE(ascii);
800				/* Remember the repr and switch to the next slot */
801				*callresult++ = ascii;
802				break;
803			}
804			case 'p':
805				(void) va_arg(count, int);
806				/* maximum 64-bit pointer representation:
807				 * 0xffffffffffffffff
808				 * so 19 characters is enough.
809				 * XXX I count 18 -- what's the extra for?
810				 */
811				n += 19;
812				break;
813			default:
814				/* if we stumble upon an unknown
815				   formatting code, copy the rest of
816				   the format string to the output
817				   string. (we cannot just skip the
818				   code, since there's no way to know
819				   what's in the argument list) */
820				n += strlen(p);
821				goto expand;
822			}
823		} else
824			n++;
825	}
826 expand:
827	if (abuffersize > 20) {
828		abuffer = PyObject_Malloc(abuffersize);
829		if (!abuffer) {
830			PyErr_NoMemory();
831			goto fail;
832		}
833		realbuffer = abuffer;
834	}
835	else
836		realbuffer = buffer;
837	/* step 4: fill the buffer */
838	/* Since we've analyzed how much space we need for the worst case,
839	   we don't have to resize the string.
840	   There can be no errors beyond this point. */
841	string = PyUnicode_FromUnicode(NULL, n);
842	if (!string)
843		goto fail;
844
845	s = PyUnicode_AS_UNICODE(string);
846	callresult = callresults;
847
848	for (f = format; *f; f++) {
849		if (*f == '%') {
850			const char* p = f++;
851			int longflag = 0;
852			int size_tflag = 0;
853			zeropad = (*f == '0');
854			/* parse the width.precision part */
855			width = 0;
856			while (ISDIGIT((unsigned)*f))
857				width = (width*10) + *f++ - '0';
858			precision = 0;
859			if (*f == '.') {
860				f++;
861				while (ISDIGIT((unsigned)*f))
862					precision = (precision*10) + *f++ - '0';
863			}
864			/* handle the long flag, but only for %ld and %lu.
865			   others can be added when necessary. */
866			if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
867				longflag = 1;
868				++f;
869			}
870			/* handle the size_t flag. */
871			if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
872				size_tflag = 1;
873				++f;
874			}
875
876			switch (*f) {
877			case 'c':
878				*s++ = va_arg(vargs, int);
879				break;
880			case 'd':
881				makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
882				if (longflag)
883					sprintf(realbuffer, fmt, va_arg(vargs, long));
884				else if (size_tflag)
885					sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
886				else
887					sprintf(realbuffer, fmt, va_arg(vargs, int));
888				appendstring(realbuffer);
889				break;
890			case 'u':
891				makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
892				if (longflag)
893					sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
894				else if (size_tflag)
895					sprintf(realbuffer, fmt, va_arg(vargs, size_t));
896				else
897					sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
898				appendstring(realbuffer);
899				break;
900			case 'i':
901				makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
902				sprintf(realbuffer, fmt, va_arg(vargs, int));
903				appendstring(realbuffer);
904				break;
905			case 'x':
906				makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
907				sprintf(realbuffer, fmt, va_arg(vargs, int));
908				appendstring(realbuffer);
909				break;
910			case 's':
911			{
912				/* Parameter must be UTF-8 encoded.
913				   In case of encoding errors, use
914				   the replacement character. */
915				PyObject *u;
916				p = va_arg(vargs, char*);
917				u = PyUnicode_DecodeUTF8(p, strlen(p),
918							 "replace");
919				if (!u)
920					goto fail;
921				Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
922						PyUnicode_GET_SIZE(u));
923				s += PyUnicode_GET_SIZE(u);
924				Py_DECREF(u);
925				break;
926			}
927			case 'U':
928			{
929				PyObject *obj = va_arg(vargs, PyObject *);
930				Py_ssize_t size = PyUnicode_GET_SIZE(obj);
931				Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
932				s += size;
933				break;
934			}
935			case 'V':
936			{
937				PyObject *obj = va_arg(vargs, PyObject *);
938				const char *str = va_arg(vargs, const char *);
939				if (obj) {
940					Py_ssize_t size = PyUnicode_GET_SIZE(obj);
941					Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
942					s += size;
943				} else {
944					appendstring(str);
945				}
946				break;
947			}
948			case 'S':
949			case 'R':
950			{
951				Py_UNICODE *ucopy;
952				Py_ssize_t usize;
953				Py_ssize_t upos;
954				/* unused, since we already have the result */
955				(void) va_arg(vargs, PyObject *);
956				ucopy = PyUnicode_AS_UNICODE(*callresult);
957				usize = PyUnicode_GET_SIZE(*callresult);
958				for (upos = 0; upos<usize;)
959					*s++ = ucopy[upos++];
960				/* We're done with the unicode()/repr() => forget it */
961				Py_DECREF(*callresult);
962				/* switch to next unicode()/repr() result */
963				++callresult;
964				break;
965			}
966			case 'p':
967				sprintf(buffer, "%p", va_arg(vargs, void*));
968				/* %p is ill-defined:  ensure leading 0x. */
969				if (buffer[1] == 'X')
970					buffer[1] = 'x';
971				else if (buffer[1] != 'x') {
972					memmove(buffer+2, buffer, strlen(buffer)+1);
973					buffer[0] = '0';
974					buffer[1] = 'x';
975				}
976				appendstring(buffer);
977				break;
978			case '%':
979				*s++ = '%';
980				break;
981			default:
982				appendstring(p);
983				goto end;
984			}
985		} else
986			*s++ = *f;
987	}
988
989 end:
990	if (callresults)
991		PyObject_Free(callresults);
992	if (abuffer)
993		PyObject_Free(abuffer);
994	PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
995	return string;
996 fail:
997	if (callresults) {
998		PyObject **callresult2 = callresults;
999		while (callresult2 < callresult) {
1000			Py_DECREF(*callresult2);
1001			++callresult2;
1002		}
1003		PyObject_Free(callresults);
1004	}
1005	if (abuffer)
1006		PyObject_Free(abuffer);
1007	return NULL;
1008}
1009
1010#undef appendstring
1011
1012PyObject *
1013PyUnicode_FromFormat(const char *format, ...)
1014{
1015	PyObject* ret;
1016	va_list vargs;
1017
1018#ifdef HAVE_STDARG_PROTOTYPES
1019	va_start(vargs, format);
1020#else
1021	va_start(vargs);
1022#endif
1023	ret = PyUnicode_FromFormatV(format, vargs);
1024	va_end(vargs);
1025	return ret;
1026}
1027
1028Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1029				wchar_t *w,
1030				Py_ssize_t size)
1031{
1032    if (unicode == NULL) {
1033	PyErr_BadInternalCall();
1034	return -1;
1035    }
1036
1037    /* If possible, try to copy the 0-termination as well */
1038    if (size > PyUnicode_GET_SIZE(unicode))
1039	size = PyUnicode_GET_SIZE(unicode) + 1;
1040
1041#ifdef HAVE_USABLE_WCHAR_T
1042    memcpy(w, unicode->str, size * sizeof(wchar_t));
1043#else
1044    {
1045	register Py_UNICODE *u;
1046	register Py_ssize_t i;
1047	u = PyUnicode_AS_UNICODE(unicode);
1048	for (i = size; i > 0; i--)
1049	    *w++ = *u++;
1050    }
1051#endif
1052
1053    if (size > PyUnicode_GET_SIZE(unicode))
1054        return PyUnicode_GET_SIZE(unicode);
1055    else
1056    return size;
1057}
1058
1059#endif
1060
1061PyObject *PyUnicode_FromOrdinal(int ordinal)
1062{
1063    Py_UNICODE s[2];
1064
1065    if (ordinal < 0 || ordinal > 0x10ffff) {
1066	PyErr_SetString(PyExc_ValueError,
1067			"chr() arg not in range(0x110000)");
1068	return NULL;
1069    }
1070
1071#ifndef Py_UNICODE_WIDE
1072    if (ordinal > 0xffff) {
1073        ordinal -= 0x10000;
1074        s[0] = 0xD800 | (ordinal >> 10);
1075        s[1] = 0xDC00 | (ordinal & 0x3FF);
1076        return PyUnicode_FromUnicode(s, 2);
1077    }
1078#endif
1079
1080    s[0] = (Py_UNICODE)ordinal;
1081    return PyUnicode_FromUnicode(s, 1);
1082}
1083
1084PyObject *PyUnicode_FromObject(register PyObject *obj)
1085{
1086    /* XXX Perhaps we should make this API an alias of
1087           PyObject_Str() instead ?! */
1088    if (PyUnicode_CheckExact(obj)) {
1089	Py_INCREF(obj);
1090	return obj;
1091    }
1092    if (PyUnicode_Check(obj)) {
1093	/* For a Unicode subtype that's not a Unicode object,
1094	   return a true Unicode object with the same data. */
1095	return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1096				     PyUnicode_GET_SIZE(obj));
1097    }
1098    PyErr_Format(PyExc_TypeError,
1099                 "Can't convert '%.100s' object to str implicitly",
1100                 Py_TYPE(obj)->tp_name);
1101    return NULL;
1102}
1103
1104PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1105				      const char *encoding,
1106				      const char *errors)
1107{
1108    const char *s = NULL;
1109    Py_ssize_t len;
1110    PyObject *v;
1111
1112    if (obj == NULL) {
1113	PyErr_BadInternalCall();
1114	return NULL;
1115    }
1116
1117    if (PyUnicode_Check(obj)) {
1118	PyErr_SetString(PyExc_TypeError,
1119			"decoding str is not supported");
1120	return NULL;
1121	}
1122
1123    /* Coerce object */
1124    if (PyBytes_Check(obj)) {
1125        s = PyBytes_AS_STRING(obj);
1126        len = PyBytes_GET_SIZE(obj);
1127    }
1128    else if (PyByteArray_Check(obj)) {
1129        s = PyByteArray_AS_STRING(obj);
1130        len = PyByteArray_GET_SIZE(obj);
1131    }
1132    else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1133	/* Overwrite the error message with something more useful in
1134	   case of a TypeError. */
1135	if (PyErr_ExceptionMatches(PyExc_TypeError))
1136            PyErr_Format(PyExc_TypeError,
1137			 "coercing to str: need string or buffer, "
1138			 "%.80s found",
1139		     Py_TYPE(obj)->tp_name);
1140	goto onError;
1141    }
1142
1143    /* Convert to Unicode */
1144    if (len == 0) {
1145	Py_INCREF(unicode_empty);
1146	v = (PyObject *)unicode_empty;
1147    }
1148    else
1149	v = PyUnicode_Decode(s, len, encoding, errors);
1150
1151    return v;
1152
1153 onError:
1154    return NULL;
1155}
1156
1157PyObject *PyUnicode_Decode(const char *s,
1158			   Py_ssize_t size,
1159			   const char *encoding,
1160			   const char *errors)
1161{
1162    PyObject *buffer = NULL, *unicode;
1163    Py_buffer info;
1164    char lower[20];  /* Enough for any encoding name we recognize */
1165    char *l;
1166    const char *e;
1167
1168    if (encoding == NULL)
1169        encoding = PyUnicode_GetDefaultEncoding();
1170
1171    /* Convert encoding to lower case and replace '_' with '-' in order to
1172       catch e.g. UTF_8 */
1173    e = encoding;
1174    l = lower;
1175    while (*e && l < &lower[(sizeof lower) - 2]) {
1176        if (ISUPPER(*e)) {
1177            *l++ = TOLOWER(*e++);
1178        }
1179        else if (*e == '_') {
1180            *l++ = '-';
1181            e++;
1182        }
1183        else {
1184            *l++ = *e++;
1185        }
1186    }
1187    *l = '\0';
1188
1189    /* Shortcuts for common default encodings */
1190    if (strcmp(lower, "utf-8") == 0)
1191        return PyUnicode_DecodeUTF8(s, size, errors);
1192    else if ((strcmp(lower, "latin-1") == 0) ||
1193             (strcmp(lower, "iso-8859-1") == 0))
1194        return PyUnicode_DecodeLatin1(s, size, errors);
1195#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1196    else if (strcmp(lower, "mbcs") == 0)
1197        return PyUnicode_DecodeMBCS(s, size, errors);
1198#endif
1199    else if (strcmp(lower, "ascii") == 0)
1200        return PyUnicode_DecodeASCII(s, size, errors);
1201    else if (strcmp(lower, "utf-16") == 0)
1202        return PyUnicode_DecodeUTF16(s, size, errors, 0);
1203    else if (strcmp(lower, "utf-32") == 0)
1204        return PyUnicode_DecodeUTF32(s, size, errors, 0);
1205
1206    /* Decode via the codec registry */
1207    buffer = NULL;
1208    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
1209        goto onError;
1210    buffer = PyMemoryView_FromBuffer(&info);
1211    if (buffer == NULL)
1212        goto onError;
1213    unicode = PyCodec_Decode(buffer, encoding, errors);
1214    if (unicode == NULL)
1215        goto onError;
1216    if (!PyUnicode_Check(unicode)) {
1217        PyErr_Format(PyExc_TypeError,
1218                     "decoder did not return a str object (type=%.400s)",
1219                     Py_TYPE(unicode)->tp_name);
1220        Py_DECREF(unicode);
1221        goto onError;
1222    }
1223    Py_DECREF(buffer);
1224    return unicode;
1225
1226 onError:
1227    Py_XDECREF(buffer);
1228    return NULL;
1229}
1230
1231PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1232                                    const char *encoding,
1233                                    const char *errors)
1234{
1235    PyObject *v;
1236
1237    if (!PyUnicode_Check(unicode)) {
1238        PyErr_BadArgument();
1239        goto onError;
1240    }
1241
1242    if (encoding == NULL)
1243	encoding = PyUnicode_GetDefaultEncoding();
1244
1245    /* Decode via the codec registry */
1246    v = PyCodec_Decode(unicode, encoding, errors);
1247    if (v == NULL)
1248        goto onError;
1249    return v;
1250
1251 onError:
1252    return NULL;
1253}
1254
1255PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1256                                     const char *encoding,
1257                                     const char *errors)
1258{
1259    PyObject *v;
1260
1261    if (!PyUnicode_Check(unicode)) {
1262        PyErr_BadArgument();
1263        goto onError;
1264    }
1265
1266    if (encoding == NULL)
1267	encoding = PyUnicode_GetDefaultEncoding();
1268
1269    /* Decode via the codec registry */
1270    v = PyCodec_Decode(unicode, encoding, errors);
1271    if (v == NULL)
1272        goto onError;
1273    if (!PyUnicode_Check(v)) {
1274        PyErr_Format(PyExc_TypeError,
1275                     "decoder did not return a str object (type=%.400s)",
1276                     Py_TYPE(v)->tp_name);
1277        Py_DECREF(v);
1278        goto onError;
1279    }
1280    return v;
1281
1282 onError:
1283    return NULL;
1284}
1285
1286PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1287			   Py_ssize_t size,
1288			   const char *encoding,
1289			   const char *errors)
1290{
1291    PyObject *v, *unicode;
1292
1293    unicode = PyUnicode_FromUnicode(s, size);
1294    if (unicode == NULL)
1295	return NULL;
1296    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1297    Py_DECREF(unicode);
1298    return v;
1299}
1300
1301PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1302                                    const char *encoding,
1303                                    const char *errors)
1304{
1305    PyObject *v;
1306
1307    if (!PyUnicode_Check(unicode)) {
1308        PyErr_BadArgument();
1309        goto onError;
1310    }
1311
1312    if (encoding == NULL)
1313	encoding = PyUnicode_GetDefaultEncoding();
1314
1315    /* Encode via the codec registry */
1316    v = PyCodec_Encode(unicode, encoding, errors);
1317    if (v == NULL)
1318        goto onError;
1319    return v;
1320
1321 onError:
1322    return NULL;
1323}
1324
1325PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1326                                    const char *encoding,
1327                                    const char *errors)
1328{
1329    PyObject *v;
1330
1331    if (!PyUnicode_Check(unicode)) {
1332        PyErr_BadArgument();
1333        return NULL;
1334    }
1335
1336    if (encoding == NULL)
1337	encoding = PyUnicode_GetDefaultEncoding();
1338
1339    /* Shortcuts for common default encodings */
1340    if (errors == NULL) {
1341	if (strcmp(encoding, "utf-8") == 0)
1342	    return PyUnicode_AsUTF8String(unicode);
1343	else if (strcmp(encoding, "latin-1") == 0)
1344	    return PyUnicode_AsLatin1String(unicode);
1345#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1346	else if (strcmp(encoding, "mbcs") == 0)
1347	    return PyUnicode_AsMBCSString(unicode);
1348#endif
1349	else if (strcmp(encoding, "ascii") == 0)
1350	    return PyUnicode_AsASCIIString(unicode);
1351        /* During bootstrap, we may need to find the encodings
1352           package, to load the file system encoding, and require the
1353           file system encoding in order to load the encodings
1354           package.
1355
1356           Break out of this dependency by assuming that the path to
1357           the encodings module is ASCII-only.  XXX could try wcstombs
1358           instead, if the file system encoding is the locale's
1359           encoding. */
1360        else if (Py_FileSystemDefaultEncoding &&
1361                 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1362                 !PyThreadState_GET()->interp->codecs_initialized)
1363	    return PyUnicode_AsASCIIString(unicode);
1364    }
1365
1366    /* Encode via the codec registry */
1367    v = PyCodec_Encode(unicode, encoding, errors);
1368    if (v == NULL)
1369        return NULL;
1370
1371    /* The normal path */
1372    if (PyBytes_Check(v))
1373        return v;
1374
1375    /* If the codec returns a buffer, raise a warning and convert to bytes */
1376    if (PyByteArray_Check(v)) {
1377        char msg[100];
1378        PyObject *b;
1379        PyOS_snprintf(msg, sizeof(msg),
1380                      "encoder %s returned buffer instead of bytes",
1381                      encoding);
1382        if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
1383            Py_DECREF(v);
1384            return NULL;
1385        }
1386
1387        b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1388        Py_DECREF(v);
1389        return b;
1390    }
1391
1392    PyErr_Format(PyExc_TypeError,
1393                 "encoder did not return a bytes object (type=%.400s)",
1394                 Py_TYPE(v)->tp_name);
1395    Py_DECREF(v);
1396    return NULL;
1397}
1398
1399PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1400                                     const char *encoding,
1401                                     const char *errors)
1402{
1403    PyObject *v;
1404
1405    if (!PyUnicode_Check(unicode)) {
1406        PyErr_BadArgument();
1407        goto onError;
1408    }
1409
1410    if (encoding == NULL)
1411	encoding = PyUnicode_GetDefaultEncoding();
1412
1413    /* Encode via the codec registry */
1414    v = PyCodec_Encode(unicode, encoding, errors);
1415    if (v == NULL)
1416        goto onError;
1417    if (!PyUnicode_Check(v)) {
1418        PyErr_Format(PyExc_TypeError,
1419                     "encoder did not return an str object (type=%.400s)",
1420                     Py_TYPE(v)->tp_name);
1421        Py_DECREF(v);
1422        goto onError;
1423    }
1424    return v;
1425
1426 onError:
1427    return NULL;
1428}
1429
1430PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1431					    const char *errors)
1432{
1433    PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1434    if (v)
1435        return v;
1436    if (errors != NULL)
1437        Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
1438    v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1439                             PyUnicode_GET_SIZE(unicode),
1440                             NULL);
1441    if (!v)
1442        return NULL;
1443    ((PyUnicodeObject *)unicode)->defenc = v;
1444    return v;
1445}
1446
1447PyObject*
1448PyUnicode_DecodeFSDefault(const char *s) {
1449    Py_ssize_t size = (Py_ssize_t)strlen(s);
1450    return PyUnicode_DecodeFSDefaultAndSize(s, size);
1451}
1452
1453PyObject*
1454PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1455{
1456    /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1457       can be undefined. If it is case, decode using UTF-8. The following assumes
1458       that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1459       bootstrapping process where the codecs aren't ready yet.
1460    */
1461    if (Py_FileSystemDefaultEncoding) {
1462#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1463        if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
1464            return PyUnicode_DecodeMBCS(s, size, "replace");
1465        }
1466#elif defined(__APPLE__)
1467        if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
1468            return PyUnicode_DecodeUTF8(s, size, "replace");
1469        }
1470#endif
1471        return PyUnicode_Decode(s, size,
1472                                Py_FileSystemDefaultEncoding,
1473                                "replace");
1474    }
1475    else {
1476        return PyUnicode_DecodeUTF8(s, size, "replace");
1477    }
1478}
1479
1480char*
1481_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
1482{
1483    PyObject *bytes;
1484    if (!PyUnicode_Check(unicode)) {
1485        PyErr_BadArgument();
1486        return NULL;
1487    }
1488    bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1489    if (bytes == NULL)
1490        return NULL;
1491    if (psize != NULL)
1492        *psize = PyBytes_GET_SIZE(bytes);
1493    return PyBytes_AS_STRING(bytes);
1494}
1495
1496char*
1497_PyUnicode_AsString(PyObject *unicode)
1498{
1499    return _PyUnicode_AsStringAndSize(unicode, NULL);
1500}
1501
1502Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1503{
1504    if (!PyUnicode_Check(unicode)) {
1505        PyErr_BadArgument();
1506        goto onError;
1507    }
1508    return PyUnicode_AS_UNICODE(unicode);
1509
1510 onError:
1511    return NULL;
1512}
1513
1514Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1515{
1516    if (!PyUnicode_Check(unicode)) {
1517        PyErr_BadArgument();
1518        goto onError;
1519    }
1520    return PyUnicode_GET_SIZE(unicode);
1521
1522 onError:
1523    return -1;
1524}
1525
1526const char *PyUnicode_GetDefaultEncoding(void)
1527{
1528    return unicode_default_encoding;
1529}
1530
1531int PyUnicode_SetDefaultEncoding(const char *encoding)
1532{
1533    if (strcmp(encoding, unicode_default_encoding) != 0) {
1534        PyErr_Format(PyExc_ValueError,
1535                     "Can only set default encoding to %s",
1536                     unicode_default_encoding);
1537        return -1;
1538    }
1539    return 0;
1540}
1541
1542/* error handling callback helper:
1543   build arguments, call the callback and check the arguments,
1544   if no exception occurred, copy the replacement to the output
1545   and adjust various state variables.
1546   return 0 on success, -1 on error
1547*/
1548
1549static
1550int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1551                 const char *encoding, const char *reason,
1552                 const char **input, const char **inend, Py_ssize_t *startinpos,
1553                 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1554                 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
1555{
1556    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
1557
1558    PyObject *restuple = NULL;
1559    PyObject *repunicode = NULL;
1560    Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1561    Py_ssize_t insize;
1562    Py_ssize_t requiredsize;
1563    Py_ssize_t newpos;
1564    Py_UNICODE *repptr;
1565    PyObject *inputobj = NULL;
1566    Py_ssize_t repsize;
1567    int res = -1;
1568
1569    if (*errorHandler == NULL) {
1570	*errorHandler = PyCodec_LookupError(errors);
1571	if (*errorHandler == NULL)
1572	   goto onError;
1573    }
1574
1575    if (*exceptionObject == NULL) {
1576    	*exceptionObject = PyUnicodeDecodeError_Create(
1577	    encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
1578	if (*exceptionObject == NULL)
1579	   goto onError;
1580    }
1581    else {
1582	if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1583	    goto onError;
1584	if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1585	    goto onError;
1586	if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1587	    goto onError;
1588    }
1589
1590    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1591    if (restuple == NULL)
1592	goto onError;
1593    if (!PyTuple_Check(restuple)) {
1594	PyErr_Format(PyExc_TypeError, &argparse[4]);
1595	goto onError;
1596    }
1597    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1598	goto onError;
1599
1600    /* Copy back the bytes variables, which might have been modified by the
1601       callback */
1602    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1603    if (!inputobj)
1604        goto onError;
1605    if (!PyBytes_Check(inputobj)) {
1606	PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1607    }
1608    *input = PyBytes_AS_STRING(inputobj);
1609    insize = PyBytes_GET_SIZE(inputobj);
1610    *inend = *input + insize;
1611    /* we can DECREF safely, as the exception has another reference,
1612       so the object won't go away. */
1613    Py_DECREF(inputobj);
1614
1615    if (newpos<0)
1616	newpos = insize+newpos;
1617    if (newpos<0 || newpos>insize) {
1618	PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1619	goto onError;
1620    }
1621
1622    /* need more space? (at least enough for what we
1623       have+the replacement+the rest of the string (starting
1624       at the new input position), so we won't have to check space
1625       when there are no errors in the rest of the string) */
1626    repptr = PyUnicode_AS_UNICODE(repunicode);
1627    repsize = PyUnicode_GET_SIZE(repunicode);
1628    requiredsize = *outpos + repsize + insize-newpos;
1629    if (requiredsize > outsize) {
1630	if (requiredsize<2*outsize)
1631	    requiredsize = 2*outsize;
1632	if (_PyUnicode_Resize(output, requiredsize) < 0)
1633	    goto onError;
1634	*outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1635    }
1636    *endinpos = newpos;
1637    *inptr = *input + newpos;
1638    Py_UNICODE_COPY(*outptr, repptr, repsize);
1639    *outptr += repsize;
1640    *outpos += repsize;
1641
1642    /* we made it! */
1643    res = 0;
1644
1645    onError:
1646    Py_XDECREF(restuple);
1647    return res;
1648}
1649
1650/* --- UTF-7 Codec -------------------------------------------------------- */
1651
1652/* see RFC2152 for details */
1653
static
char utf7_special[128] = {
    /* Classification of each 7-bit character for the UTF-7 codec,
       indexed by character value (16 entries per row).  The value
       indicates whether the character is special, i.e. cannot be
       directly encoded:
	   0 - not special
	   1 - special
	   2 - whitespace (optional)
	   3 - RFC2152 Set O (optional)
       Read through the SPECIAL() macro. */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,   /* 0x00 - 0x0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x10 - 0x1F */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,   /* 0x20 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,   /* 0x30 - 0x3F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,   /* 0x50 - 0x5F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,   /* 0x70 - 0x7F */

};
1672
/* Helper macros for the UTF-7 codec.  All of them are #undef'ed again
   after the codec functions.

   SPECIAL(c, encodeO, encodeWS): true if c must be base64-encoded.
   Characters outside 1..127 are always special; the optional classes
   in utf7_special[] are special only when the matching flag is set.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true  */

#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* B64(n): base64 digit for the low six bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* B64CHAR(c): true if c is a base64 digit. */
#define B64CHAR(c) \
    (ISALNUM(c) || (c) == '+' || (c) == '/')
/* UB64(c): value of base64 digit c; only meaningful when B64CHAR(c). */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least six bits are pending in ch,
   emit the topmost six as a base64 digit through out and reduce bits.
   out and bits are modified in place.  Wrapped in do { } while (0) so
   the macro behaves as a single statement (e.g. under if/else). */
#define ENCODE(out, ch, bits)                           \
    do {                                                \
        while ((bits) >= 6) {                           \
            *(out)++ = B64((ch) >> ((bits)-6));         \
            (bits) -= 6;                                \
        }                                               \
    } while (0)

/* DECODE(out, ch, bits, surrogate): while at least sixteen bits are
   pending in ch, extract the topmost sixteen as one Py_UNICODE and
   store it through out.  out, bits and surrogate are modified in
   place.  The expansion assigns to a variable named errmsg and jumps to
   a label named utf7Error, both of which must exist at the point of
   use.  Wrapped in do { } while (0) so the macro behaves as a single
   statement. */
#define DECODE(out, ch, bits, surrogate)                                \
    do {                                                                \
        while ((bits) >= 16) {                                          \
            Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff); \
            (bits) -= 16;                                               \
            if (surrogate) {                                            \
                /* We have already generated an error for the high */   \
                /* surrogate so let's not bother seeing if the low */   \
                /* surrogate is correct or not */                       \
                (surrogate) = 0;                                        \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {            \
                /* This is a surrogate pair. Unfortunately we can't */  \
                /* represent it in a 16-bit character */                \
                (surrogate) = 1;                                        \
                errmsg = "code pairs are not supported";                \
                goto utf7Error;                                         \
            } else {                                                    \
                *(out)++ = outCh;                                       \
            }                                                           \
        }                                                               \
    } while (0)
1715
1716PyObject *PyUnicode_DecodeUTF7(const char *s,
1717			       Py_ssize_t size,
1718			       const char *errors)
1719{
1720    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1721}
1722
/* Decode size bytes of UTF-7 data at s into a new Unicode object.

   errors:   error handler name, forwarded to
             unicode_decode_call_errorhandler().
   consumed: if NULL, input that ends while a shift sequence is still
             open is reported to the error handler as "unterminated
             shift sequence".  If non-NULL, no such error is raised and
             *consumed is set to startinpos when the input ends inside a
             shift sequence, or to the number of bytes read otherwise.

   The output object is first allocated with room for size code units
   and resized to the number actually written before returning.
   Returns NULL on failure. */
PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
			       Py_ssize_t size,
			       const char *errors,
			       Py_ssize_t *consumed)
{
    const char *starts = s;         /* start of the input */
    Py_ssize_t startinpos;          /* start offset reported to the error handler */
    Py_ssize_t endinpos;            /* end offset reported to the error handler */
    Py_ssize_t outpos;              /* write index into the output */
    const char *e;                  /* end of the input */
    PyUnicodeObject *unicode;
    Py_UNICODE *p;                  /* write pointer into the output */
    const char *errmsg = "";
    int inShift = 0;                /* non-zero while inside a '+' shift sequence */
    unsigned int bitsleft = 0;      /* number of pending bits in charsleft */
    unsigned long charsleft = 0;    /* accumulator for base64-decoded bits */
    int surrogate = 0;              /* state flag maintained by DECODE() */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    p = unicode->str;
    e = s + size;

    while (s < e) {
        Py_UNICODE ch;
        /* Also entered by goto from the unterminated-shift handling
           after the loop, when the error handler left s before e. */
        restart:
        ch = (unsigned char) *s;

        if (inShift) {
            if ((ch == '-') || !B64CHAR(ch)) {
                /* '-' or any non-base64 character ends the shift
                   sequence. */
                inShift = 0;
                s++;

                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
                if (bitsleft >= 6) {
                    /* The shift sequence has a partial character in it. If
                       bitsleft < 6 then we could just classify it as padding
                       but that is not the case here */

                    errmsg = "partial character in shift sequence";
                    goto utf7Error;
                }
                /* According to RFC2152 the remaining bits should be zero. We
                   choose to signal an error/insert a replacement character
                   here to indicate the potential of a misencoded character. */

                /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
                if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
                    errmsg = "non-zero padding bits in shift sequence";
                    goto utf7Error;
                }

                if (ch == '-') {
                    /* The terminating '-' itself produces no output.  If
                       the next byte is also '-', a '-' is written and
                       shift mode is re-entered without consuming it. */
                    if ((s < e) && (*(s) == '-')) {
                        *p++ = '-';
                        inShift = 1;
                    }
                } else if (SPECIAL(ch,0,0)) {
                    errmsg = "unexpected special character";
	                goto utf7Error;
                } else  {
                    /* The terminating character is ordinary output. */
                    *p++ = ch;
                }
            } else {
                /* Base64 digit: accumulate six more bits and emit any
                   complete 16-bit units. */
                charsleft = (charsleft << 6) | UB64(ch);
                bitsleft += 6;
                s++;
                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
            }
        }
        else if ( ch == '+' ) {
            startinpos = s-starts;
            s++;
            if (s < e && *s == '-') {
                /* "+-" stands for a literal '+'. */
                s++;
                *p++ = '+';
            } else
            {
                inShift = 1;
                bitsleft = 0;
            }
        }
        else if (SPECIAL(ch,0,0)) {
            startinpos = s-starts;
            errmsg = "unexpected special character";
            s++;
            goto utf7Error;
        }
        else {
            /* Directly encoded character. */
            *p++ = ch;
            s++;
        }
        continue;
    utf7Error:
        /* Hand the error to the handler; it may resize the output and
           update starts, e, s and p through the pointers passed in. */
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler(
             errors, &errorHandler,
             "utf7", errmsg,
             &starts, &e, &startinpos, &endinpos, &exc, &s,
             &unicode, &outpos, &p))
        goto onError;
    }

    /* Input exhausted inside a shift sequence and the caller did not ask
       for stateful decoding: report it as an error. */
    if (inShift && !consumed) {
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = size;
        if (unicode_decode_call_errorhandler(
             errors, &errorHandler,
             "utf7", "unterminated shift sequence",
             &starts, &e, &startinpos, &endinpos, &exc, &s,
             &unicode, &outpos, &p))
            goto onError;
        if (s < e)
           goto restart;
    }
    if (consumed) {
        if(inShift)
            *consumed = startinpos;
        else
            *consumed = s-starts;
    }

    /* Trim the output to the number of code units actually written. */
    if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
1868
1869
1870PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1871                   Py_ssize_t size,
1872                   int encodeSetO,
1873                   int encodeWhiteSpace,
1874                   const char *errors)
1875{
1876    PyObject *v;
1877    /* It might be possible to tighten this worst case */
1878    Py_ssize_t cbAllocated = 5 * size;
1879    int inShift = 0;
1880    Py_ssize_t i = 0;
1881    unsigned int bitsleft = 0;
1882    unsigned long charsleft = 0;
1883    char * out;
1884    char * start;
1885
1886    if (size == 0)
1887       return PyBytes_FromStringAndSize(NULL, 0);
1888
1889    if (cbAllocated / 5 != size)
1890        return PyErr_NoMemory();
1891
1892    v = PyBytes_FromStringAndSize(NULL, cbAllocated);
1893    if (v == NULL)
1894        return NULL;
1895
1896    start = out = PyBytes_AS_STRING(v);
1897    for (;i < size; ++i) {
1898        Py_UNICODE ch = s[i];
1899
1900        if (!inShift) {
1901            if (ch == '+') {
1902                *out++ = '+';
1903                *out++ = '-';
1904            } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1905                charsleft = ch;
1906                bitsleft = 16;
1907                *out++ = '+';
1908                /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1909                inShift = bitsleft > 0;
1910            } else {
1911                *out++ = (char) ch;
1912            }
1913        } else {
1914            if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1915                *out++ = B64(charsleft << (6-bitsleft));
1916                charsleft = 0;
1917                bitsleft = 0;
1918                /* Characters not in the BASE64 set implicitly unshift the sequence
1919                   so no '-' is required, except if the character is itself a '-' */
1920                if (B64CHAR(ch) || ch == '-') {
1921                    *out++ = '-';
1922                }
1923                inShift = 0;
1924                *out++ = (char) ch;
1925            } else {
1926                bitsleft += 16;
1927                charsleft = (charsleft << 16) | ch;
1928                /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1929
1930                /* If the next character is special then we dont' need to terminate
1931                   the shift sequence. If the next character is not a BASE64 character
1932                   or '-' then the shift sequence will be terminated implicitly and we
1933                   don't have to insert a '-'. */
1934
1935                if (bitsleft == 0) {
1936                    if (i + 1 < size) {
1937                        Py_UNICODE ch2 = s[i+1];
1938
1939                        if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
1940
1941                        } else if (B64CHAR(ch2) || ch2 == '-') {
1942                            *out++ = '-';
1943                            inShift = 0;
1944                        } else {
1945                            inShift = 0;
1946                        }
1947
1948                    }
1949                    else {
1950                        *out++ = '-';
1951                        inShift = 0;
1952                    }
1953                }
1954            }
1955        }
1956    }
1957    if (bitsleft) {
1958        *out++= B64(charsleft << (6-bitsleft) );
1959        *out++ = '-';
1960    }
1961    if (_PyBytes_Resize(&v, out - start) < 0)
1962        return NULL;
1963    return v;
1964}
1965
1966#undef SPECIAL
1967#undef B64
1968#undef B64CHAR
1969#undef UB64
1970#undef ENCODE
1971#undef DECODE
1972
1973/* --- UTF-8 Codec -------------------------------------------------------- */
1974
static
char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details.
       Indexed by the value of the first byte, 16 entries per row. */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x00 - 0x0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x10 - 0x1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x20 - 0x2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x30 - 0x3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x40 - 0x4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x50 - 0x5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x60 - 0x6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x80 - 0x8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x90 - 0x9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0xA0 - 0xAF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0xB0 - 0xBF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   /* 0xC0 - 0xCF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   /* 0xD0 - 0xDF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,   /* 0xE0 - 0xEF */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0    /* 0xF0 - 0xFF */
};
1996
1997PyObject *PyUnicode_DecodeUTF8(const char *s,
1998			       Py_ssize_t size,
1999			       const char *errors)
2000{
2001    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2002}
2003
2004/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2005#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2006
2007/* Mask to quickly check whether a C 'long' contains a
2008   non-ASCII, UTF8-encoded char. */
2009#if (SIZEOF_LONG == 8)
2010# define ASCII_CHAR_MASK 0x8080808080808080L
2011#elif (SIZEOF_LONG == 4)
2012# define ASCII_CHAR_MASK 0x80808080L
2013#else
2014# error C 'long' size should be either 4 or 8!
2015#endif
2016
/* Decode `size` bytes of UTF-8 starting at `s` into a new unicode object.

   s        - start of the encoded input.
   size     - number of input bytes.
   errors   - error handling name, passed through unchanged to
              unicode_decode_call_errorhandler().
   consumed - may be NULL.  When NULL, a multi-byte sequence that is cut
              off by the end of the input is reported as "unexpected end
              of data".  When non-NULL, decoding simply stops in front of
              such a sequence and *consumed receives the number of input
              bytes that were processed.

   Returns the decoded object, or NULL if the initial allocation, the
   error handler call or the final resize fails. */
PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
			                Py_ssize_t size,
			                const char *errors,
			                Py_ssize_t *consumed)
{
    const char *starts = s;
    int n;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e, *aligned_end;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    /* Unpack UTF-8 encoded data */
    p = unicode->str;
    e = s + size;
    /* Last 'long'-aligned address that is <= e; bounds the word-wise
       fast path below. */
    aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);

    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        if (ch < 0x80) {
            /* Fast path for runs of ASCII characters. Given that common UTF-8
               input will consist of an overwhelming majority of ASCII
               characters, we try to optimize for this case by checking
               as many characters as a C 'long' can contain.
               First, check if we can do an aligned read, as most CPUs have
               a penalty for unaligned reads.
            */
            if (!((size_t) s & LONG_PTR_MASK)) {
                /* Help register allocation */
                register const char *_s = s;
                register Py_UNICODE *_p = p;
                while (_s < aligned_end) {
                    /* Read a whole long at a time (either 4 or 8 bytes),
                       and do a fast unrolled copy if it only contains ASCII
                       characters. */
                    unsigned long data = *(unsigned long *) _s;
                    if (data & ASCII_CHAR_MASK)
                        break;
                    _p[0] = (unsigned char) _s[0];
                    _p[1] = (unsigned char) _s[1];
                    _p[2] = (unsigned char) _s[2];
                    _p[3] = (unsigned char) _s[3];
#if (SIZEOF_LONG == 8)
                    _p[4] = (unsigned char) _s[4];
                    _p[5] = (unsigned char) _s[5];
                    _p[6] = (unsigned char) _s[6];
                    _p[7] = (unsigned char) _s[7];
#endif
                    _s += SIZEOF_LONG;
                    _p += SIZEOF_LONG;
                }
                s = _s;
                p = _p;
                if (s == e)
                    break;
                /* The fast path may have advanced s; reload the current
                   byte before falling through to the generic code. */
                ch = (unsigned char)*s;
            }
        }

        if (ch < 0x80) {
            *p++ = (Py_UNICODE)ch;
            s++;
            continue;
        }

        /* Sequence length as given by the lead byte (0 marks a byte that
           cannot start a sequence). */
        n = utf8_code_length[ch];

        if (s + n > e) {
	    if (consumed)
		break;
	    else {
		errmsg = "unexpected end of data";
		startinpos = s-starts;
		endinpos = size;
		goto utf8Error;
	    }
	}

        switch (n) {

        case 0:
            errmsg = "unexpected code byte";
	    startinpos = s-starts;
	    endinpos = startinpos+1;
	    goto utf8Error;

        case 1:
            /* Bytes below 0x80 were already handled above, so a length
               of 1 is not expected here. */
            errmsg = "internal error";
	    startinpos = s-starts;
	    endinpos = startinpos+1;
	    goto utf8Error;

        case 2:
            if ((s[1] & 0xc0) != 0x80) {
                errmsg = "invalid data";
		startinpos = s-starts;
		endinpos = startinpos+2;
		goto utf8Error;
	    }
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            /* Reject overlong forms: values below 0x80 fit in one byte. */
            if (ch < 0x80) {
		startinpos = s-starts;
		endinpos = startinpos+2;
                errmsg = "illegal encoding";
		goto utf8Error;
	    }
	    else
		*p++ = (Py_UNICODE)ch;
            break;

        case 3:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80) {
                errmsg = "invalid data";
		startinpos = s-starts;
		endinpos = startinpos+3;
		goto utf8Error;
	    }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            /* Reject overlong forms: values below 0x0800 fit in two bytes. */
            if (ch < 0x0800) {
		/* Note: UTF-8 encodings of surrogates are considered
		   legal UTF-8 sequences;

		   XXX For wide builds (UCS-4) we should probably try
		       to recombine the surrogates into a single code
		       unit.
		*/
                errmsg = "illegal encoding";
		startinpos = s-starts;
		endinpos = startinpos+3;
		goto utf8Error;
	    }
	    else
		*p++ = (Py_UNICODE)ch;
            break;

        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80) {
                errmsg = "invalid data";
		startinpos = s-starts;
		endinpos = startinpos+4;
		goto utf8Error;
	    }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            /* validate and convert to UTF-16 */
            if ((ch < 0x10000)        /* minimum value allowed for 4
					 byte encoding */
                || (ch > 0x10ffff))   /* maximum value allowed for
					 UTF-16 */
	    {
                errmsg = "illegal encoding";
		startinpos = s-starts;
		endinpos = startinpos+4;
		goto utf8Error;
	    }
#ifdef Py_UNICODE_WIDE
	    *p++ = (Py_UNICODE)ch;
#else
            /*  compute and append the two surrogates: */

            /*  translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /*  high surrogate = top 10 bits added to D800 */
            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));

            /*  low surrogate = bottom 10 bits added to DC00 */
            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
#endif
            break;

        default:
            /* Other sizes are only needed for UCS-4 */
            errmsg = "unsupported Unicode code range";
	    startinpos = s-starts;
	    endinpos = startinpos+n;
	    goto utf8Error;
        }
        s += n;
	continue;

    /* Error path, still inside the loop: on success of the handler call
       decoding resumes with the next loop iteration. */
    utf8Error:
    outpos = p-PyUnicode_AS_UNICODE(unicode);
    if (unicode_decode_call_errorhandler(
	     errors, &errorHandler,
	     "utf8", errmsg,
	     &starts, &e, &startinpos, &endinpos, &exc, &s,
	     &unicode, &outpos, &p))
	goto onError;
	/* Not part of the `if` above despite the indentation: e is handed
	   to the handler by address, so the aligned bound is recomputed
	   every time the handler call succeeds. */
	aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
    }
    if (consumed)
	*consumed = s-starts;

    /* Adjust length */
    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}

/* ASCII_CHAR_MASK is only needed by the UTF-8 decoder above. */
#undef ASCII_CHAR_MASK
2247
2248
2249/* Allocation strategy:  if the string is short, convert into a stack buffer
2250   and allocate exactly as much space needed at the end.  Else allocate the
2251   maximum possible needed (4 result bytes per Unicode character), and return
2252   the excess memory at the end.
2253*/
2254PyObject *
2255PyUnicode_EncodeUTF8(const Py_UNICODE *s,
2256		     Py_ssize_t size,
2257		     const char *errors)
2258{
2259#define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
2260
2261    Py_ssize_t i;                /* index into s of next input byte */
2262    PyObject *result;            /* result string object */
2263    char *p;                     /* next free byte in output buffer */
2264    Py_ssize_t nallocated;      /* number of result bytes allocated */
2265    Py_ssize_t nneeded;            /* number of result bytes needed */
2266    char stackbuf[MAX_SHORT_UNICHARS * 4];
2267
2268    assert(s != NULL);
2269    assert(size >= 0);
2270
2271    if (size <= MAX_SHORT_UNICHARS) {
2272        /* Write into the stack buffer; nallocated can't overflow.
2273         * At the end, we'll allocate exactly as much heap space as it
2274         * turns out we need.
2275         */
2276        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2277        result = NULL;   /* will allocate after we're done */
2278        p = stackbuf;
2279    }
2280    else {
2281        /* Overallocate on the heap, and give the excess back at the end. */
2282        nallocated = size * 4;
2283        if (nallocated / 4 != size)  /* overflow! */
2284            return PyErr_NoMemory();
2285        result = PyBytes_FromStringAndSize(NULL, nallocated);
2286        if (result == NULL)
2287            return NULL;
2288        p = PyBytes_AS_STRING(result);
2289    }
2290
2291    for (i = 0; i < size;) {
2292        Py_UCS4 ch = s[i++];
2293
2294        if (ch < 0x80)
2295            /* Encode ASCII */
2296            *p++ = (char) ch;
2297
2298        else if (ch < 0x0800) {
2299            /* Encode Latin-1 */
2300            *p++ = (char)(0xc0 | (ch >> 6));
2301            *p++ = (char)(0x80 | (ch & 0x3f));
2302        }
2303        else {
2304            /* Encode UCS2 Unicode ordinals */
2305            if (ch < 0x10000) {
2306                /* Special case: check for high surrogate */
2307                if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2308                    Py_UCS4 ch2 = s[i];
2309                    /* Check for low surrogate and combine the two to
2310                       form a UCS4 value */
2311                    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2312                        ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2313                        i++;
2314                        goto encodeUCS4;
2315                    }
2316                    /* Fall through: handles isolated high surrogates */
2317                }
2318                *p++ = (char)(0xe0 | (ch >> 12));
2319                *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2320                *p++ = (char)(0x80 | (ch & 0x3f));
2321                continue;
2322    	    }
2323encodeUCS4:
2324            /* Encode UCS4 Unicode ordinals */
2325            *p++ = (char)(0xf0 | (ch >> 18));
2326            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2327            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2328            *p++ = (char)(0x80 | (ch & 0x3f));
2329        }
2330    }
2331
2332    if (result == NULL) {
2333        /* This was stack allocated. */
2334        nneeded = p - stackbuf;
2335        assert(nneeded <= nallocated);
2336        result = PyBytes_FromStringAndSize(stackbuf, nneeded);
2337    }
2338    else {
2339        /* Cut back to size actually needed. */
2340        nneeded = p - PyBytes_AS_STRING(result);
2341        assert(nneeded <= nallocated);
2342        _PyBytes_Resize(&result, nneeded);
2343    }
2344    return result;
2345
2346#undef MAX_SHORT_UNICHARS
2347}
2348
2349PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2350{
2351    if (!PyUnicode_Check(unicode)) {
2352        PyErr_BadArgument();
2353        return NULL;
2354    }
2355    return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2356				PyUnicode_GET_SIZE(unicode),
2357				NULL);
2358}
2359
2360/* --- UTF-32 Codec ------------------------------------------------------- */
2361
2362PyObject *
2363PyUnicode_DecodeUTF32(const char *s,
2364		      Py_ssize_t size,
2365		      const char *errors,
2366		      int *byteorder)
2367{
2368    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2369}
2370
2371PyObject *
2372PyUnicode_DecodeUTF32Stateful(const char *s,
2373			      Py_ssize_t size,
2374			      const char *errors,
2375			      int *byteorder,
2376			      Py_ssize_t *consumed)
2377{
2378    const char *starts = s;
2379    Py_ssize_t startinpos;
2380    Py_ssize_t endinpos;
2381    Py_ssize_t outpos;
2382    PyUnicodeObject *unicode;
2383    Py_UNICODE *p;
2384#ifndef Py_UNICODE_WIDE
2385    int i, pairs;
2386#else
2387    const int pairs = 0;
2388#endif
2389    const unsigned char *q, *e;
2390    int bo = 0;       /* assume native ordering by default */
2391    const char *errmsg = "";
2392    /* Offsets from q for retrieving bytes in the right order. */
2393#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2394    int iorder[] = {0, 1, 2, 3};
2395#else
2396    int iorder[] = {3, 2, 1, 0};
2397#endif
2398    PyObject *errorHandler = NULL;
2399    PyObject *exc = NULL;
2400    /* On narrow builds we split characters outside the BMP into two
2401       codepoints => count how much extra space we need. */
2402#ifndef Py_UNICODE_WIDE
2403    for (i = pairs = 0; i < size/4; i++)
2404	if (((Py_UCS4 *)s)[i] >= 0x10000)
2405	    pairs++;
2406#endif
2407
2408    /* This might be one to much, because of a BOM */
2409    unicode = _PyUnicode_New((size+3)/4+pairs);
2410    if (!unicode)
2411        return NULL;
2412    if (size == 0)
2413        return (PyObject *)unicode;
2414
2415    /* Unpack UTF-32 encoded data */
2416    p = unicode->str;
2417    q = (unsigned char *)s;
2418    e = q + size;
2419
2420    if (byteorder)
2421        bo = *byteorder;
2422
2423    /* Check for BOM marks (U+FEFF) in the input and adjust current
2424       byte order setting accordingly. In native mode, the leading BOM
2425       mark is skipped, in all other modes, it is copied to the output
2426       stream as-is (giving a ZWNBSP character). */
2427    if (bo == 0) {
2428        if (size >= 4) {
2429            const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2430                                (q[iorder[1]] << 8) | q[iorder[0]];
2431#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2432	    if (bom == 0x0000FEFF) {
2433		q += 4;
2434		bo = -1;
2435	    }
2436	    else if (bom == 0xFFFE0000) {
2437		q += 4;
2438		bo = 1;
2439	    }
2440#else
2441	    if (bom == 0x0000FEFF) {
2442		q += 4;
2443		bo = 1;
2444	    }
2445	    else if (bom == 0xFFFE0000) {
2446		q += 4;
2447		bo = -1;
2448	    }
2449#endif
2450	}
2451    }
2452
2453    if (bo == -1) {
2454        /* force LE */
2455        iorder[0] = 0;
2456        iorder[1] = 1;
2457        iorder[2] = 2;
2458        iorder[3] = 3;
2459    }
2460    else if (bo == 1) {
2461        /* force BE */
2462        iorder[0] = 3;
2463        iorder[1] = 2;
2464        iorder[2] = 1;
2465        iorder[3] = 0;
2466    }
2467
2468    while (q < e) {
2469	Py_UCS4 ch;
2470	/* remaining bytes at the end? (size should be divisible by 4) */
2471	if (e-q<4) {
2472	    if (consumed)
2473		break;
2474	    errmsg = "truncated data";
2475	    startinpos = ((const char *)q)-starts;
2476	    endinpos = ((const char *)e)-starts;
2477	    goto utf32Error;
2478	    /* The remaining input chars are ignored if the callback
2479	       chooses to skip the input */
2480	}
2481	ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2482	     (q[iorder[1]] << 8) | q[iorder[0]];
2483
2484	if (ch >= 0x110000)
2485	{
2486	    errmsg = "codepoint not in range(0x110000)";
2487	    startinpos = ((const char *)q)-starts;
2488	    endinpos = startinpos+4;
2489	    goto utf32Error;
2490	}
2491#ifndef Py_UNICODE_WIDE
2492	if (ch >= 0x10000)
2493	{
2494	    *p++ = 0xD800 | ((ch-0x10000) >> 10);
2495	    *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2496	}
2497	else
2498#endif
2499	    *p++ = ch;
2500	q += 4;
2501	continue;
2502    utf32Error:
2503	outpos = p-PyUnicode_AS_UNICODE(unicode);
2504	if (unicode_decode_call_errorhandler(
2505	         errors, &errorHandler,
2506	         "utf32", errmsg,
2507	         &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2508	         &unicode, &outpos, &p))
2509	    goto onError;
2510    }
2511
2512    if (byteorder)
2513        *byteorder = bo;
2514
2515    if (consumed)
2516	*consumed = (const char *)q-starts;
2517
2518    /* Adjust length */
2519    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2520        goto onError;
2521
2522    Py_XDECREF(errorHandler);
2523    Py_XDECREF(exc);
2524    return (PyObject *)unicode;
2525
2526onError:
2527    Py_DECREF(unicode);
2528    Py_XDECREF(errorHandler);
2529    Py_XDECREF(exc);
2530    return NULL;
2531}
2532
2533PyObject *
2534PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2535		      Py_ssize_t size,
2536		      const char *errors,
2537		      int byteorder)
2538{
2539    PyObject *v;
2540    unsigned char *p;
2541    Py_ssize_t nsize, bytesize;
2542#ifndef Py_UNICODE_WIDE
2543    Py_ssize_t i, pairs;
2544#else
2545    const int pairs = 0;
2546#endif
2547    /* Offsets from p for storing byte pairs in the right order. */
2548#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2549    int iorder[] = {0, 1, 2, 3};
2550#else
2551    int iorder[] = {3, 2, 1, 0};
2552#endif
2553
2554#define STORECHAR(CH)                       \
2555    do {                                    \
2556        p[iorder[3]] = ((CH) >> 24) & 0xff; \
2557        p[iorder[2]] = ((CH) >> 16) & 0xff; \
2558        p[iorder[1]] = ((CH) >> 8) & 0xff;  \
2559        p[iorder[0]] = (CH) & 0xff;         \
2560        p += 4;                             \
2561    } while(0)
2562
2563    /* In narrow builds we can output surrogate pairs as one codepoint,
2564       so we need less space. */
2565#ifndef Py_UNICODE_WIDE
2566    for (i = pairs = 0; i < size-1; i++)
2567	if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2568	    0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2569	    pairs++;
2570#endif
2571    nsize = (size - pairs + (byteorder == 0));
2572    bytesize = nsize * 4;
2573    if (bytesize / 4 != nsize)
2574	return PyErr_NoMemory();
2575    v = PyBytes_FromStringAndSize(NULL, bytesize);
2576    if (v == NULL)
2577        return NULL;
2578
2579    p = (unsigned char *)PyBytes_AS_STRING(v);
2580    if (byteorder == 0)
2581	STORECHAR(0xFEFF);
2582    if (size == 0)
2583        goto done;
2584
2585    if (byteorder == -1) {
2586        /* force LE */
2587        iorder[0] = 0;
2588        iorder[1] = 1;
2589        iorder[2] = 2;
2590        iorder[3] = 3;
2591    }
2592    else if (byteorder == 1) {
2593        /* force BE */
2594        iorder[0] = 3;
2595        iorder[1] = 2;
2596        iorder[2] = 1;
2597        iorder[3] = 0;
2598    }
2599
2600    while (size-- > 0) {
2601	Py_UCS4 ch = *s++;
2602#ifndef Py_UNICODE_WIDE
2603	if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2604	    Py_UCS4 ch2 = *s;
2605	    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2606		ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2607		s++;
2608		size--;
2609	    }
2610	}
2611#endif
2612        STORECHAR(ch);
2613    }
2614
2615  done:
2616    return v;
2617#undef STORECHAR
2618}
2619
2620PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2621{
2622    if (!PyUnicode_Check(unicode)) {
2623        PyErr_BadArgument();
2624        return NULL;
2625    }
2626    return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2627				 PyUnicode_GET_SIZE(unicode),
2628				 NULL,
2629				 0);
2630}
2631
2632/* --- UTF-16 Codec ------------------------------------------------------- */
2633
2634PyObject *
2635PyUnicode_DecodeUTF16(const char *s,
2636		      Py_ssize_t size,
2637		      const char *errors,
2638		      int *byteorder)
2639{
2640    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2641}
2642
/* Two masks for fast checking of whether a C 'long' may contain
   UTF16-encoded surrogate characters. This is an efficient heuristic,
   assuming that non-surrogate characters with a code point >= 0x8000 are
   rare in most input.
   FAST_CHAR_MASK is used when the input is in native byte ordering,
   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.

   Each mask selects one bit per 16-bit unit: bit 15 of the unit as it
   sits in memory for FAST_CHAR_MASK, bit 7 for SWAPPED_FAST_CHAR_MASK
   (which is bit 15 of the unit once its two bytes are swapped).  A word
   with none of these bits set holds only units below 0x8000 and hence
   no surrogates.
   */
#if (SIZEOF_LONG == 8)
# define FAST_CHAR_MASK         0x8000800080008000L
# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
#elif (SIZEOF_LONG == 4)
# define FAST_CHAR_MASK         0x80008000L
# define SWAPPED_FAST_CHAR_MASK 0x00800080L
#else
# error C 'long' size should be either 4 or 8!
#endif
2659
2660PyObject *
2661PyUnicode_DecodeUTF16Stateful(const char *s,
2662			      Py_ssize_t size,
2663			      const char *errors,
2664			      int *byteorder,
2665			      Py_ssize_t *consumed)
2666{
2667    const char *starts = s;
2668    Py_ssize_t startinpos;
2669    Py_ssize_t endinpos;
2670    Py_ssize_t outpos;
2671    PyUnicodeObject *unicode;
2672    Py_UNICODE *p;
2673    const unsigned char *q, *e, *aligned_end;
2674    int bo = 0;       /* assume native ordering by default */
2675    int native_ordering = 0;
2676    const char *errmsg = "";
2677    /* Offsets from q for retrieving byte pairs in the right order. */
2678#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2679    int ihi = 1, ilo = 0;
2680#else
2681    int ihi = 0, ilo = 1;
2682#endif
2683    PyObject *errorHandler = NULL;
2684    PyObject *exc = NULL;
2685
2686    /* Note: size will always be longer than the resulting Unicode
2687       character count */
2688    unicode = _PyUnicode_New(size);
2689    if (!unicode)
2690        return NULL;
2691    if (size == 0)
2692        return (PyObject *)unicode;
2693
2694    /* Unpack UTF-16 encoded data */
2695    p = unicode->str;
2696    q = (unsigned char *)s;
2697    e = q + size - 1;
2698
2699    if (byteorder)
2700        bo = *byteorder;
2701
2702    /* Check for BOM marks (U+FEFF) in the input and adjust current
2703       byte order setting accordingly. In native mode, the leading BOM
2704       mark is skipped, in all other modes, it is copied to the output
2705       stream as-is (giving a ZWNBSP character). */
2706    if (bo == 0) {
2707        if (size >= 2) {
2708            const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
2709#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2710	    if (bom == 0xFEFF) {
2711		q += 2;
2712		bo = -1;
2713	    }
2714	    else if (bom == 0xFFFE) {
2715		q += 2;
2716		bo = 1;
2717	    }
2718#else
2719	    if (bom == 0xFEFF) {
2720		q += 2;
2721		bo = 1;
2722	    }
2723	    else if (bom == 0xFFFE) {
2724		q += 2;
2725		bo = -1;
2726	    }
2727#endif
2728	}
2729    }
2730
2731    if (bo == -1) {
2732        /* force LE */
2733        ihi = 1;
2734        ilo = 0;
2735    }
2736    else if (bo == 1) {
2737        /* force BE */
2738        ihi = 0;
2739        ilo = 1;
2740    }
2741#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2742    native_ordering = ilo < ihi;
2743#else
2744    native_ordering = ilo > ihi;
2745#endif
2746
2747    aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
2748    while (q < e) {
2749	Py_UNICODE ch;
2750        /* First check for possible aligned read of a C 'long'. Unaligned
2751           reads are more expensive, better to defer to another iteration. */
2752        if (!((size_t) q & LONG_PTR_MASK)) {
2753            /* Fast path for runs of non-surrogate chars. */
2754            register const unsigned char *_q = q;
2755            Py_UNICODE *_p = p;
2756            if (native_ordering) {
2757                /* Native ordering is simple: as long as the input cannot
2758                   possibly contain a surrogate char, do an unrolled copy
2759                   of several 16-bit code points to the target object.
2760                   The non-surrogate check is done on several input bytes
2761                   at a time (as many as a C 'long' can contain). */
2762                while (_q < aligned_end) {
2763                    unsigned long data = * (unsigned long *) _q;
2764                    if (data & FAST_CHAR_MASK)
2765                        break;
2766                    _p[0] = ((unsigned short *) _q)[0];
2767                    _p[1] = ((unsigned short *) _q)[1];
2768#if (SIZEOF_LONG == 8)
2769                    _p[2] = ((unsigned short *) _q)[2];
2770                    _p[3] = ((unsigned short *) _q)[3];
2771#endif
2772                    _q += SIZEOF_LONG;
2773                    _p += SIZEOF_LONG / 2;
2774                }
2775            }
2776            else {
2777                /* Byteswapped ordering is similar, but we must decompose
2778                   the copy bytewise, and take care of zero'ing out the
2779                   upper bytes if the target object is in 32-bit units
2780                   (that is, in UCS-4 builds). */
2781                while (_q < aligned_end) {
2782                    unsigned long data = * (unsigned long *) _q;
2783                    if (data & SWAPPED_FAST_CHAR_MASK)
2784                        break;
2785                    /* Zero upper bytes in UCS-4 builds */
2786#if (Py_UNICODE_SIZE > 2)
2787                    _p[0] = 0;
2788                    _p[1] = 0;
2789#if (SIZEOF_LONG == 8)
2790                    _p[2] = 0;
2791                    _p[3] = 0;
2792#endif
2793#endif
2794                    /* Issue #4916; UCS-4 builds on big endian machines must
2795                       fill the two last bytes of each 4-byte unit. */
2796#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
2797# define OFF 2
2798#else
2799# define OFF 0
2800#endif
2801                    ((unsigned char *) _p)[OFF + 1] = _q[0];
2802                    ((unsigned char *) _p)[OFF + 0] = _q[1];
2803                    ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
2804                    ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
2805#if (SIZEOF_LONG == 8)
2806                    ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
2807                    ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
2808                    ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
2809                    ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
2810#endif
2811#undef OFF
2812                    _q += SIZEOF_LONG;
2813                    _p += SIZEOF_LONG / 2;
2814                }
2815            }
2816            p = _p;
2817            q = _q;
2818            if (q >= e)
2819                break;
2820        }
2821	ch = (q[ihi] << 8) | q[ilo];
2822
2823	q += 2;
2824
2825	if (ch < 0xD800 || ch > 0xDFFF) {
2826	    *p++ = ch;
2827	    continue;
2828	}
2829
2830	/* UTF-16 code pair: */
2831	if (q > e) {
2832	    errmsg = "unexpected end of data";
2833	    startinpos = (((const char *)q) - 2) - starts;
2834	    endinpos = ((const char *)e) + 1 - starts;
2835	    goto utf16Error;
2836	}
2837	if (0xD800 <= ch && ch <= 0xDBFF) {
2838	    Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2839	    q += 2;
2840	    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2841#ifndef Py_UNICODE_WIDE
2842		*p++ = ch;
2843		*p++ = ch2;
2844#else
2845		*p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2846#endif
2847		continue;
2848	    }
2849	    else {
2850                errmsg = "illegal UTF-16 surrogate";
2851		startinpos = (((const char *)q)-4)-starts;
2852		endinpos = startinpos+2;
2853		goto utf16Error;
2854	    }
2855
2856	}
2857	errmsg = "illegal encoding";
2858	startinpos = (((const char *)q)-2)-starts;
2859	endinpos = startinpos+2;
2860	/* Fall through to report the error */
2861
2862    utf16Error:
2863	outpos = p - PyUnicode_AS_UNICODE(unicode);
2864	if (unicode_decode_call_errorhandler(
2865                errors,
2866                &errorHandler,
2867                "utf16", errmsg,
2868                &starts,
2869                (const char **)&e,
2870                &startinpos,
2871                &endinpos,
2872                &exc,
2873                (const char **)&q,
2874                &unicode,
2875                &outpos,
2876                &p))
2877	    goto onError;
2878    }
2879    /* remaining byte at the end? (size should be even) */
2880    if (e == q) {
2881        if (!consumed) {
2882            errmsg = "truncated data";
2883            startinpos = ((const char *)q) - starts;
2884            endinpos = ((const char *)e) + 1 - starts;
2885            outpos = p - PyUnicode_AS_UNICODE(unicode);
2886            if (unicode_decode_call_errorhandler(
2887                    errors,
2888                    &errorHandler,
2889                    "utf16", errmsg,
2890                    &starts,
2891                    (const char **)&e,
2892                    &startinpos,
2893                    &endinpos,
2894                    &exc,
2895                    (const char **)&q,
2896                    &unicode,
2897                    &outpos,
2898                    &p))
2899                goto onError;
2900            /* The remaining input chars are ignored if the callback
2901               chooses to skip the input */
2902        }
2903    }
2904
2905    if (byteorder)
2906        *byteorder = bo;
2907
2908    if (consumed)
2909	*consumed = (const char *)q-starts;
2910
2911    /* Adjust length */
2912    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2913        goto onError;
2914
2915    Py_XDECREF(errorHandler);
2916    Py_XDECREF(exc);
2917    return (PyObject *)unicode;
2918
2919onError:
2920    Py_DECREF(unicode);
2921    Py_XDECREF(errorHandler);
2922    Py_XDECREF(exc);
2923    return NULL;
2924}
2925
2926#undef FAST_CHAR_MASK
2927#undef SWAPPED_FAST_CHAR_MASK
2928
2929PyObject *
2930PyUnicode_EncodeUTF16(const Py_UNICODE *s,
2931		      Py_ssize_t size,
2932		      const char *errors,
2933		      int byteorder)
2934{
2935    PyObject *v;
2936    unsigned char *p;
2937    Py_ssize_t nsize, bytesize;
2938#ifdef Py_UNICODE_WIDE
2939    Py_ssize_t i, pairs;
2940#else
2941    const int pairs = 0;
2942#endif
2943    /* Offsets from p for storing byte pairs in the right order. */
2944#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2945    int ihi = 1, ilo = 0;
2946#else
2947    int ihi = 0, ilo = 1;
2948#endif
2949
2950#define STORECHAR(CH)                   \
2951    do {                                \
2952        p[ihi] = ((CH) >> 8) & 0xff;    \
2953        p[ilo] = (CH) & 0xff;           \
2954        p += 2;                         \
2955    } while(0)
2956
2957#ifdef Py_UNICODE_WIDE
2958    for (i = pairs = 0; i < size; i++)
2959	if (s[i] >= 0x10000)
2960	    pairs++;
2961#endif
2962    /* 2 * (size + pairs + (byteorder == 0)) */
2963    if (size > PY_SSIZE_T_MAX ||
2964        size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
2965	return PyErr_NoMemory();
2966    nsize = size + pairs + (byteorder == 0);
2967    bytesize = nsize * 2;
2968    if (bytesize / 2 != nsize)
2969	return PyErr_NoMemory();
2970    v = PyBytes_FromStringAndSize(NULL, bytesize);
2971    if (v == NULL)
2972        return NULL;
2973
2974    p = (unsigned char *)PyBytes_AS_STRING(v);
2975    if (byteorder == 0)
2976	STORECHAR(0xFEFF);
2977    if (size == 0)
2978        goto done;
2979
2980    if (byteorder == -1) {
2981        /* force LE */
2982        ihi = 1;
2983        ilo = 0;
2984    }
2985    else if (byteorder == 1) {
2986        /* force BE */
2987        ihi = 0;
2988        ilo = 1;
2989    }
2990
2991    while (size-- > 0) {
2992	Py_UNICODE ch = *s++;
2993	Py_UNICODE ch2 = 0;
2994#ifdef Py_UNICODE_WIDE
2995	if (ch >= 0x10000) {
2996	    ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2997	    ch  = 0xD800 | ((ch-0x10000) >> 10);
2998	}
2999#endif
3000        STORECHAR(ch);
3001        if (ch2)
3002            STORECHAR(ch2);
3003    }
3004
3005  done:
3006    return v;
3007#undef STORECHAR
3008}
3009
3010PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3011{
3012    if (!PyUnicode_Check(unicode)) {
3013        PyErr_BadArgument();
3014        return NULL;
3015    }
3016    return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
3017				 PyUnicode_GET_SIZE(unicode),
3018				 NULL,
3019				 0);
3020}
3021
3022/* --- Unicode Escape Codec ----------------------------------------------- */
3023
/* File-local pointer to a _PyUnicode_Name_CAPI structure; starts out NULL.
   NOTE(review): presumably filled in on first use by the escape decoder
   that follows -- confirm where it is assigned. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
3025
3026PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
3027					Py_ssize_t size,
3028					const char *errors)
3029{
3030    const char *starts = s;
3031    Py_ssize_t startinpos;
3032    Py_ssize_t endinpos;
3033    Py_ssize_t outpos;
3034    int i;
3035    PyUnicodeObject *v;
3036    Py_UNICODE *p;
3037    const char *end;
3038    char* message;
3039    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
3040    PyObject *errorHandler = NULL;
3041    PyObject *exc = NULL;
3042
3043    /* Escaped strings will always be longer than the resulting
3044       Unicode string, so we start with size here and then reduce the
3045       length after conversion to the true value.
3046       (but if the error callback returns a long replacement string
3047       we'll have to allocate more space) */
3048    v = _PyUnicode_New(size);
3049    if (v == NULL)
3050        goto onError;
3051    if (size == 0)
3052        return (PyObject *)v;
3053
3054    p = PyUnicode_AS_UNICODE(v);
3055    end = s + size;
3056
3057    while (s < end) {
3058        unsigned char c;
3059        Py_UNICODE x;
3060        int digits;
3061
3062        /* Non-escape characters are interpreted as Unicode ordinals */
3063        if (*s != '\\') {
3064            *p++ = (unsigned char) *s++;
3065            continue;
3066        }
3067
3068        startinpos = s-starts;
3069        /* \ - Escapes */
3070        s++;
3071        c = *s++;
3072        if (s > end)
3073            c = '\0'; /* Invalid after \ */
3074        switch (c) {
3075
3076        /* \x escapes */
3077        case '\n': break;
3078        case '\\': *p++ = '\\'; break;
3079        case '\'': *p++ = '\''; break;
3080        case '\"': *p++ = '\"'; break;
3081        case 'b': *p++ = '\b'; break;
3082        case 'f': *p++ = '\014'; break; /* FF */
3083        case 't': *p++ = '\t'; break;
3084        case 'n': *p++ = '\n'; break;
3085        case 'r': *p++ = '\r'; break;
3086        case 'v': *p++ = '\013'; break; /* VT */
3087        case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3088
3089        /* \OOO (octal) escapes */
3090        case '0': case '1': case '2': case '3':
3091        case '4': case '5': case '6': case '7':
3092            x = s[-1] - '0';
3093            if (s < end && '0' <= *s && *s <= '7') {
3094                x = (x<<3) + *s++ - '0';
3095                if (s < end && '0' <= *s && *s <= '7')
3096                    x = (x<<3) + *s++ - '0';
3097            }
3098            *p++ = x;
3099            break;
3100
3101        /* hex escapes */
3102        /* \xXX */
3103        case 'x':
3104            digits = 2;
3105            message = "truncated \\xXX escape";
3106            goto hexescape;
3107
3108        /* \uXXXX */
3109        case 'u':
3110            digits = 4;
3111            message = "truncated \\uXXXX escape";
3112            goto hexescape;
3113
3114        /* \UXXXXXXXX */
3115        case 'U':
3116            digits = 8;
3117            message = "truncated \\UXXXXXXXX escape";
3118        hexescape:
3119            chr = 0;
3120            outpos = p-PyUnicode_AS_UNICODE(v);
3121            if (s+digits>end) {
3122                endinpos = size;
3123                if (unicode_decode_call_errorhandler(
3124                    errors, &errorHandler,
3125                    "unicodeescape", "end of string in escape sequence",
3126                    &starts, &end, &startinpos, &endinpos, &exc, &s,
3127                    &v, &outpos, &p))
3128                    goto onError;
3129                goto nextByte;
3130            }
3131            for (i = 0; i < digits; ++i) {
3132                c = (unsigned char) s[i];
3133                if (!ISXDIGIT(c)) {
3134                    endinpos = (s+i+1)-starts;
3135                    if (unicode_decode_call_errorhandler(
3136                        errors, &errorHandler,
3137                        "unicodeescape", message,
3138                        &starts, &end, &startinpos, &endinpos, &exc, &s,
3139                        &v, &outpos, &p))
3140                        goto onError;
3141                    goto nextByte;
3142                }
3143                chr = (chr<<4) & ~0xF;
3144                if (c >= '0' && c <= '9')
3145                    chr += c - '0';
3146                else if (c >= 'a' && c <= 'f')
3147                    chr += 10 + c - 'a';
3148                else
3149                    chr += 10 + c - 'A';
3150            }
3151            s += i;
3152            if (chr == 0xffffffff && PyErr_Occurred())
3153                /* _decoding_error will have already written into the
3154                   target buffer. */
3155                break;
3156        store:
3157            /* when we get here, chr is a 32-bit unicode character */
3158            if (chr <= 0xffff)
3159                /* UCS-2 character */
3160                *p++ = (Py_UNICODE) chr;
3161            else if (chr <= 0x10ffff) {
3162                /* UCS-4 character. Either store directly, or as
3163                   surrogate pair. */
3164#ifdef Py_UNICODE_WIDE
3165                *p++ = chr;
3166#else
3167                chr -= 0x10000L;
3168                *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
3169                *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
3170#endif
3171            } else {
3172                endinpos = s-starts;
3173                outpos = p-PyUnicode_AS_UNICODE(v);
3174                if (unicode_decode_call_errorhandler(
3175                    errors, &errorHandler,
3176                    "unicodeescape", "illegal Unicode character",
3177                    &starts, &end, &startinpos, &endinpos, &exc, &s,
3178                    &v, &outpos, &p))
3179                    goto onError;
3180            }
3181            break;
3182
3183        /* \N{name} */
3184        case 'N':
3185            message = "malformed \\N character escape";
3186            if (ucnhash_CAPI == NULL) {
3187                /* load the unicode data module */
3188                PyObject *m, *api;
3189                m = PyImport_ImportModuleNoBlock("unicodedata");
3190                if (m == NULL)
3191                    goto ucnhashError;
3192                api = PyObject_GetAttrString(m, "ucnhash_CAPI");
3193                Py_DECREF(m);
3194                if (api == NULL)
3195                    goto ucnhashError;
3196                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
3197                Py_DECREF(api);
3198                if (ucnhash_CAPI == NULL)
3199                    goto ucnhashError;
3200            }
3201            if (*s == '{') {
3202                const char *start = s+1;
3203                /* look for the closing brace */
3204                while (*s != '}' && s < end)
3205                    s++;
3206                if (s > start && s < end && *s == '}') {
3207                    /* found a name.  look it up in the unicode database */
3208                    message = "unknown Unicode character name";
3209                    s++;
3210                    if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
3211                        goto store;
3212                }
3213            }
3214            endinpos = s-starts;
3215            outpos = p-PyUnicode_AS_UNICODE(v);
3216            if (unicode_decode_call_errorhandler(
3217                errors, &errorHandler,
3218                "unicodeescape", message,
3219                &starts, &end, &startinpos, &endinpos, &exc, &s,
3220                &v, &outpos, &p))
3221                goto onError;
3222            break;
3223
3224        default:
3225            if (s > end) {
3226                message = "\\ at end of string";
3227                s--;
3228                endinpos = s-starts;
3229                outpos = p-PyUnicode_AS_UNICODE(v);
3230                if (unicode_decode_call_errorhandler(
3231                    errors, &errorHandler,
3232                    "unicodeescape", message,
3233                    &starts, &end, &startinpos, &endinpos, &exc, &s,
3234                    &v, &outpos, &p))
3235                    goto onError;
3236            }
3237            else {
3238                *p++ = '\\';
3239                *p++ = (unsigned char)s[-1];
3240            }
3241            break;
3242        }
3243        nextByte:
3244        ;
3245    }
3246    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3247        goto onError;
3248    Py_XDECREF(errorHandler);
3249    Py_XDECREF(exc);
3250    return (PyObject *)v;
3251
3252ucnhashError:
3253    PyErr_SetString(
3254        PyExc_UnicodeError,
3255        "\\N escapes not supported (can't load unicodedata module)"
3256        );
3257    Py_XDECREF(v);
3258    Py_XDECREF(errorHandler);
3259    Py_XDECREF(exc);
3260    return NULL;
3261
3262onError:
3263    Py_XDECREF(v);
3264    Py_XDECREF(errorHandler);
3265    Py_XDECREF(exc);
3266    return NULL;
3267}
3268
/* Helpers for producing the Unicode-Escape encoded form of a Unicode
   object.

   NOTE: this comment used to read "If quotes is true, the string is
   enclosed in u"" or u'' quotes as appropriate", but
   PyUnicode_EncodeUnicodeEscape() below takes no quotes argument.

*/
3275
3276Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
3277                                      Py_ssize_t size,
3278                                      Py_UNICODE ch)
3279{
3280    /* like wcschr, but doesn't stop at NULL characters */
3281
3282    while (size-- > 0) {
3283        if (*s == ch)
3284            return s;
3285        s++;
3286    }
3287
3288    return NULL;
3289}
3290
3291static const char *hexdigits = "0123456789abcdef";
3292
3293PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3294					Py_ssize_t size)
3295{
3296    PyObject *repr;
3297    char *p;
3298
3299#ifdef Py_UNICODE_WIDE
3300    const Py_ssize_t expandsize = 10;
3301#else
3302    const Py_ssize_t expandsize = 6;
3303#endif
3304
3305    /* XXX(nnorwitz): rather than over-allocating, it would be
3306       better to choose a different scheme.  Perhaps scan the
3307       first N-chars of the string and allocate based on that size.
3308    */
3309    /* Initial allocation is based on the longest-possible unichr
3310       escape.
3311
3312       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3313       unichr, so in this case it's the longest unichr escape. In
3314       narrow (UTF-16) builds this is five chars per source unichr
3315       since there are two unichrs in the surrogate pair, so in narrow
3316       (UTF-16) builds it's not the longest unichr escape.
3317
3318       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3319       so in the narrow (UTF-16) build case it's the longest unichr
3320       escape.
3321    */
3322
3323    if (size == 0)
3324        return PyBytes_FromStringAndSize(NULL, 0);
3325
3326    if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
3327	return PyErr_NoMemory();
3328
3329    repr = PyBytes_FromStringAndSize(NULL,
3330        2
3331        + expandsize*size
3332        + 1);
3333    if (repr == NULL)
3334        return NULL;
3335
3336    p = PyBytes_AS_STRING(repr);
3337
3338    while (size-- > 0) {
3339        Py_UNICODE ch = *s++;
3340
3341        /* Escape backslashes */
3342        if (ch == '\\') {
3343            *p++ = '\\';
3344            *p++ = (char) ch;
3345            continue;
3346        }
3347
3348#ifdef Py_UNICODE_WIDE
3349        /* Map 21-bit characters to '\U00xxxxxx' */
3350        else if (ch >= 0x10000) {
3351            *p++ = '\\';
3352            *p++ = 'U';
3353            *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3354            *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3355            *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3356            *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3357            *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3358            *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3359            *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3360            *p++ = hexdigits[ch & 0x0000000F];
3361	    continue;
3362        }
3363#else
3364	/* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3365	else if (ch >= 0xD800 && ch < 0xDC00) {
3366	    Py_UNICODE ch2;
3367	    Py_UCS4 ucs;
3368
3369	    ch2 = *s++;
3370	    size--;
3371	    if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3372		ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3373		*p++ = '\\';
3374		*p++ = 'U';
3375		*p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3376		*p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3377		*p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3378		*p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3379		*p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3380		*p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3381		*p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3382		*p++ = hexdigits[ucs & 0x0000000F];
3383		continue;
3384	    }
3385	    /* Fall through: isolated surrogates are copied as-is */
3386	    s--;
3387	    size++;
3388	}
3389#endif
3390
3391        /* Map 16-bit characters to '\uxxxx' */
3392        if (ch >= 256) {
3393            *p++ = '\\';
3394            *p++ = 'u';
3395            *p++ = hexdigits[(ch >> 12) & 0x000F];
3396            *p++ = hexdigits[(ch >> 8) & 0x000F];
3397            *p++ = hexdigits[(ch >> 4) & 0x000F];
3398            *p++ = hexdigits[ch & 0x000F];
3399        }
3400
3401        /* Map special whitespace to '\t', \n', '\r' */
3402        else if (ch == '\t') {
3403            *p++ = '\\';
3404            *p++ = 't';
3405        }
3406        else if (ch == '\n') {
3407            *p++ = '\\';
3408            *p++ = 'n';
3409        }
3410        else if (ch == '\r') {
3411            *p++ = '\\';
3412            *p++ = 'r';
3413        }
3414
3415        /* Map non-printable US ASCII to '\xhh' */
3416        else if (ch < ' ' || ch >= 0x7F) {
3417            *p++ = '\\';
3418            *p++ = 'x';
3419            *p++ = hexdigits[(ch >> 4) & 0x000F];
3420            *p++ = hexdigits[ch & 0x000F];
3421        }
3422
3423        /* Copy everything else as-is */
3424        else
3425            *p++ = (char) ch;
3426    }
3427
3428    assert(p - PyBytes_AS_STRING(repr) > 0);
3429    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3430        return NULL;
3431    return repr;
3432}
3433
3434PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3435{
3436    PyObject *s;
3437    if (!PyUnicode_Check(unicode)) {
3438        PyErr_BadArgument();
3439        return NULL;
3440    }
3441    s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3442                                      PyUnicode_GET_SIZE(unicode));
3443    return s;
3444}
3445
3446/* --- Raw Unicode Escape Codec ------------------------------------------- */
3447
3448PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
3449					   Py_ssize_t size,
3450					   const char *errors)
3451{
3452    const char *starts = s;
3453    Py_ssize_t startinpos;
3454    Py_ssize_t endinpos;
3455    Py_ssize_t outpos;
3456    PyUnicodeObject *v;
3457    Py_UNICODE *p;
3458    const char *end;
3459    const char *bs;
3460    PyObject *errorHandler = NULL;
3461    PyObject *exc = NULL;
3462
3463    /* Escaped strings will always be longer than the resulting
3464       Unicode string, so we start with size here and then reduce the
3465       length after conversion to the true value. (But decoding error
3466       handler might have to resize the string) */
3467    v = _PyUnicode_New(size);
3468    if (v == NULL)
3469	goto onError;
3470    if (size == 0)
3471	return (PyObject *)v;
3472    p = PyUnicode_AS_UNICODE(v);
3473    end = s + size;
3474    while (s < end) {
3475	unsigned char c;
3476	Py_UCS4 x;
3477	int i;
3478        int count;
3479
3480	/* Non-escape characters are interpreted as Unicode ordinals */
3481	if (*s != '\\') {
3482	    *p++ = (unsigned char)*s++;
3483	    continue;
3484	}
3485	startinpos = s-starts;
3486
3487	/* \u-escapes are only interpreted iff the number of leading
3488	   backslashes if odd */
3489	bs = s;
3490	for (;s < end;) {
3491	    if (*s != '\\')
3492		break;
3493	    *p++ = (unsigned char)*s++;
3494	}
3495	if (((s - bs) & 1) == 0 ||
3496	    s >= end ||
3497	    (*s != 'u' && *s != 'U')) {
3498	    continue;
3499	}
3500	p--;
3501        count = *s=='u' ? 4 : 8;
3502	s++;
3503
3504	/* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3505	outpos = p-PyUnicode_AS_UNICODE(v);
3506	for (x = 0, i = 0; i < count; ++i, ++s) {
3507	    c = (unsigned char)*s;
3508	    if (!ISXDIGIT(c)) {
3509		endinpos = s-starts;
3510		if (unicode_decode_call_errorhandler(
3511		    errors, &errorHandler,
3512		    "rawunicodeescape", "truncated \\uXXXX",
3513		    &starts, &end, &startinpos, &endinpos, &exc, &s,
3514		    &v, &outpos, &p))
3515		    goto onError;
3516		goto nextByte;
3517	    }
3518	    x = (x<<4) & ~0xF;
3519	    if (c >= '0' && c <= '9')
3520		x += c - '0';
3521	    else if (c >= 'a' && c <= 'f')
3522		x += 10 + c - 'a';
3523	    else
3524		x += 10 + c - 'A';
3525	}
3526        if (x <= 0xffff)
3527                /* UCS-2 character */
3528                *p++ = (Py_UNICODE) x;
3529        else if (x <= 0x10ffff) {
3530                /* UCS-4 character. Either store directly, or as
3531                   surrogate pair. */
3532#ifdef Py_UNICODE_WIDE
3533                *p++ = (Py_UNICODE) x;
3534#else
3535                x -= 0x10000L;
3536                *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3537                *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3538#endif
3539        } else {
3540            endinpos = s-starts;
3541            outpos = p-PyUnicode_AS_UNICODE(v);
3542            if (unicode_decode_call_errorhandler(
3543                    errors, &errorHandler,
3544                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
3545		    &starts, &end, &startinpos, &endinpos, &exc, &s,
3546		    &v, &outpos, &p))
3547		    goto onError;
3548        }
3549	nextByte:
3550	;
3551    }
3552    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3553	goto onError;
3554    Py_XDECREF(errorHandler);
3555    Py_XDECREF(exc);
3556    return (PyObject *)v;
3557
3558 onError:
3559    Py_XDECREF(v);
3560    Py_XDECREF(errorHandler);
3561    Py_XDECREF(exc);
3562    return NULL;
3563}
3564
3565PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
3566					   Py_ssize_t size)
3567{
3568    PyObject *repr;
3569    char *p;
3570    char *q;
3571
3572#ifdef Py_UNICODE_WIDE
3573    const Py_ssize_t expandsize = 10;
3574#else
3575    const Py_ssize_t expandsize = 6;
3576#endif
3577
3578    if (size > PY_SSIZE_T_MAX / expandsize)
3579	return PyErr_NoMemory();
3580
3581    repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
3582    if (repr == NULL)
3583        return NULL;
3584    if (size == 0)
3585        return repr;
3586
3587    p = q = PyBytes_AS_STRING(repr);
3588    while (size-- > 0) {
3589        Py_UNICODE ch = *s++;
3590#ifdef Py_UNICODE_WIDE
3591	/* Map 32-bit characters to '\Uxxxxxxxx' */
3592	if (ch >= 0x10000) {
3593            *p++ = '\\';
3594            *p++ = 'U';
3595            *p++ = hexdigits[(ch >> 28) & 0xf];
3596            *p++ = hexdigits[(ch >> 24) & 0xf];
3597            *p++ = hexdigits[(ch >> 20) & 0xf];
3598            *p++ = hexdigits[(ch >> 16) & 0xf];
3599            *p++ = hexdigits[(ch >> 12) & 0xf];
3600            *p++ = hexdigits[(ch >> 8) & 0xf];
3601            *p++ = hexdigits[(ch >> 4) & 0xf];
3602            *p++ = hexdigits[ch & 15];
3603        }
3604        else
3605#else
3606	/* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3607	if (ch >= 0xD800 && ch < 0xDC00) {
3608	    Py_UNICODE ch2;
3609	    Py_UCS4 ucs;
3610
3611	    ch2 = *s++;
3612	    size--;
3613	    if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3614		ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3615		*p++ = '\\';
3616		*p++ = 'U';
3617		*p++ = hexdigits[(ucs >> 28) & 0xf];
3618		*p++ = hexdigits[(ucs >> 24) & 0xf];
3619		*p++ = hexdigits[(ucs >> 20) & 0xf];
3620		*p++ = hexdigits[(ucs >> 16) & 0xf];
3621		*p++ = hexdigits[(ucs >> 12) & 0xf];
3622		*p++ = hexdigits[(ucs >> 8) & 0xf];
3623		*p++ = hexdigits[(ucs >> 4) & 0xf];
3624		*p++ = hexdigits[ucs & 0xf];
3625		continue;
3626	    }
3627	    /* Fall through: isolated surrogates are copied as-is */
3628	    s--;
3629	    size++;
3630	}
3631#endif
3632	/* Map 16-bit characters to '\uxxxx' */
3633	if (ch >= 256) {
3634            *p++ = '\\';
3635            *p++ = 'u';
3636            *p++ = hexdigits[(ch >> 12) & 0xf];
3637            *p++ = hexdigits[(ch >> 8) & 0xf];
3638            *p++ = hexdigits[(ch >> 4) & 0xf];
3639            *p++ = hexdigits[ch & 15];
3640        }
3641	/* Copy everything else as-is */
3642	else
3643            *p++ = (char) ch;
3644    }
3645    size = p - q;
3646
3647    assert(size > 0);
3648    if (_PyBytes_Resize(&repr, size) < 0)
3649        return NULL;
3650    return repr;
3651}
3652
3653PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3654{
3655    PyObject *s;
3656    if (!PyUnicode_Check(unicode)) {
3657        PyErr_BadArgument();
3658        return NULL;
3659    }
3660    s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3661                                         PyUnicode_GET_SIZE(unicode));
3662
3663    return s;
3664}
3665
3666/* --- Unicode Internal Codec ------------------------------------------- */
3667
3668PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
3669					   Py_ssize_t size,
3670					   const char *errors)
3671{
3672    const char *starts = s;
3673    Py_ssize_t startinpos;
3674    Py_ssize_t endinpos;
3675    Py_ssize_t outpos;
3676    PyUnicodeObject *v;
3677    Py_UNICODE *p;
3678    const char *end;
3679    const char *reason;
3680    PyObject *errorHandler = NULL;
3681    PyObject *exc = NULL;
3682
3683#ifdef Py_UNICODE_WIDE
3684    Py_UNICODE unimax = PyUnicode_GetMax();
3685#endif
3686
3687    /* XXX overflow detection missing */
3688    v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3689    if (v == NULL)
3690	goto onError;
3691    if (PyUnicode_GetSize((PyObject *)v) == 0)
3692	return (PyObject *)v;
3693    p = PyUnicode_AS_UNICODE(v);
3694    end = s + size;
3695
3696    while (s < end) {
3697        memcpy(p, s, sizeof(Py_UNICODE));
3698        /* We have to sanity check the raw data, otherwise doom looms for
3699           some malformed UCS-4 data. */
3700        if (
3701            #ifdef Py_UNICODE_WIDE
3702            *p > unimax || *p < 0 ||
3703            #endif
3704            end-s < Py_UNICODE_SIZE
3705            )
3706            {
3707            startinpos = s - starts;
3708            if (end-s < Py_UNICODE_SIZE) {
3709                endinpos = end-starts;
3710                reason = "truncated input";
3711            }
3712            else {
3713                endinpos = s - starts + Py_UNICODE_SIZE;
3714                reason = "illegal code point (> 0x10FFFF)";
3715            }
3716            outpos = p - PyUnicode_AS_UNICODE(v);
3717            if (unicode_decode_call_errorhandler(
3718                    errors, &errorHandler,
3719                    "unicode_internal", reason,
3720                    &starts, &end, &startinpos, &endinpos, &exc, &s,
3721                    &v, &outpos, &p)) {
3722                goto onError;
3723            }
3724        }
3725        else {
3726            p++;
3727            s += Py_UNICODE_SIZE;
3728        }
3729    }
3730
3731    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3732        goto onError;
3733    Py_XDECREF(errorHandler);
3734    Py_XDECREF(exc);
3735    return (PyObject *)v;
3736
3737 onError:
3738    Py_XDECREF(v);
3739    Py_XDECREF(errorHandler);
3740    Py_XDECREF(exc);
3741    return NULL;
3742}
3743
3744/* --- Latin-1 Codec ------------------------------------------------------ */
3745
3746PyObject *PyUnicode_DecodeLatin1(const char *s,
3747				 Py_ssize_t size,
3748				 const char *errors)
3749{
3750    PyUnicodeObject *v;
3751    Py_UNICODE *p;
3752    const char *e, *unrolled_end;
3753
3754    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3755    if (size == 1) {
3756	Py_UNICODE r = *(unsigned char*)s;
3757	return PyUnicode_FromUnicode(&r, 1);
3758    }
3759
3760    v = _PyUnicode_New(size);
3761    if (v == NULL)
3762	goto onError;
3763    if (size == 0)
3764	return (PyObject *)v;
3765    p = PyUnicode_AS_UNICODE(v);
3766    e = s + size;
3767    /* Unrolling the copy makes it much faster by reducing the looping
3768       overhead. This is similar to what many memcpy() implementations do. */
3769    unrolled_end = e - 4;
3770    while (s < unrolled_end) {
3771        p[0] = (unsigned char) s[0];
3772        p[1] = (unsigned char) s[1];
3773        p[2] = (unsigned char) s[2];
3774        p[3] = (unsigned char) s[3];
3775        s += 4;
3776        p += 4;
3777    }
3778    while (s < e)
3779        *p++ = (unsigned char) *s++;
3780    return (PyObject *)v;
3781
3782 onError:
3783    Py_XDECREF(v);
3784    return NULL;
3785}
3786
3787/* create or adjust a UnicodeEncodeError */
3788static void make_encode_exception(PyObject **exceptionObject,
3789    const char *encoding,
3790    const Py_UNICODE *unicode, Py_ssize_t size,
3791    Py_ssize_t startpos, Py_ssize_t endpos,
3792    const char *reason)
3793{
3794    if (*exceptionObject == NULL) {
3795	*exceptionObject = PyUnicodeEncodeError_Create(
3796	    encoding, unicode, size, startpos, endpos, reason);
3797    }
3798    else {
3799	if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3800	    goto onError;
3801	if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3802	    goto onError;
3803	if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3804	    goto onError;
3805	return;
3806	onError:
3807	Py_DECREF(*exceptionObject);
3808	*exceptionObject = NULL;
3809    }
3810}
3811
3812/* raises a UnicodeEncodeError */
3813static void raise_encode_exception(PyObject **exceptionObject,
3814    const char *encoding,
3815    const Py_UNICODE *unicode, Py_ssize_t size,
3816    Py_ssize_t startpos, Py_ssize_t endpos,
3817    const char *reason)
3818{
3819    make_encode_exception(exceptionObject,
3820	encoding, unicode, size, startpos, endpos, reason);
3821    if (*exceptionObject != NULL)
3822	PyCodec_StrictErrors(*exceptionObject);
3823}
3824
3825/* error handling callback helper:
3826   build arguments, call the callback and check the arguments,
3827   put the result into newpos and return the replacement string, which
3828   has to be freed by the caller */
3829static PyObject *unicode_encode_call_errorhandler(const char *errors,
3830    PyObject **errorHandler,
3831    const char *encoding, const char *reason,
3832    const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3833    Py_ssize_t startpos, Py_ssize_t endpos,
3834    Py_ssize_t *newpos)
3835{
3836    static char *argparse = "O!n;encoding error handler must return (str, int) tuple";
3837
3838    PyObject *restuple;
3839    PyObject *resunicode;
3840
3841    if (*errorHandler == NULL) {
3842	*errorHandler = PyCodec_LookupError(errors);
3843        if (*errorHandler == NULL)
3844	    return NULL;
3845    }
3846
3847    make_encode_exception(exceptionObject,
3848	encoding, unicode, size, startpos, endpos, reason);
3849    if (*exceptionObject == NULL)
3850	return NULL;
3851
3852    restuple = PyObject_CallFunctionObjArgs(
3853	*errorHandler, *exceptionObject, NULL);
3854    if (restuple == NULL)
3855	return NULL;
3856    if (!PyTuple_Check(restuple)) {
3857	PyErr_Format(PyExc_TypeError, &argparse[4]);
3858	Py_DECREF(restuple);
3859	return NULL;
3860    }
3861    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3862	&resunicode, newpos)) {
3863	Py_DECREF(restuple);
3864	return NULL;
3865    }
3866    if (*newpos<0)
3867	*newpos = size+*newpos;
3868    if (*newpos<0 || *newpos>size) {
3869	PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3870	Py_DECREF(restuple);
3871	return NULL;
3872    }
3873    Py_INCREF(resunicode);
3874    Py_DECREF(restuple);
3875    return resunicode;
3876}
3877
3878static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
3879				 Py_ssize_t size,
3880				 const char *errors,
3881				 int limit)
3882{
3883    /* output object */
3884    PyObject *res;
3885    /* pointers to the beginning and end+1 of input */
3886    const Py_UNICODE *startp = p;
3887    const Py_UNICODE *endp = p + size;
3888    /* pointer to the beginning of the unencodable characters */
3889    /* const Py_UNICODE *badp = NULL; */
3890    /* pointer into the output */
3891    char *str;
3892    /* current output position */
3893    Py_ssize_t ressize;
3894    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3895    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3896    PyObject *errorHandler = NULL;
3897    PyObject *exc = NULL;
3898    /* the following variable is used for caching string comparisons
3899     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3900    int known_errorHandler = -1;
3901
3902    /* allocate enough for a simple encoding without
3903       replacements, if we need more, we'll resize */
3904    if (size == 0)
3905        return PyBytes_FromStringAndSize(NULL, 0);
3906    res = PyBytes_FromStringAndSize(NULL, size);
3907    if (res == NULL)
3908        return NULL;
3909    str = PyBytes_AS_STRING(res);
3910    ressize = size;
3911
3912    while (p<endp) {
3913	Py_UNICODE c = *p;
3914
3915	/* can we encode this? */
3916	if (c<limit) {
3917	    /* no overflow check, because we know that the space is enough */
3918	    *str++ = (char)c;
3919	    ++p;
3920	}
3921	else {
3922	    Py_ssize_t unicodepos = p-startp;
3923	    Py_ssize_t requiredsize;
3924	    PyObject *repunicode;
3925	    Py_ssize_t repsize;
3926	    Py_ssize_t newpos;
3927	    Py_ssize_t respos;
3928	    Py_UNICODE *uni2;
3929	    /* startpos for collecting unencodable chars */
3930	    const Py_UNICODE *collstart = p;
3931	    const Py_UNICODE *collend = p;
3932	    /* find all unecodable characters */
3933	    while ((collend < endp) && ((*collend)>=limit))
3934		++collend;
3935	    /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3936	    if (known_errorHandler==-1) {
3937		if ((errors==NULL) || (!strcmp(errors, "strict")))
3938		    known_errorHandler = 1;
3939		else if (!strcmp(errors, "replace"))
3940		    known_errorHandler = 2;
3941		else if (!strcmp(errors, "ignore"))
3942		    known_errorHandler = 3;
3943		else if (!strcmp(errors, "xmlcharrefreplace"))
3944		    known_errorHandler = 4;
3945		else
3946		    known_errorHandler = 0;
3947	    }
3948	    switch (known_errorHandler) {
3949		case 1: /* strict */
3950		    raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3951		    goto onError;
3952		case 2: /* replace */
3953		    while (collstart++<collend)
3954			*str++ = '?'; /* fall through */
3955		case 3: /* ignore */
3956		    p = collend;
3957		    break;
3958		case 4: /* xmlcharrefreplace */
3959		    respos = str - PyBytes_AS_STRING(res);
3960		    /* determine replacement size (temporarily (mis)uses p) */
3961		    for (p = collstart, repsize = 0; p < collend; ++p) {
3962			if (*p<10)
3963			    repsize += 2+1+1;
3964			else if (*p<100)
3965			    repsize += 2+2+1;
3966			else if (*p<1000)
3967			    repsize += 2+3+1;
3968			else if (*p<10000)
3969			    repsize += 2+4+1;
3970#ifndef Py_UNICODE_WIDE
3971			else
3972			    repsize += 2+5+1;
3973#else
3974			else if (*p<100000)
3975			    repsize += 2+5+1;
3976			else if (*p<1000000)
3977			    repsize += 2+6+1;
3978			else
3979			    repsize += 2+7+1;
3980#endif
3981		    }
3982		    requiredsize = respos+repsize+(endp-collend);
3983		    if (requiredsize > ressize) {
3984			if (requiredsize<2*ressize)
3985			    requiredsize = 2*ressize;
3986			if (_PyBytes_Resize(&res, requiredsize))
3987			    goto onError;
3988			str = PyBytes_AS_STRING(res) + respos;
3989			ressize = requiredsize;
3990		    }
3991		    /* generate replacement (temporarily (mis)uses p) */
3992		    for (p = collstart; p < collend; ++p) {
3993			str += sprintf(str, "&#%d;", (int)*p);
3994		    }
3995		    p = collend;
3996		    break;
3997		default:
3998		    repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3999			encoding, reason, startp, size, &exc,
4000			collstart-startp, collend-startp, &newpos);
4001		    if (repunicode == NULL)
4002			goto onError;
4003		    /* need more space? (at least enough for what we
4004		       have+the replacement+the rest of the string, so
4005		       we won't have to check space for encodable characters) */
4006		    respos = str - PyBytes_AS_STRING(res);
4007		    repsize = PyUnicode_GET_SIZE(repunicode);
4008		    requiredsize = respos+repsize+(endp-collend);
4009		    if (requiredsize > ressize) {
4010			if (requiredsize<2*ressize)
4011			    requiredsize = 2*ressize;
4012			if (_PyBytes_Resize(&res, requiredsize)) {
4013			    Py_DECREF(repunicode);
4014			    goto onError;
4015			}
4016			str = PyBytes_AS_STRING(res) + respos;
4017			ressize = requiredsize;
4018		    }
4019		    /* check if there is anything unencodable in the replacement
4020		       and copy it to the output */
4021		    for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4022			c = *uni2;
4023			if (c >= limit) {
4024			    raise_encode_exception(&exc, encoding, startp, size,
4025				unicodepos, unicodepos+1, reason);
4026			    Py_DECREF(repunicode);
4027			    goto onError;
4028			}
4029			*str = (char)c;
4030		    }
4031		    p = startp + newpos;
4032		    Py_DECREF(repunicode);
4033	    }
4034	}
4035    }
4036    /* Resize if we allocated to much */
4037    size = str - PyBytes_AS_STRING(res);
4038    if (size < ressize) { /* If this falls res will be NULL */
4039        assert(size >= 0);
4040        if (_PyBytes_Resize(&res, size) < 0)
4041            goto onError;
4042    }
4043
4044    Py_XDECREF(errorHandler);
4045    Py_XDECREF(exc);
4046    return res;
4047
4048  onError:
4049    Py_XDECREF(res);
4050    Py_XDECREF(errorHandler);
4051    Py_XDECREF(exc);
4052    return NULL;
4053}
4054
4055PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
4056				 Py_ssize_t size,
4057				 const char *errors)
4058{
4059    return unicode_encode_ucs1(p, size, errors, 256);
4060}
4061
4062PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4063{
4064    if (!PyUnicode_Check(unicode)) {
4065	PyErr_BadArgument();
4066	return NULL;
4067    }
4068    return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
4069				  PyUnicode_GET_SIZE(unicode),
4070				  NULL);
4071}
4072
4073/* --- 7-bit ASCII Codec -------------------------------------------------- */
4074
4075PyObject *PyUnicode_DecodeASCII(const char *s,
4076				Py_ssize_t size,
4077				const char *errors)
4078{
4079    const char *starts = s;
4080    PyUnicodeObject *v;
4081    Py_UNICODE *p;
4082    Py_ssize_t startinpos;
4083    Py_ssize_t endinpos;
4084    Py_ssize_t outpos;
4085    const char *e;
4086    PyObject *errorHandler = NULL;
4087    PyObject *exc = NULL;
4088
4089    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4090    if (size == 1 && *(unsigned char*)s < 128) {
4091	Py_UNICODE r = *(unsigned char*)s;
4092	return PyUnicode_FromUnicode(&r, 1);
4093    }
4094
4095    v = _PyUnicode_New(size);
4096    if (v == NULL)
4097	goto onError;
4098    if (size == 0)
4099	return (PyObject *)v;
4100    p = PyUnicode_AS_UNICODE(v);
4101    e = s + size;
4102    while (s < e) {
4103	register unsigned char c = (unsigned char)*s;
4104	if (c < 128) {
4105	    *p++ = c;
4106	    ++s;
4107	}
4108	else {
4109	    startinpos = s-starts;
4110	    endinpos = startinpos + 1;
4111	    outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4112	    if (unicode_decode_call_errorhandler(
4113		 errors, &errorHandler,
4114		 "ascii", "ordinal not in range(128)",
4115		 &starts, &e, &startinpos, &endinpos, &exc, &s,
4116		 &v, &outpos, &p))
4117		goto onError;
4118	}
4119    }
4120    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
4121	if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4122	    goto onError;
4123    Py_XDECREF(errorHandler);
4124    Py_XDECREF(exc);
4125    return (PyObject *)v;
4126
4127 onError:
4128    Py_XDECREF(v);
4129    Py_XDECREF(errorHandler);
4130    Py_XDECREF(exc);
4131    return NULL;
4132}
4133
4134PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
4135				Py_ssize_t size,
4136				const char *errors)
4137{
4138    return unicode_encode_ucs1(p, size, errors, 128);
4139}
4140
4141PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4142{
4143    if (!PyUnicode_Check(unicode)) {
4144	PyErr_BadArgument();
4145	return NULL;
4146    }
4147    return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
4148				 PyUnicode_GET_SIZE(unicode),
4149				 NULL);
4150}
4151
4152#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
4153
4154/* --- MBCS codecs for Windows -------------------------------------------- */
4155
4156#if SIZEOF_INT < SIZEOF_SSIZE_T
4157#define NEED_RETRY
4158#endif
4159
4160/* XXX This code is limited to "true" double-byte encodings, as
4161   a) it assumes an incomplete character consists of a single byte, and
4162   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
4163      encodings, see IsDBCSLeadByteEx documentation. */
4164
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead byte,
   0 otherwise.  The byte must satisfy IsDBCSLeadByte(); it is then accepted
   if CharPrev() cannot step back (prev == curr), if the previous character
   does not start with a lead byte, or if the previous character is two
   bytes wide. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
4175
4176/*
4177 * Decode MBCS string into unicode object. If 'final' is set, converts
4178 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4179 */
4180static int decode_mbcs(PyUnicodeObject **v,
4181			const char *s, /* MBCS string */
4182			int size, /* sizeof MBCS string */
4183			int final)
4184{
4185    Py_UNICODE *p;
4186    Py_ssize_t n = 0;
4187    int usize = 0;
4188
4189    assert(size >= 0);
4190
4191    /* Skip trailing lead-byte unless 'final' is set */
4192    if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
4193	--size;
4194
4195    /* First get the size of the result */
4196    if (size > 0) {
4197	usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
4198	if (usize == 0) {
4199	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
4200	    return -1;
4201	}
4202    }
4203
4204    if (*v == NULL) {
4205	/* Create unicode object */
4206	*v = _PyUnicode_New(usize);
4207	if (*v == NULL)
4208	    return -1;
4209    }
4210    else {
4211	/* Extend unicode object */
4212	n = PyUnicode_GET_SIZE(*v);
4213	if (_PyUnicode_Resize(v, n + usize) < 0)
4214	    return -1;
4215    }
4216
4217    /* Do the conversion */
4218    if (size > 0) {
4219	p = PyUnicode_AS_UNICODE(*v) + n;
4220	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
4221	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
4222	    return -1;
4223	}
4224    }
4225
4226    return size;
4227}
4228
4229PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
4230					Py_ssize_t size,
4231					const char *errors,
4232					Py_ssize_t *consumed)
4233{
4234    PyUnicodeObject *v = NULL;
4235    int done;
4236
4237    if (consumed)
4238	*consumed = 0;
4239
4240#ifdef NEED_RETRY
4241  retry:
4242    if (size > INT_MAX)
4243	done = decode_mbcs(&v, s, INT_MAX, 0);
4244    else
4245#endif
4246	done = decode_mbcs(&v, s, (int)size, !consumed);
4247
4248    if (done < 0) {
4249        Py_XDECREF(v);
4250	return NULL;
4251    }
4252
4253    if (consumed)
4254	*consumed += done;
4255
4256#ifdef NEED_RETRY
4257    if (size > INT_MAX) {
4258	s += done;
4259	size -= done;
4260	goto retry;
4261    }
4262#endif
4263
4264    return (PyObject *)v;
4265}
4266
4267PyObject *PyUnicode_DecodeMBCS(const char *s,
4268				Py_ssize_t size,
4269				const char *errors)
4270{
4271    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4272}
4273
4274/*
4275 * Convert unicode into string object (MBCS).
4276 * Returns 0 if succeed, -1 otherwise.
4277 */
4278static int encode_mbcs(PyObject **repr,
4279			const Py_UNICODE *p, /* unicode */
4280			int size) /* size of unicode */
4281{
4282    int mbcssize = 0;
4283    Py_ssize_t n = 0;
4284
4285    assert(size >= 0);
4286
4287    /* First get the size of the result */
4288    if (size > 0) {
4289	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4290	if (mbcssize == 0) {
4291	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
4292	    return -1;
4293	}
4294    }
4295
4296    if (*repr == NULL) {
4297	/* Create string object */
4298	*repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4299	if (*repr == NULL)
4300	    return -1;
4301    }
4302    else {
4303	/* Extend string object */
4304	n = PyBytes_Size(*repr);
4305	if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4306	    return -1;
4307    }
4308
4309    /* Do the conversion */
4310    if (size > 0) {
4311	char *s = PyBytes_AS_STRING(*repr) + n;
4312	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4313	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
4314	    return -1;
4315	}
4316    }
4317
4318    return 0;
4319}
4320
4321PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
4322				Py_ssize_t size,
4323				const char *errors)
4324{
4325    PyObject *repr = NULL;
4326    int ret;
4327
4328#ifdef NEED_RETRY
4329 retry:
4330    if (size > INT_MAX)
4331	ret = encode_mbcs(&repr, p, INT_MAX);
4332    else
4333#endif
4334	ret = encode_mbcs(&repr, p, (int)size);
4335
4336    if (ret < 0) {
4337	Py_XDECREF(repr);
4338	return NULL;
4339    }
4340
4341#ifdef NEED_RETRY
4342    if (size > INT_MAX) {
4343	p += INT_MAX;
4344	size -= INT_MAX;
4345	goto retry;
4346    }
4347#endif
4348
4349    return repr;
4350}
4351
4352PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4353{
4354    if (!PyUnicode_Check(unicode)) {
4355        PyErr_BadArgument();
4356        return NULL;
4357    }
4358    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
4359				PyUnicode_GET_SIZE(unicode),
4360				NULL);
4361}
4362
4363#undef NEED_RETRY
4364
4365#endif /* MS_WINDOWS */
4366
4367/* --- Character Mapping Codec -------------------------------------------- */
4368
4369PyObject *PyUnicode_DecodeCharmap(const char *s,
4370				  Py_ssize_t size,
4371				  PyObject *mapping,
4372				  const char *errors)
4373{
4374    const char *starts = s;
4375    Py_ssize_t startinpos;
4376    Py_ssize_t endinpos;
4377    Py_ssize_t outpos;
4378    const char *e;
4379    PyUnicodeObject *v;
4380    Py_UNICODE *p;
4381    Py_ssize_t extrachars = 0;
4382    PyObject *errorHandler = NULL;
4383    PyObject *exc = NULL;
4384    Py_UNICODE *mapstring = NULL;
4385    Py_ssize_t maplen = 0;
4386
4387    /* Default to Latin-1 */
4388    if (mapping == NULL)
4389	return PyUnicode_DecodeLatin1(s, size, errors);
4390
4391    v = _PyUnicode_New(size);
4392    if (v == NULL)
4393	goto onError;
4394    if (size == 0)
4395	return (PyObject *)v;
4396    p = PyUnicode_AS_UNICODE(v);
4397    e = s + size;
4398    if (PyUnicode_CheckExact(mapping)) {
4399	mapstring = PyUnicode_AS_UNICODE(mapping);
4400	maplen = PyUnicode_GET_SIZE(mapping);
4401	while (s < e) {
4402	    unsigned char ch = *s;
4403	    Py_UNICODE x = 0xfffe; /* illegal value */
4404
4405	    if (ch < maplen)
4406		x = mapstring[ch];
4407
4408	    if (x == 0xfffe) {
4409		/* undefined mapping */
4410		outpos = p-PyUnicode_AS_UNICODE(v);
4411		startinpos = s-starts;
4412		endinpos = startinpos+1;
4413		if (unicode_decode_call_errorhandler(
4414		     errors, &errorHandler,
4415		     "charmap", "character maps to <undefined>",
4416		     &starts, &e, &startinpos, &endinpos, &exc, &s,
4417		     &v, &outpos, &p)) {
4418		    goto onError;
4419		}
4420		continue;
4421	    }
4422	    *p++ = x;
4423	    ++s;
4424	}
4425    }
4426    else {
4427	while (s < e) {
4428	    unsigned char ch = *s;
4429	    PyObject *w, *x;
4430
4431	    /* Get mapping (char ordinal -> integer, Unicode char or None) */
4432	    w = PyLong_FromLong((long)ch);
4433	    if (w == NULL)
4434		goto onError;
4435	    x = PyObject_GetItem(mapping, w);
4436	    Py_DECREF(w);
4437	    if (x == NULL) {
4438		if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4439		    /* No mapping found means: mapping is undefined. */
4440		    PyErr_Clear();
4441		    x = Py_None;
4442		    Py_INCREF(x);
4443		} else
4444		    goto onError;
4445	    }
4446
4447	    /* Apply mapping */
4448	    if (PyLong_Check(x)) {
4449		long value = PyLong_AS_LONG(x);
4450		if (value < 0 || value > 65535) {
4451		    PyErr_SetString(PyExc_TypeError,
4452				    "character mapping must be in range(65536)");
4453		    Py_DECREF(x);
4454		    goto onError;
4455		}
4456		*p++ = (Py_UNICODE)value;
4457	    }
4458	    else if (x == Py_None) {
4459		/* undefined mapping */
4460		outpos = p-PyUnicode_AS_UNICODE(v);
4461		startinpos = s-starts;
4462		endinpos = startinpos+1;
4463		if (unicode_decode_call_errorhandler(
4464		     errors, &errorHandler,
4465		     "charmap", "character maps to <undefined>",
4466		     &starts, &e, &startinpos, &endinpos, &exc, &s,
4467		     &v, &outpos, &p)) {
4468		    Py_DECREF(x);
4469		    goto onError;
4470		}
4471		Py_DECREF(x);
4472		continue;
4473	    }
4474	    else if (PyUnicode_Check(x)) {
4475		Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
4476
4477		if (targetsize == 1)
4478		    /* 1-1 mapping */
4479		    *p++ = *PyUnicode_AS_UNICODE(x);
4480
4481		else if (targetsize > 1) {
4482		    /* 1-n mapping */
4483		    if (targetsize > extrachars) {
4484			/* resize first */
4485			Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4486			Py_ssize_t needed = (targetsize - extrachars) + \
4487				     (targetsize << 2);
4488			extrachars += needed;
4489			/* XXX overflow detection missing */
4490			if (_PyUnicode_Resize(&v,
4491					     PyUnicode_GET_SIZE(v) + needed) < 0) {
4492			    Py_DECREF(x);
4493			    goto onError;
4494			}
4495			p = PyUnicode_AS_UNICODE(v) + oldpos;
4496		    }
4497		    Py_UNICODE_COPY(p,
4498				    PyUnicode_AS_UNICODE(x),
4499				    targetsize);
4500		    p += targetsize;
4501		    extrachars -= targetsize;
4502		}
4503		/* 1-0 mapping: skip the character */
4504	    }
4505	    else {
4506		/* wrong return value */
4507		PyErr_SetString(PyExc_TypeError,
4508		      "character mapping must return integer, None or str");
4509		Py_DECREF(x);
4510		goto onError;
4511	    }
4512	    Py_DECREF(x);
4513	    ++s;
4514	}
4515    }
4516    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
4517	if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4518	    goto onError;
4519    Py_XDECREF(errorHandler);
4520    Py_XDECREF(exc);
4521    return (PyObject *)v;
4522
4523 onError:
4524    Py_XDECREF(errorHandler);
4525    Py_XDECREF(exc);
4526    Py_XDECREF(v);
4527    return NULL;
4528}
4529
4530/* Charmap encoding: the lookup table */
4531
4532struct encoding_map{
4533  PyObject_HEAD
4534  unsigned char level1[32];
4535  int count2, count3;
4536  unsigned char level23[1];
4537};
4538
4539static PyObject*
4540encoding_map_size(PyObject *obj, PyObject* args)
4541{
4542    struct encoding_map *map = (struct encoding_map*)obj;
4543    return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4544                          128*map->count3);
4545}
4546
4547static PyMethodDef encoding_map_methods[] = {
4548	{"size", encoding_map_size, METH_NOARGS,
4549         PyDoc_STR("Return the size (in bytes) of this object") },
4550        { 0 }
4551};
4552
4553static void
4554encoding_map_dealloc(PyObject* o)
4555{
4556	PyObject_FREE(o);
4557}
4558
4559static PyTypeObject EncodingMapType = {
4560	PyVarObject_HEAD_INIT(NULL, 0)
4561        "EncodingMap",          /*tp_name*/
4562        sizeof(struct encoding_map),   /*tp_basicsize*/
4563        0,                      /*tp_itemsize*/
4564        /* methods */
4565        encoding_map_dealloc,   /*tp_dealloc*/
4566        0,                      /*tp_print*/
4567        0,                      /*tp_getattr*/
4568        0,                      /*tp_setattr*/
4569        0,                      /*tp_compare*/
4570        0,                      /*tp_repr*/
4571        0,                      /*tp_as_number*/
4572        0,                      /*tp_as_sequence*/
4573        0,                      /*tp_as_mapping*/
4574        0,                      /*tp_hash*/
4575        0,                      /*tp_call*/
4576        0,                      /*tp_str*/
4577        0,                      /*tp_getattro*/
4578        0,                      /*tp_setattro*/
4579        0,                      /*tp_as_buffer*/
4580        Py_TPFLAGS_DEFAULT,     /*tp_flags*/
4581        0,                      /*tp_doc*/
4582        0,                      /*tp_traverse*/
4583        0,                      /*tp_clear*/
4584        0,                      /*tp_richcompare*/
4585        0,                      /*tp_weaklistoffset*/
4586        0,                      /*tp_iter*/
4587        0,                      /*tp_iternext*/
4588        encoding_map_methods,   /*tp_methods*/
4589        0,                      /*tp_members*/
4590        0,                      /*tp_getset*/
4591        0,                      /*tp_base*/
4592        0,                      /*tp_dict*/
4593        0,                      /*tp_descr_get*/
4594        0,                      /*tp_descr_set*/
4595        0,                      /*tp_dictoffset*/
4596        0,                      /*tp_init*/
4597        0,                      /*tp_alloc*/
4598        0,                      /*tp_new*/
4599        0,                      /*tp_free*/
4600        0,                      /*tp_is_gc*/
4601};
4602
4603PyObject*
4604PyUnicode_BuildEncodingMap(PyObject* string)
4605{
4606    Py_UNICODE *decode;
4607    PyObject *result;
4608    struct encoding_map *mresult;
4609    int i;
4610    int need_dict = 0;
4611    unsigned char level1[32];
4612    unsigned char level2[512];
4613    unsigned char *mlevel1, *mlevel2, *mlevel3;
4614    int count2 = 0, count3 = 0;
4615
4616    if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4617        PyErr_BadArgument();
4618        return NULL;
4619    }
4620    decode = PyUnicode_AS_UNICODE(string);
4621    memset(level1, 0xFF, sizeof level1);
4622    memset(level2, 0xFF, sizeof level2);
4623
4624    /* If there isn't a one-to-one mapping of NULL to \0,
4625       or if there are non-BMP characters, we need to use
4626       a mapping dictionary. */
4627    if (decode[0] != 0)
4628        need_dict = 1;
4629    for (i = 1; i < 256; i++) {
4630        int l1, l2;
4631        if (decode[i] == 0
4632            #ifdef Py_UNICODE_WIDE
4633            || decode[i] > 0xFFFF
4634            #endif
4635        ) {
4636            need_dict = 1;
4637            break;
4638        }
4639        if (decode[i] == 0xFFFE)
4640            /* unmapped character */
4641            continue;
4642        l1 = decode[i] >> 11;
4643        l2 = decode[i] >> 7;
4644        if (level1[l1] == 0xFF)
4645            level1[l1] = count2++;
4646        if (level2[l2] == 0xFF)
4647            level2[l2] = count3++;
4648    }
4649
4650    if (count2 >= 0xFF || count3 >= 0xFF)
4651        need_dict = 1;
4652
4653    if (need_dict) {
4654        PyObject *result = PyDict_New();
4655        PyObject *key, *value;
4656        if (!result)
4657            return NULL;
4658        for (i = 0; i < 256; i++) {
4659            key = value = NULL;
4660            key = PyLong_FromLong(decode[i]);
4661            value = PyLong_FromLong(i);
4662            if (!key || !value)
4663                goto failed1;
4664            if (PyDict_SetItem(result, key, value) == -1)
4665                goto failed1;
4666            Py_DECREF(key);
4667            Py_DECREF(value);
4668        }
4669        return result;
4670      failed1:
4671        Py_XDECREF(key);
4672        Py_XDECREF(value);
4673        Py_DECREF(result);
4674        return NULL;
4675    }
4676
4677    /* Create a three-level trie */
4678    result = PyObject_MALLOC(sizeof(struct encoding_map) +
4679                             16*count2 + 128*count3 - 1);
4680    if (!result)
4681        return PyErr_NoMemory();
4682    PyObject_Init(result, &EncodingMapType);
4683    mresult = (struct encoding_map*)result;
4684    mresult->count2 = count2;
4685    mresult->count3 = count3;
4686    mlevel1 = mresult->level1;
4687    mlevel2 = mresult->level23;
4688    mlevel3 = mresult->level23 + 16*count2;
4689    memcpy(mlevel1, level1, 32);
4690    memset(mlevel2, 0xFF, 16*count2);
4691    memset(mlevel3, 0, 128*count3);
4692    count3 = 0;
4693    for (i = 1; i < 256; i++) {
4694        int o1, o2, o3, i2, i3;
4695        if (decode[i] == 0xFFFE)
4696            /* unmapped character */
4697            continue;
4698        o1 = decode[i]>>11;
4699        o2 = (decode[i]>>7) & 0xF;
4700        i2 = 16*mlevel1[o1] + o2;
4701        if (mlevel2[i2] == 0xFF)
4702            mlevel2[i2] = count3++;
4703        o3 = decode[i] & 0x7F;
4704        i3 = 128*mlevel2[i2] + o3;
4705        mlevel3[i3] = i;
4706    }
4707    return result;
4708}
4709
4710static int
4711encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4712{
4713    struct encoding_map *map = (struct encoding_map*)mapping;
4714    int l1 = c>>11;
4715    int l2 = (c>>7) & 0xF;
4716    int l3 = c & 0x7F;
4717    int i;
4718
4719#ifdef Py_UNICODE_WIDE
4720    if (c > 0xFFFF) {
4721	return -1;
4722    }
4723#endif
4724    if (c == 0)
4725        return 0;
4726    /* level 1*/
4727    i = map->level1[l1];
4728    if (i == 0xFF) {
4729        return -1;
4730    }
4731    /* level 2*/
4732    i = map->level23[16*i+l2];
4733    if (i == 0xFF) {
4734        return -1;
4735    }
4736    /* level 3 */
4737    i = map->level23[16*map->count2 + 128*i + l3];
4738    if (i == 0) {
4739        return -1;
4740    }
4741    return i;
4742}
4743
4744/* Lookup the character ch in the mapping. If the character
4745   can't be found, Py_None is returned (or NULL, if another
4746   error occurred). */
4747static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
4748{
4749    PyObject *w = PyLong_FromLong((long)c);
4750    PyObject *x;
4751
4752    if (w == NULL)
4753	 return NULL;
4754    x = PyObject_GetItem(mapping, w);
4755    Py_DECREF(w);
4756    if (x == NULL) {
4757	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4758	    /* No mapping found means: mapping is undefined. */
4759	    PyErr_Clear();
4760	    x = Py_None;
4761	    Py_INCREF(x);
4762	    return x;
4763	} else
4764	    return NULL;
4765    }
4766    else if (x == Py_None)
4767	return x;
4768    else if (PyLong_Check(x)) {
4769	long value = PyLong_AS_LONG(x);
4770	if (value < 0 || value > 255) {
4771	    PyErr_SetString(PyExc_TypeError,
4772			     "character mapping must be in range(256)");
4773	    Py_DECREF(x);
4774	    return NULL;
4775	}
4776	return x;
4777    }
4778    else if (PyBytes_Check(x))
4779	return x;
4780    else {
4781	/* wrong return value */
4782	PyErr_Format(PyExc_TypeError,
4783                "character mapping must return integer, bytes or None, not %.400s",
4784                x->ob_type->tp_name);
4785	Py_DECREF(x);
4786	return NULL;
4787    }
4788}
4789
4790static int
4791charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4792{
4793	Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
4794	/* exponentially overallocate to minimize reallocations */
4795	if (requiredsize < 2*outsize)
4796	    requiredsize = 2*outsize;
4797	if (_PyBytes_Resize(outobj, requiredsize))
4798	    return -1;
4799	return 0;
4800}
4801
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an exception was raised */
} charmapencode_result;
4805/* lookup the character, put the result in the output string and adjust
4806   various state variables. Resize the output bytes object if not enough
4807   space is available. Return a new reference to the object that
4808   was put in the output buffer, or Py_None, if the mapping was undefined
4809   (in which case no character was written) or NULL, if a
4810   reallocation error occurred. The caller must decref the result */
4811static
4812charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
4813    PyObject **outobj, Py_ssize_t *outpos)
4814{
4815    PyObject *rep;
4816    char *outstart;
4817    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
4818
4819    if (Py_TYPE(mapping) == &EncodingMapType) {
4820        int res = encoding_map_lookup(c, mapping);
4821	Py_ssize_t requiredsize = *outpos+1;
4822        if (res == -1)
4823            return enc_FAILED;
4824	if (outsize<requiredsize)
4825	    if (charmapencode_resize(outobj, outpos, requiredsize))
4826		return enc_EXCEPTION;
4827        outstart = PyBytes_AS_STRING(*outobj);
4828	outstart[(*outpos)++] = (char)res;
4829	return enc_SUCCESS;
4830    }
4831
4832    rep = charmapencode_lookup(c, mapping);
4833    if (rep==NULL)
4834	return enc_EXCEPTION;
4835    else if (rep==Py_None) {
4836	Py_DECREF(rep);
4837	return enc_FAILED;
4838    } else {
4839	if (PyLong_Check(rep)) {
4840	    Py_ssize_t requiredsize = *outpos+1;
4841	    if (outsize<requiredsize)
4842		if (charmapencode_resize(outobj, outpos, requiredsize)) {
4843		    Py_DECREF(rep);
4844		    return enc_EXCEPTION;
4845		}
4846            outstart = PyBytes_AS_STRING(*outobj);
4847	    outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
4848	}
4849	else {
4850	    const char *repchars = PyBytes_AS_STRING(rep);
4851	    Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
4852	    Py_ssize_t requiredsize = *outpos+repsize;
4853	    if (outsize<requiredsize)
4854		if (charmapencode_resize(outobj, outpos, requiredsize)) {
4855		    Py_DECREF(rep);
4856		    return enc_EXCEPTION;
4857		}
4858            outstart = PyBytes_AS_STRING(*outobj);
4859	    memcpy(outstart + *outpos, repchars, repsize);
4860	    *outpos += repsize;
4861	}
4862    }
4863    Py_DECREF(rep);
4864    return enc_SUCCESS;
4865}
4866
4867/* handle an error in PyUnicode_EncodeCharmap
4868   Return 0 on success, -1 on error */
4869static
4870int charmap_encoding_error(
4871    const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
4872    PyObject **exceptionObject,
4873    int *known_errorHandler, PyObject **errorHandler, const char *errors,
4874    PyObject **res, Py_ssize_t *respos)
4875{
4876    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4877    Py_ssize_t repsize;
4878    Py_ssize_t newpos;
4879    Py_UNICODE *uni2;
4880    /* startpos for collecting unencodable chars */
4881    Py_ssize_t collstartpos = *inpos;
4882    Py_ssize_t collendpos = *inpos+1;
4883    Py_ssize_t collpos;
4884    char *encoding = "charmap";
4885    char *reason = "character maps to <undefined>";
4886    charmapencode_result x;
4887
4888    /* find all unencodable characters */
4889    while (collendpos < size) {
4890        PyObject *rep;
4891        if (Py_TYPE(mapping) == &EncodingMapType) {
4892	    int res = encoding_map_lookup(p[collendpos], mapping);
4893	    if (res != -1)
4894		break;
4895	    ++collendpos;
4896	    continue;
4897	}
4898
4899	rep = charmapencode_lookup(p[collendpos], mapping);
4900	if (rep==NULL)
4901	    return -1;
4902	else if (rep!=Py_None) {
4903	    Py_DECREF(rep);
4904	    break;
4905	}
4906	Py_DECREF(rep);
4907	++collendpos;
4908    }
4909    /* cache callback name lookup
4910     * (if not done yet, i.e. it's the first error) */
4911    if (*known_errorHandler==-1) {
4912	if ((errors==NULL) || (!strcmp(errors, "strict")))
4913	    *known_errorHandler = 1;
4914	else if (!strcmp(errors, "replace"))
4915	    *known_errorHandler = 2;
4916	else if (!strcmp(errors, "ignore"))
4917	    *known_errorHandler = 3;
4918	else if (!strcmp(errors, "xmlcharrefreplace"))
4919	    *known_errorHandler = 4;
4920	else
4921	    *known_errorHandler = 0;
4922    }
4923    switch (*known_errorHandler) {
4924	case 1: /* strict */
4925	    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4926	    return -1;
4927	case 2: /* replace */
4928	    for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4929		x = charmapencode_output('?', mapping, res, respos);
4930		if (x==enc_EXCEPTION) {
4931		    return -1;
4932		}
4933		else if (x==enc_FAILED) {
4934		    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4935		    return -1;
4936		}
4937	    }
4938	    /* fall through */
4939	case 3: /* ignore */
4940	    *inpos = collendpos;
4941	    break;
4942	case 4: /* xmlcharrefreplace */
4943	    /* generate replacement (temporarily (mis)uses p) */
4944	    for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4945		char buffer[2+29+1+1];
4946		char *cp;
4947		sprintf(buffer, "&#%d;", (int)p[collpos]);
4948		for (cp = buffer; *cp; ++cp) {
4949		    x = charmapencode_output(*cp, mapping, res, respos);
4950		    if (x==enc_EXCEPTION)
4951			return -1;
4952		    else if (x==enc_FAILED) {
4953			raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4954			return -1;
4955		    }
4956		}
4957	    }
4958	    *inpos = collendpos;
4959	    break;
4960	default:
4961	    repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
4962		encoding, reason, p, size, exceptionObject,
4963		collstartpos, collendpos, &newpos);
4964	    if (repunicode == NULL)
4965		return -1;
4966	    /* generate replacement  */
4967	    repsize = PyUnicode_GET_SIZE(repunicode);
4968	    for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4969		x = charmapencode_output(*uni2, mapping, res, respos);
4970		if (x==enc_EXCEPTION) {
4971		    return -1;
4972		}
4973		else if (x==enc_FAILED) {
4974		    Py_DECREF(repunicode);
4975		    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4976		    return -1;
4977		}
4978	    }
4979	    *inpos = newpos;
4980	    Py_DECREF(repunicode);
4981    }
4982    return 0;
4983}
4984
4985PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
4986				  Py_ssize_t size,
4987				  PyObject *mapping,
4988				  const char *errors)
4989{
4990    /* output object */
4991    PyObject *res = NULL;
4992    /* current input position */
4993    Py_ssize_t inpos = 0;
4994    /* current output position */
4995    Py_ssize_t respos = 0;
4996    PyObject *errorHandler = NULL;
4997    PyObject *exc = NULL;
4998    /* the following variable is used for caching string comparisons
4999     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5000     * 3=ignore, 4=xmlcharrefreplace */
5001    int known_errorHandler = -1;
5002
5003    /* Default to Latin-1 */
5004    if (mapping == NULL)
5005	return PyUnicode_EncodeLatin1(p, size, errors);
5006
5007    /* allocate enough for a simple encoding without
5008       replacements, if we need more, we'll resize */
5009    res = PyBytes_FromStringAndSize(NULL, size);
5010    if (res == NULL)
5011        goto onError;
5012    if (size == 0)
5013	return res;
5014
5015    while (inpos<size) {
5016	/* try to encode it */
5017	charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5018	if (x==enc_EXCEPTION) /* error */
5019	    goto onError;
5020	if (x==enc_FAILED) { /* unencodable character */
5021	    if (charmap_encoding_error(p, size, &inpos, mapping,
5022		&exc,
5023		&known_errorHandler, &errorHandler, errors,
5024		&res, &respos)) {
5025		goto onError;
5026	    }
5027	}
5028	else
5029	    /* done with this character => adjust input position */
5030	    ++inpos;
5031    }
5032
5033    /* Resize if we allocated to much */
5034    if (respos<PyBytes_GET_SIZE(res))
5035        if (_PyBytes_Resize(&res, respos) < 0)
5036            goto onError;
5037
5038    Py_XDECREF(exc);
5039    Py_XDECREF(errorHandler);
5040    return res;
5041
5042    onError:
5043    Py_XDECREF(res);
5044    Py_XDECREF(exc);
5045    Py_XDECREF(errorHandler);
5046    return NULL;
5047}
5048
5049PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
5050				    PyObject *mapping)
5051{
5052    if (!PyUnicode_Check(unicode) || mapping == NULL) {
5053	PyErr_BadArgument();
5054	return NULL;
5055    }
5056    return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
5057				   PyUnicode_GET_SIZE(unicode),
5058				   mapping,
5059				   NULL);
5060}
5061
5062/* create or adjust a UnicodeTranslateError */
5063static void make_translate_exception(PyObject **exceptionObject,
5064    const Py_UNICODE *unicode, Py_ssize_t size,
5065    Py_ssize_t startpos, Py_ssize_t endpos,
5066    const char *reason)
5067{
5068    if (*exceptionObject == NULL) {
5069    	*exceptionObject = PyUnicodeTranslateError_Create(
5070	    unicode, size, startpos, endpos, reason);
5071    }
5072    else {
5073	if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5074	    goto onError;
5075	if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5076	    goto onError;
5077	if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5078	    goto onError;
5079	return;
5080	onError:
5081	Py_DECREF(*exceptionObject);
5082	*exceptionObject = NULL;
5083    }
5084}
5085
5086/* raises a UnicodeTranslateError */
5087static void raise_translate_exception(PyObject **exceptionObject,
5088    const Py_UNICODE *unicode, Py_ssize_t size,
5089    Py_ssize_t startpos, Py_ssize_t endpos,
5090    const char *reason)
5091{
5092    make_translate_exception(exceptionObject,
5093	unicode, size, startpos, endpos, reason);
5094    if (*exceptionObject != NULL)
5095	PyCodec_StrictErrors(*exceptionObject);
5096}
5097
5098/* error handling callback helper:
5099   build arguments, call the callback and check the arguments,
5100   put the result into newpos and return the replacement string, which
5101   has to be freed by the caller */
5102static PyObject *unicode_translate_call_errorhandler(const char *errors,
5103    PyObject **errorHandler,
5104    const char *reason,
5105    const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5106    Py_ssize_t startpos, Py_ssize_t endpos,
5107    Py_ssize_t *newpos)
5108{
5109    static char *argparse = "O!n;translating error handler must return (str, int) tuple";
5110
5111    Py_ssize_t i_newpos;
5112    PyObject *restuple;
5113    PyObject *resunicode;
5114
5115    if (*errorHandler == NULL) {
5116	*errorHandler = PyCodec_LookupError(errors);
5117        if (*errorHandler == NULL)
5118	    return NULL;
5119    }
5120
5121    make_translate_exception(exceptionObject,
5122	unicode, size, startpos, endpos, reason);
5123    if (*exceptionObject == NULL)
5124	return NULL;
5125
5126    restuple = PyObject_CallFunctionObjArgs(
5127	*errorHandler, *exceptionObject, NULL);
5128    if (restuple == NULL)
5129	return NULL;
5130    if (!PyTuple_Check(restuple)) {
5131	PyErr_Format(PyExc_TypeError, &argparse[4]);
5132	Py_DECREF(restuple);
5133	return NULL;
5134    }
5135    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
5136	&resunicode, &i_newpos)) {
5137	Py_DECREF(restuple);
5138	return NULL;
5139    }
5140    if (i_newpos<0)
5141	*newpos = size+i_newpos;
5142    else
5143        *newpos = i_newpos;
5144    if (*newpos<0 || *newpos>size) {
5145	PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5146	Py_DECREF(restuple);
5147	return NULL;
5148    }
5149    Py_INCREF(resunicode);
5150    Py_DECREF(restuple);
5151    return resunicode;
5152}
5153
5154/* Lookup the character ch in the mapping and put the result in result,
5155   which must be decrefed by the caller.
5156   Return 0 on success, -1 on error */
5157static
5158int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5159{
5160    PyObject *w = PyLong_FromLong((long)c);
5161    PyObject *x;
5162
5163    if (w == NULL)
5164	 return -1;
5165    x = PyObject_GetItem(mapping, w);
5166    Py_DECREF(w);
5167    if (x == NULL) {
5168	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5169	    /* No mapping found means: use 1:1 mapping. */
5170	    PyErr_Clear();
5171	    *result = NULL;
5172	    return 0;
5173	} else
5174	    return -1;
5175    }
5176    else if (x == Py_None) {
5177	*result = x;
5178	return 0;
5179    }
5180    else if (PyLong_Check(x)) {
5181	long value = PyLong_AS_LONG(x);
5182	long max = PyUnicode_GetMax();
5183	if (value < 0 || value > max) {
5184	    PyErr_Format(PyExc_TypeError,
5185                         "character mapping must be in range(0x%x)", max+1);
5186	    Py_DECREF(x);
5187	    return -1;
5188	}
5189	*result = x;
5190	return 0;
5191    }
5192    else if (PyUnicode_Check(x)) {
5193	*result = x;
5194	return 0;
5195    }
5196    else {
5197	/* wrong return value */
5198	PyErr_SetString(PyExc_TypeError,
5199	      "character mapping must return integer, None or str");
5200	Py_DECREF(x);
5201	return -1;
5202    }
5203}
5204/* ensure that *outobj is at least requiredsize characters long,
5205if not reallocate and adjust various state variables.
5206Return 0 on success, -1 on error */
5207static
5208int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
5209    Py_ssize_t requiredsize)
5210{
5211    Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
5212    if (requiredsize > oldsize) {
5213	/* remember old output position */
5214	Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5215	/* exponentially overallocate to minimize reallocations */
5216	if (requiredsize < 2 * oldsize)
5217	    requiredsize = 2 * oldsize;
5218	if (PyUnicode_Resize(outobj, requiredsize) < 0)
5219	    return -1;
5220	*outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
5221    }
5222    return 0;
5223}
5224/* lookup the character, put the result in the output string and adjust
5225   various state variables. Return a new reference to the object that
5226   was put in the output buffer in *result, or Py_None, if the mapping was
5227   undefined (in which case no character was written).
5228   The called must decref result.
5229   Return 0 on success, -1 on error. */
5230static
5231int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
5232    Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5233    PyObject **res)
5234{
5235    if (charmaptranslate_lookup(*curinp, mapping, res))
5236	return -1;
5237    if (*res==NULL) {
5238	/* not found => default to 1:1 mapping */
5239	*(*outp)++ = *curinp;
5240    }
5241    else if (*res==Py_None)
5242	;
5243    else if (PyLong_Check(*res)) {
5244	/* no overflow check, because we know that the space is enough */
5245	*(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
5246    }
5247    else if (PyUnicode_Check(*res)) {
5248	Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5249	if (repsize==1) {
5250	    /* no overflow check, because we know that the space is enough */
5251	    *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5252	}
5253	else if (repsize!=0) {
5254	    /* more than one character */
5255	    Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5256		(insize - (curinp-startinp)) +
5257		repsize - 1;
5258	    if (charmaptranslate_makespace(outobj, outp, requiredsize))
5259		return -1;
5260	    memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5261	    *outp += repsize;
5262	}
5263    }
5264    else
5265	return -1;
5266    return 0;
5267}
5268
5269PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
5270				     Py_ssize_t size,
5271				     PyObject *mapping,
5272				     const char *errors)
5273{
5274    /* output object */
5275    PyObject *res = NULL;
5276    /* pointers to the beginning and end+1 of input */
5277    const Py_UNICODE *startp = p;
5278    const Py_UNICODE *endp = p + size;
5279    /* pointer into the output */
5280    Py_UNICODE *str;
5281    /* current output position */
5282    Py_ssize_t respos = 0;
5283    char *reason = "character maps to <undefined>";
5284    PyObject *errorHandler = NULL;
5285    PyObject *exc = NULL;
5286    /* the following variable is used for caching string comparisons
5287     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5288     * 3=ignore, 4=xmlcharrefreplace */
5289    int known_errorHandler = -1;
5290
5291    if (mapping == NULL) {
5292	PyErr_BadArgument();
5293	return NULL;
5294    }
5295
5296    /* allocate enough for a simple 1:1 translation without
5297       replacements, if we need more, we'll resize */
5298    res = PyUnicode_FromUnicode(NULL, size);
5299    if (res == NULL)
5300	goto onError;
5301    if (size == 0)
5302	return res;
5303    str = PyUnicode_AS_UNICODE(res);
5304
5305    while (p<endp) {
5306	/* try to encode it */
5307	PyObject *x = NULL;
5308	if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5309	    Py_XDECREF(x);
5310	    goto onError;
5311	}
5312	Py_XDECREF(x);
5313	if (x!=Py_None) /* it worked => adjust input pointer */
5314	    ++p;
5315	else { /* untranslatable character */
5316	    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5317	    Py_ssize_t repsize;
5318	    Py_ssize_t newpos;
5319	    Py_UNICODE *uni2;
5320	    /* startpos for collecting untranslatable chars */
5321	    const Py_UNICODE *collstart = p;
5322	    const Py_UNICODE *collend = p+1;
5323	    const Py_UNICODE *coll;
5324
5325	    /* find all untranslatable characters */
5326	    while (collend < endp) {
5327		if (charmaptranslate_lookup(*collend, mapping, &x))
5328		    goto onError;
5329		Py_XDECREF(x);
5330		if (x!=Py_None)
5331		    break;
5332		++collend;
5333	    }
5334	    /* cache callback name lookup
5335	     * (if not done yet, i.e. it's the first error) */
5336	    if (known_errorHandler==-1) {
5337		if ((errors==NULL) || (!strcmp(errors, "strict")))
5338		    known_errorHandler = 1;
5339		else if (!strcmp(errors, "replace"))
5340		    known_errorHandler = 2;
5341		else if (!strcmp(errors, "ignore"))
5342		    known_errorHandler = 3;
5343		else if (!strcmp(errors, "xmlcharrefreplace"))
5344		    known_errorHandler = 4;
5345		else
5346		    known_errorHandler = 0;
5347	    }
5348	    switch (known_errorHandler) {
5349		case 1: /* strict */
5350		    raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
5351		    goto onError;
5352		case 2: /* replace */
5353		    /* No need to check for space, this is a 1:1 replacement */
5354		    for (coll = collstart; coll<collend; ++coll)
5355			*str++ = '?';
5356		    /* fall through */
5357		case 3: /* ignore */
5358		    p = collend;
5359		    break;
5360		case 4: /* xmlcharrefreplace */
5361		    /* generate replacement (temporarily (mis)uses p) */
5362		    for (p = collstart; p < collend; ++p) {
5363			char buffer[2+29+1+1];
5364			char *cp;
5365			sprintf(buffer, "&#%d;", (int)*p);
5366			if (charmaptranslate_makespace(&res, &str,
5367			    (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5368			    goto onError;
5369			for (cp = buffer; *cp; ++cp)
5370			    *str++ = *cp;
5371		    }
5372		    p = collend;
5373		    break;
5374		default:
5375		    repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5376			reason, startp, size, &exc,
5377			collstart-startp, collend-startp, &newpos);
5378		    if (repunicode == NULL)
5379			goto onError;
5380		    /* generate replacement  */
5381		    repsize = PyUnicode_GET_SIZE(repunicode);
5382		    if (charmaptranslate_makespace(&res, &str,
5383			(str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5384			Py_DECREF(repunicode);
5385			goto onError;
5386		    }
5387		    for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5388			*str++ = *uni2;
5389		    p = startp + newpos;
5390		    Py_DECREF(repunicode);
5391	    }
5392	}
5393    }
5394    /* Resize if we allocated to much */
5395    respos = str-PyUnicode_AS_UNICODE(res);
5396    if (respos<PyUnicode_GET_SIZE(res)) {
5397	if (PyUnicode_Resize(&res, respos) < 0)
5398	    goto onError;
5399    }
5400    Py_XDECREF(exc);
5401    Py_XDECREF(errorHandler);
5402    return res;
5403
5404    onError:
5405    Py_XDECREF(res);
5406    Py_XDECREF(exc);
5407    Py_XDECREF(errorHandler);
5408    return NULL;
5409}
5410
5411PyObject *PyUnicode_Translate(PyObject *str,
5412			      PyObject *mapping,
5413			      const char *errors)
5414{
5415    PyObject *result;
5416
5417    str = PyUnicode_FromObject(str);
5418    if (str == NULL)
5419	goto onError;
5420    result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5421					PyUnicode_GET_SIZE(str),
5422					mapping,
5423					errors);
5424    Py_DECREF(str);
5425    return result;
5426
5427 onError:
5428    Py_XDECREF(str);
5429    return NULL;
5430}
5431
5432/* --- Decimal Encoder ---------------------------------------------------- */
5433
5434int PyUnicode_EncodeDecimal(Py_UNICODE *s,
5435			    Py_ssize_t length,
5436			    char *output,
5437			    const char *errors)
5438{
5439    Py_UNICODE *p, *end;
5440    PyObject *errorHandler = NULL;
5441    PyObject *exc = NULL;
5442    const char *encoding = "decimal";
5443    const char *reason = "invalid decimal Unicode string";
5444    /* the following variable is used for caching string comparisons
5445     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5446    int known_errorHandler = -1;
5447
5448    if (output == NULL) {
5449	PyErr_BadArgument();
5450	return -1;
5451    }
5452
5453    p = s;
5454    end = s + length;
5455    while (p < end) {
5456	register Py_UNICODE ch = *p;
5457	int decimal;
5458	PyObject *repunicode;
5459	Py_ssize_t repsize;
5460	Py_ssize_t newpos;
5461	Py_UNICODE *uni2;
5462	Py_UNICODE *collstart;
5463	Py_UNICODE *collend;
5464
5465	if (Py_UNICODE_ISSPACE(ch)) {
5466	    *output++ = ' ';
5467	    ++p;
5468	    continue;
5469	}
5470	decimal = Py_UNICODE_TODECIMAL(ch);
5471	if (decimal >= 0) {
5472	    *output++ = '0' + decimal;
5473	    ++p;
5474	    continue;
5475	}
5476	if (0 < ch && ch < 256) {
5477	    *output++ = (char)ch;
5478	    ++p;
5479	    continue;
5480	}
5481	/* All other characters are considered unencodable */
5482	collstart = p;
5483	collend = p+1;
5484	while (collend < end) {
5485	    if ((0 < *collend && *collend < 256) ||
5486	        !Py_UNICODE_ISSPACE(*collend) ||
5487	        Py_UNICODE_TODECIMAL(*collend))
5488		break;
5489	}
5490	/* cache callback name lookup
5491	 * (if not done yet, i.e. it's the first error) */
5492	if (known_errorHandler==-1) {
5493	    if ((errors==NULL) || (!strcmp(errors, "strict")))
5494		known_errorHandler = 1;
5495	    else if (!strcmp(errors, "replace"))
5496		known_errorHandler = 2;
5497	    else if (!strcmp(errors, "ignore"))
5498		known_errorHandler = 3;
5499	    else if (!strcmp(errors, "xmlcharrefreplace"))
5500		known_errorHandler = 4;
5501	    else
5502		known_errorHandler = 0;
5503	}
5504	switch (known_errorHandler) {
5505	    case 1: /* strict */
5506		raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5507		goto onError;
5508	    case 2: /* replace */
5509		for (p = collstart; p < collend; ++p)
5510		    *output++ = '?';
5511		/* fall through */
5512	    case 3: /* ignore */
5513		p = collend;
5514		break;
5515	    case 4: /* xmlcharrefreplace */
5516		/* generate replacement (temporarily (mis)uses p) */
5517		for (p = collstart; p < collend; ++p)
5518		    output += sprintf(output, "&#%d;", (int)*p);
5519		p = collend;
5520		break;
5521	    default:
5522		repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5523		    encoding, reason, s, length, &exc,
5524		    collstart-s, collend-s, &newpos);
5525		if (repunicode == NULL)
5526		    goto onError;
5527		/* generate replacement  */
5528		repsize = PyUnicode_GET_SIZE(repunicode);
5529		for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5530		    Py_UNICODE ch = *uni2;
5531		    if (Py_UNICODE_ISSPACE(ch))
5532			*output++ = ' ';
5533		    else {
5534			decimal = Py_UNICODE_TODECIMAL(ch);
5535			if (decimal >= 0)
5536			    *output++ = '0' + decimal;
5537			else if (0 < ch && ch < 256)
5538			    *output++ = (char)ch;
5539			else {
5540			    Py_DECREF(repunicode);
5541			    raise_encode_exception(&exc, encoding,
5542				s, length, collstart-s, collend-s, reason);
5543			    goto onError;
5544			}
5545		    }
5546		}
5547		p = s + newpos;
5548		Py_DECREF(repunicode);
5549	}
5550    }
5551    /* 0-terminate the output string */
5552    *output++ = '\0';
5553    Py_XDECREF(exc);
5554    Py_XDECREF(errorHandler);
5555    return 0;
5556
5557 onError:
5558    Py_XDECREF(exc);
5559    Py_XDECREF(errorHandler);
5560    return -1;
5561}
5562
5563/* --- Helpers ------------------------------------------------------------ */
5564
5565#include "stringlib/unicodedefs.h"
5566#include "stringlib/fastsearch.h"
5567#include "stringlib/count.h"
5568/* Include _ParseTupleFinds from find.h */
5569#define FROM_UNICODE
5570#include "stringlib/find.h"
5571#include "stringlib/partition.h"
5572
5573#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
5574#include "stringlib/localeutil.h"
5575
/* helper macro to fixup start/end slice values.
   Operates on variables named `start` and `end` that must exist in the
   expanding scope, using (obj)->length as the sequence length:
     - a negative start has the length added and is then floored at 0;
     - end is first capped at the length; a negative end has the length
       added and is then floored at 0.
   Note that start is NOT capped at the length, so start may end up
   greater than end.
   The macro expands to a bare sequence of if-statements (no do/while
   wrapper) and evaluates `obj` several times; do not use it as the
   unbraced body of an if/else. */
#define FIX_START_END(obj)                      \
    if (start < 0)                              \
        start += (obj)->length;                 \
    if (start < 0)                              \
        start = 0;                              \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0)                                \
        end += (obj)->length;                   \
    if (end < 0)                                \
        end = 0;
5588
5589Py_ssize_t PyUnicode_Count(PyObject *str,
5590                           PyObject *substr,
5591                           Py_ssize_t start,
5592                           Py_ssize_t end)
5593{
5594    Py_ssize_t result;
5595    PyUnicodeObject* str_obj;
5596    PyUnicodeObject* sub_obj;
5597
5598    str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5599    if (!str_obj)
5600	return -1;
5601    sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5602    if (!sub_obj) {
5603	Py_DECREF(str_obj);
5604	return -1;
5605    }
5606
5607    FIX_START_END(str_obj);
5608
5609    result = stringlib_count(
5610        str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5611        );
5612
5613    Py_DECREF(sub_obj);
5614    Py_DECREF(str_obj);
5615
5616    return result;
5617}
5618
5619Py_ssize_t PyUnicode_Find(PyObject *str,
5620                          PyObject *sub,
5621                          Py_ssize_t start,
5622                          Py_ssize_t end,
5623                          int direction)
5624{
5625    Py_ssize_t result;
5626
5627    str = PyUnicode_FromObject(str);
5628    if (!str)
5629	return -2;
5630    sub = PyUnicode_FromObject(sub);
5631    if (!sub) {
5632	Py_DECREF(str);
5633	return -2;
5634    }
5635
5636    if (direction > 0)
5637        result = stringlib_find_slice(
5638            PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5639            PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5640            start, end
5641            );
5642    else
5643        result = stringlib_rfind_slice(
5644            PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5645            PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5646            start, end
5647            );
5648
5649    Py_DECREF(str);
5650    Py_DECREF(sub);
5651
5652    return result;
5653}
5654
5655static
5656int tailmatch(PyUnicodeObject *self,
5657	      PyUnicodeObject *substring,
5658	      Py_ssize_t start,
5659	      Py_ssize_t end,
5660	      int direction)
5661{
5662    if (substring->length == 0)
5663        return 1;
5664
5665    FIX_START_END(self);
5666
5667    end -= substring->length;
5668    if (end < start)
5669	return 0;
5670
5671    if (direction > 0) {
5672	if (Py_UNICODE_MATCH(self, end, substring))
5673	    return 1;
5674    } else {
5675        if (Py_UNICODE_MATCH(self, start, substring))
5676	    return 1;
5677    }
5678
5679    return 0;
5680}
5681
5682Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
5683			PyObject *substr,
5684			Py_ssize_t start,
5685			Py_ssize_t end,
5686			int direction)
5687{
5688    Py_ssize_t result;
5689
5690    str = PyUnicode_FromObject(str);
5691    if (str == NULL)
5692	return -1;
5693    substr = PyUnicode_FromObject(substr);
5694    if (substr == NULL) {
5695	Py_DECREF(str);
5696	return -1;
5697    }
5698
5699    result = tailmatch((PyUnicodeObject *)str,
5700		       (PyUnicodeObject *)substr,
5701		       start, end, direction);
5702    Py_DECREF(str);
5703    Py_DECREF(substr);
5704    return result;
5705}
5706
5707/* Apply fixfct filter to the Unicode object self and return a
5708   reference to the modified object */
5709
5710static
5711PyObject *fixup(PyUnicodeObject *self,
5712		int (*fixfct)(PyUnicodeObject *s))
5713{
5714
5715    PyUnicodeObject *u;
5716
5717    u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5718    if (u == NULL)
5719	return NULL;
5720
5721    Py_UNICODE_COPY(u->str, self->str, self->length);
5722
5723    if (!fixfct(u) && PyUnicode_CheckExact(self)) {
5724	/* fixfct should return TRUE if it modified the buffer. If
5725	   FALSE, return a reference to the original buffer instead
5726	   (to save space, not time) */
5727	Py_INCREF(self);
5728	Py_DECREF(u);
5729	return (PyObject*) self;
5730    }
5731    return (PyObject*) u;
5732}
5733
5734static
5735int fixupper(PyUnicodeObject *self)
5736{
5737    Py_ssize_t len = self->length;
5738    Py_UNICODE *s = self->str;
5739    int status = 0;
5740
5741    while (len-- > 0) {
5742	register Py_UNICODE ch;
5743
5744	ch = Py_UNICODE_TOUPPER(*s);
5745	if (ch != *s) {
5746            status = 1;
5747	    *s = ch;
5748	}
5749        s++;
5750    }
5751
5752    return status;
5753}
5754
5755static
5756int fixlower(PyUnicodeObject *self)
5757{
5758    Py_ssize_t len = self->length;
5759    Py_UNICODE *s = self->str;
5760    int status = 0;
5761
5762    while (len-- > 0) {
5763	register Py_UNICODE ch;
5764
5765	ch = Py_UNICODE_TOLOWER(*s);
5766	if (ch != *s) {
5767            status = 1;
5768	    *s = ch;
5769	}
5770        s++;
5771    }
5772
5773    return status;
5774}
5775
5776static
5777int fixswapcase(PyUnicodeObject *self)
5778{
5779    Py_ssize_t len = self->length;
5780    Py_UNICODE *s = self->str;
5781    int status = 0;
5782
5783    while (len-- > 0) {
5784        if (Py_UNICODE_ISUPPER(*s)) {
5785            *s = Py_UNICODE_TOLOWER(*s);
5786            status = 1;
5787        } else if (Py_UNICODE_ISLOWER(*s)) {
5788            *s = Py_UNICODE_TOUPPER(*s);
5789            status = 1;
5790        }
5791        s++;
5792    }
5793
5794    return status;
5795}
5796
5797static
5798int fixcapitalize(PyUnicodeObject *self)
5799{
5800    Py_ssize_t len = self->length;
5801    Py_UNICODE *s = self->str;
5802    int status = 0;
5803
5804    if (len == 0)
5805	return 0;
5806    if (Py_UNICODE_ISLOWER(*s)) {
5807	*s = Py_UNICODE_TOUPPER(*s);
5808	status = 1;
5809    }
5810    s++;
5811    while (--len > 0) {
5812        if (Py_UNICODE_ISUPPER(*s)) {
5813            *s = Py_UNICODE_TOLOWER(*s);
5814            status = 1;
5815        }
5816        s++;
5817    }
5818    return status;
5819}
5820
5821static
5822int fixtitle(PyUnicodeObject *self)
5823{
5824    register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5825    register Py_UNICODE *e;
5826    int previous_is_cased;
5827
5828    /* Shortcut for single character strings */
5829    if (PyUnicode_GET_SIZE(self) == 1) {
5830	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5831	if (*p != ch) {
5832	    *p = ch;
5833	    return 1;
5834	}
5835	else
5836	    return 0;
5837    }
5838
5839    e = p + PyUnicode_GET_SIZE(self);
5840    previous_is_cased = 0;
5841    for (; p < e; p++) {
5842	register const Py_UNICODE ch = *p;
5843
5844	if (previous_is_cased)
5845	    *p = Py_UNICODE_TOLOWER(ch);
5846	else
5847	    *p = Py_UNICODE_TOTITLE(ch);
5848
5849	if (Py_UNICODE_ISLOWER(ch) ||
5850	    Py_UNICODE_ISUPPER(ch) ||
5851	    Py_UNICODE_ISTITLE(ch))
5852	    previous_is_cased = 1;
5853	else
5854	    previous_is_cased = 0;
5855    }
5856    return 1;
5857}
5858
5859PyObject *
5860PyUnicode_Join(PyObject *separator, PyObject *seq)
5861{
5862    const Py_UNICODE blank = ' ';
5863    const Py_UNICODE *sep = &blank;
5864    Py_ssize_t seplen = 1;
5865    PyUnicodeObject *res = NULL; /* the result */
5866    Py_UNICODE *res_p;       /* pointer to free byte in res's string area */
5867    PyObject *fseq;          /* PySequence_Fast(seq) */
5868    Py_ssize_t seqlen;       /* len(fseq) -- number of items in sequence */
5869    PyObject **items;
5870    PyObject *item;
5871    Py_ssize_t sz, i;
5872
5873    fseq = PySequence_Fast(seq, "");
5874    if (fseq == NULL) {
5875    	return NULL;
5876    }
5877
5878    /* NOTE: the following code can't call back into Python code,
5879     * so we are sure that fseq won't be mutated.
5880     */
5881
5882    seqlen = PySequence_Fast_GET_SIZE(fseq);
5883    /* If empty sequence, return u"". */
5884    if (seqlen == 0) {
5885    	res = _PyUnicode_New(0);  /* empty sequence; return u"" */
5886    	goto Done;
5887    }
5888    items = PySequence_Fast_ITEMS(fseq);
5889    /* If singleton sequence with an exact Unicode, return that. */
5890    if (seqlen == 1) {
5891	item = items[0];
5892	if (PyUnicode_CheckExact(item)) {
5893	    Py_INCREF(item);
5894	    res = (PyUnicodeObject *)item;
5895	    goto Done;
5896	}
5897    }
5898    else {
5899        /* Set up sep and seplen */
5900        if (separator == NULL) {
5901            sep = &blank;
5902            seplen = 1;
5903        }
5904        else {
5905            if (!PyUnicode_Check(separator)) {
5906                PyErr_Format(PyExc_TypeError,
5907                             "separator: expected str instance,"
5908                             " %.80s found",
5909                             Py_TYPE(separator)->tp_name);
5910                goto onError;
5911            }
5912            sep = PyUnicode_AS_UNICODE(separator);
5913            seplen = PyUnicode_GET_SIZE(separator);
5914        }
5915    }
5916
5917    /* There are at least two things to join, or else we have a subclass
5918     * of str in the sequence.
5919     * Do a pre-pass to figure out the total amount of space we'll
5920     * need (sz), and see whether all argument are strings.
5921     */
5922    sz = 0;
5923    for (i = 0; i < seqlen; i++) {
5924        const Py_ssize_t old_sz = sz;
5925        item = items[i];
5926	if (!PyUnicode_Check(item)) {
5927	    PyErr_Format(PyExc_TypeError,
5928			 "sequence item %zd: expected str instance,"
5929			 " %.80s found",
5930			 i, Py_TYPE(item)->tp_name);
5931	    goto onError;
5932	}
5933        sz += PyUnicode_GET_SIZE(item);
5934        if (i != 0)
5935            sz += seplen;
5936        if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
5937            PyErr_SetString(PyExc_OverflowError,
5938                "join() result is too long for a Python string");
5939            goto onError;
5940        }
5941    }
5942
5943    res = _PyUnicode_New(sz);
5944    if (res == NULL)
5945        goto onError;
5946
5947    /* Catenate everything. */
5948    res_p = PyUnicode_AS_UNICODE(res);
5949    for (i = 0; i < seqlen; ++i) {
5950        Py_ssize_t itemlen;
5951        item = items[i];
5952        itemlen = PyUnicode_GET_SIZE(item);
5953	/* Copy item, and maybe the separator. */
5954	if (i) {
5955	    Py_UNICODE_COPY(res_p, sep, seplen);
5956	    res_p += seplen;
5957	}
5958	Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5959	res_p += itemlen;
5960    }
5961
5962 Done:
5963    Py_DECREF(fseq);
5964    return (PyObject *)res;
5965
5966 onError:
5967    Py_DECREF(fseq);
5968    Py_XDECREF(res);
5969    return NULL;
5970}
5971
5972static
5973PyUnicodeObject *pad(PyUnicodeObject *self,
5974		     Py_ssize_t left,
5975		     Py_ssize_t right,
5976		     Py_UNICODE fill)
5977{
5978    PyUnicodeObject *u;
5979
5980    if (left < 0)
5981        left = 0;
5982    if (right < 0)
5983        right = 0;
5984
5985    if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
5986        Py_INCREF(self);
5987        return self;
5988    }
5989
5990    if (left > PY_SSIZE_T_MAX - self->length ||
5991        right > PY_SSIZE_T_MAX - (left + self->length)) {
5992        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5993        return NULL;
5994    }
5995    u = _PyUnicode_New(left + self->length + right);
5996    if (u) {
5997        if (left)
5998            Py_UNICODE_FILL(u->str, fill, left);
5999        Py_UNICODE_COPY(u->str + left, self->str, self->length);
6000        if (right)
6001            Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6002    }
6003
6004    return u;
6005}
6006
/* Append the substring data[left:right] to the result list.
   The macro relies on names from the expanding function: a PyObject*
   variable `str` (used as scratch), the list object `list`, and a label
   `onError` that is jumped to when creating the substring or appending
   it fails.  The temporary reference held in `str` is released on both
   the success and the failure path of the append.
   The expansion is a bare statement sequence ending in an if/else, so
   use it only as a full statement inside braces. */
#define SPLIT_APPEND(data, left, right)					\
	str = PyUnicode_FromUnicode((data) + (left), (right) - (left));	\
	if (!str)							\
	    goto onError;						\
	if (PyList_Append(list, str)) {					\
	    Py_DECREF(str);						\
	    goto onError;						\
	}								\
        else								\
            Py_DECREF(str);
6017
6018static
6019PyObject *split_whitespace(PyUnicodeObject *self,
6020			   PyObject *list,
6021			   Py_ssize_t maxcount)
6022{
6023    register Py_ssize_t i;
6024    register Py_ssize_t j;
6025    Py_ssize_t len = self->length;
6026    PyObject *str;
6027    register const Py_UNICODE *buf = self->str;
6028
6029    for (i = j = 0; i < len; ) {
6030	/* find a token */
6031	while (i < len && Py_UNICODE_ISSPACE(buf[i]))
6032	    i++;
6033	j = i;
6034	while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
6035	    i++;
6036	if (j < i) {
6037	    if (maxcount-- <= 0)
6038		break;
6039	    SPLIT_APPEND(buf, j, i);
6040	    while (i < len && Py_UNICODE_ISSPACE(buf[i]))
6041		i++;
6042	    j = i;
6043	}
6044    }
6045    if (j < len) {
6046	SPLIT_APPEND(buf, j, len);
6047    }
6048    return list;
6049
6050 onError:
6051    Py_DECREF(list);
6052    return NULL;
6053}
6054
6055PyObject *PyUnicode_Splitlines(PyObject *string,
6056			       int keepends)
6057{
6058    register Py_ssize_t i;
6059    register Py_ssize_t j;
6060    Py_ssize_t len;
6061    PyObject *list;
6062    PyObject *str;
6063    Py_UNICODE *data;
6064
6065    string = PyUnicode_FromObject(string);
6066    if (string == NULL)
6067	return NULL;
6068    data = PyUnicode_AS_UNICODE(string);
6069    len = PyUnicode_GET_SIZE(string);
6070
6071    list = PyList_New(0);
6072    if (!list)
6073        goto onError;
6074
6075    for (i = j = 0; i < len; ) {
6076	Py_ssize_t eol;
6077
6078	/* Find a line and append it */
6079	while (i < len && !BLOOM_LINEBREAK(data[i]))
6080	    i++;
6081
6082	/* Skip the line break reading CRLF as one line break */
6083	eol = i;
6084	if (i < len) {
6085	    if (data[i] == '\r' && i + 1 < len &&
6086		data[i+1] == '\n')
6087		i += 2;
6088	    else
6089		i++;
6090	    if (keepends)
6091		eol = i;
6092	}
6093	SPLIT_APPEND(data, j, eol);
6094	j = i;
6095    }
6096    if (j < len) {
6097	SPLIT_APPEND(data, j, len);
6098    }
6099
6100    Py_DECREF(string);
6101    return list;
6102
6103 onError:
6104    Py_XDECREF(list);
6105    Py_DECREF(string);
6106    return NULL;
6107}
6108
6109static
6110PyObject *split_char(PyUnicodeObject *self,
6111		     PyObject *list,
6112		     Py_UNICODE ch,
6113		     Py_ssize_t maxcount)
6114{
6115    register Py_ssize_t i;
6116    register Py_ssize_t j;
6117    Py_ssize_t len = self->length;
6118    PyObject *str;
6119    register const Py_UNICODE *buf = self->str;
6120
6121    for (i = j = 0; i < len; ) {
6122	if (buf[i] == ch) {
6123	    if (maxcount-- <= 0)
6124		break;
6125	    SPLIT_APPEND(buf, j, i);
6126	    i = j = i + 1;
6127	} else
6128	    i++;
6129    }
6130    if (j <= len) {
6131	SPLIT_APPEND(buf, j, len);
6132    }
6133    return list;
6134
6135 onError:
6136    Py_DECREF(list);
6137    return NULL;
6138}
6139
6140static
6141PyObject *split_substring(PyUnicodeObject *self,
6142			  PyObject *list,
6143			  PyUnicodeObject *substring,
6144			  Py_ssize_t maxcount)
6145{
6146    register Py_ssize_t i;
6147    register Py_ssize_t j;
6148    Py_ssize_t len = self->length;
6149    Py_ssize_t sublen = substring->length;
6150    PyObject *str;
6151
6152    for (i = j = 0; i <= len - sublen; ) {
6153	if (Py_UNICODE_MATCH(self, i, substring)) {
6154	    if (maxcount-- <= 0)
6155		break;
6156	    SPLIT_APPEND(self->str, j, i);
6157	    i = j = i + sublen;
6158	} else
6159	    i++;
6160    }
6161    if (j <= len) {
6162	SPLIT_APPEND(self->str, j, len);
6163    }
6164    return list;
6165
6166 onError:
6167    Py_DECREF(list);
6168    return NULL;
6169}
6170
6171static
6172PyObject *rsplit_whitespace(PyUnicodeObject *self,
6173			    PyObject *list,
6174			    Py_ssize_t maxcount)
6175{
6176    register Py_ssize_t i;
6177    register Py_ssize_t j;
6178    Py_ssize_t len = self->length;
6179    PyObject *str;
6180    register const Py_UNICODE *buf = self->str;
6181
6182    for (i = j = len - 1; i >= 0; ) {
6183	/* find a token */
6184	while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
6185	    i--;
6186	j = i;
6187	while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
6188	    i--;
6189	if (j > i) {
6190	    if (maxcount-- <= 0)
6191		break;
6192	    SPLIT_APPEND(buf, i + 1, j + 1);
6193	    while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
6194		i--;
6195	    j = i;
6196	}
6197    }
6198    if (j >= 0) {
6199	SPLIT_APPEND(buf, 0, j + 1);
6200    }
6201    if (PyList_Reverse(list) < 0)
6202        goto onError;
6203    return list;
6204
6205 onError:
6206    Py_DECREF(list);
6207    return NULL;
6208}
6209
6210static
6211PyObject *rsplit_char(PyUnicodeObject *self,
6212		      PyObject *list,
6213		      Py_UNICODE ch,
6214		      Py_ssize_t maxcount)
6215{
6216    register Py_ssize_t i;
6217    register Py_ssize_t j;
6218    Py_ssize_t len = self->length;
6219    PyObject *str;
6220    register const Py_UNICODE *buf = self->str;
6221
6222    for (i = j = len - 1; i >= 0; ) {
6223	if (buf[i] == ch) {
6224	    if (maxcount-- <= 0)
6225		break;
6226	    SPLIT_APPEND(buf, i + 1, j + 1);
6227	    j = i = i - 1;
6228	} else
6229	    i--;
6230    }
6231    if (j >= -1) {
6232	SPLIT_APPEND(buf, 0, j + 1);
6233    }
6234    if (PyList_Reverse(list) < 0)
6235        goto onError;
6236    return list;
6237
6238 onError:
6239    Py_DECREF(list);
6240    return NULL;
6241}
6242
6243static
6244PyObject *rsplit_substring(PyUnicodeObject *self,
6245			   PyObject *list,
6246			   PyUnicodeObject *substring,
6247			   Py_ssize_t maxcount)
6248{
6249    register Py_ssize_t i;
6250    register Py_ssize_t j;
6251    Py_ssize_t len = self->length;
6252    Py_ssize_t sublen = substring->length;
6253    PyObject *str;
6254
6255    for (i = len - sublen, j = len; i >= 0; ) {
6256	if (Py_UNICODE_MATCH(self, i, substring)) {
6257	    if (maxcount-- <= 0)
6258		break;
6259	    SPLIT_APPEND(self->str, i + sublen, j);
6260	    j = i;
6261	    i -= sublen;
6262	} else
6263	    i--;
6264    }
6265    if (j >= 0) {
6266	SPLIT_APPEND(self->str, 0, j);
6267    }
6268    if (PyList_Reverse(list) < 0)
6269        goto onError;
6270    return list;
6271
6272 onError:
6273    Py_DECREF(list);
6274    return NULL;
6275}
6276
6277#undef SPLIT_APPEND
6278
6279static
6280PyObject *split(PyUnicodeObject *self,
6281		PyUnicodeObject *substring,
6282		Py_ssize_t maxcount)
6283{
6284    PyObject *list;
6285
6286    if (maxcount < 0)
6287        maxcount = PY_SSIZE_T_MAX;
6288
6289    list = PyList_New(0);
6290    if (!list)
6291        return NULL;
6292
6293    if (substring == NULL)
6294	return split_whitespace(self,list,maxcount);
6295
6296    else if (substring->length == 1)
6297	return split_char(self,list,substring->str[0],maxcount);
6298
6299    else if (substring->length == 0) {
6300	Py_DECREF(list);
6301	PyErr_SetString(PyExc_ValueError, "empty separator");
6302	return NULL;
6303    }
6304    else
6305	return split_substring(self,list,substring,maxcount);
6306}
6307
6308static
6309PyObject *rsplit(PyUnicodeObject *self,
6310		 PyUnicodeObject *substring,
6311		 Py_ssize_t maxcount)
6312{
6313    PyObject *list;
6314
6315    if (maxcount < 0)
6316        maxcount = PY_SSIZE_T_MAX;
6317
6318    list = PyList_New(0);
6319    if (!list)
6320        return NULL;
6321
6322    if (substring == NULL)
6323	return rsplit_whitespace(self,list,maxcount);
6324
6325    else if (substring->length == 1)
6326	return rsplit_char(self,list,substring->str[0],maxcount);
6327
6328    else if (substring->length == 0) {
6329	Py_DECREF(list);
6330	PyErr_SetString(PyExc_ValueError, "empty separator");
6331	return NULL;
6332    }
6333    else
6334	return rsplit_substring(self,list,substring,maxcount);
6335}
6336
6337static
6338PyObject *replace(PyUnicodeObject *self,
6339		  PyUnicodeObject *str1,
6340		  PyUnicodeObject *str2,
6341		  Py_ssize_t maxcount)
6342{
6343    PyUnicodeObject *u;
6344
6345    if (maxcount < 0)
6346	maxcount = PY_SSIZE_T_MAX;
6347
6348    if (str1->length == str2->length) {
6349        /* same length */
6350        Py_ssize_t i;
6351        if (str1->length == 1) {
6352            /* replace characters */
6353            Py_UNICODE u1, u2;
6354            if (!findchar(self->str, self->length, str1->str[0]))
6355                goto nothing;
6356            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6357            if (!u)
6358                return NULL;
6359            Py_UNICODE_COPY(u->str, self->str, self->length);
6360            u1 = str1->str[0];
6361            u2 = str2->str[0];
6362            for (i = 0; i < u->length; i++)
6363                if (u->str[i] == u1) {
6364                    if (--maxcount < 0)
6365                        break;
6366                    u->str[i] = u2;
6367                }
6368        } else {
6369            i = fastsearch(
6370                self->str, self->length, str1->str, str1->length, FAST_SEARCH
6371                );
6372            if (i < 0)
6373                goto nothing;
6374            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6375            if (!u)
6376                return NULL;
6377            Py_UNICODE_COPY(u->str, self->str, self->length);
6378            while (i <= self->length - str1->length)
6379                if (Py_UNICODE_MATCH(self, i, str1)) {
6380                    if (--maxcount < 0)
6381                        break;
6382                    Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6383                    i += str1->length;
6384                } else
6385                    i++;
6386        }
6387    } else {
6388
6389        Py_ssize_t n, i, j, e;
6390        Py_ssize_t product, new_size, delta;
6391        Py_UNICODE *p;
6392
6393        /* replace strings */
6394        n = stringlib_count(self->str, self->length, str1->str, str1->length);
6395        if (n > maxcount)
6396            n = maxcount;
6397        if (n == 0)
6398            goto nothing;
6399        /* new_size = self->length + n * (str2->length - str1->length)); */
6400        delta = (str2->length - str1->length);
6401        if (delta == 0) {
6402            new_size = self->length;
6403        } else {
6404            product = n * (str2->length - str1->length);
6405            if ((product / (str2->length - str1->length)) != n) {
6406                PyErr_SetString(PyExc_OverflowError,
6407                                "replace string is too long");
6408                return NULL;
6409            }
6410            new_size = self->length + product;
6411            if (new_size < 0) {
6412                PyErr_SetString(PyExc_OverflowError,
6413                                "replace string is too long");
6414                return NULL;
6415            }
6416        }
6417        u = _PyUnicode_New(new_size);
6418        if (!u)
6419            return NULL;
6420        i = 0;
6421        p = u->str;
6422        e = self->length - str1->length;
6423        if (str1->length > 0) {
6424            while (n-- > 0) {
6425                /* look for next match */
6426                j = i;
6427                while (j <= e) {
6428                    if (Py_UNICODE_MATCH(self, j, str1))
6429                        break;
6430                    j++;
6431                }
6432		if (j > i) {
6433                    if (j > e)
6434                        break;
6435                    /* copy unchanged part [i:j] */
6436                    Py_UNICODE_COPY(p, self->str+i, j-i);
6437                    p += j - i;
6438                }
6439                /* copy substitution string */
6440                if (str2->length > 0) {
6441                    Py_UNICODE_COPY(p, str2->str, str2->length);
6442                    p += str2->length;
6443                }
6444                i = j + str1->length;
6445            }
6446            if (i < self->length)
6447                /* copy tail [i:] */
6448                Py_UNICODE_COPY(p, self->str+i, self->length-i);
6449        } else {
6450            /* interleave */
6451            while (n > 0) {
6452                Py_UNICODE_COPY(p, str2->str, str2->length);
6453                p += str2->length;
6454                if (--n <= 0)
6455                    break;
6456                *p++ = self->str[i++];
6457            }
6458            Py_UNICODE_COPY(p, self->str+i, self->length-i);
6459        }
6460    }
6461    return (PyObject *) u;
6462
6463nothing:
6464    /* nothing to replace; return original string (when possible) */
6465    if (PyUnicode_CheckExact(self)) {
6466        Py_INCREF(self);
6467        return (PyObject *) self;
6468    }
6469    return PyUnicode_FromUnicode(self->str, self->length);
6470}
6471
6472/* --- Unicode Object Methods --------------------------------------------- */
6473
6474PyDoc_STRVAR(title__doc__,
6475"S.title() -> str\n\
6476\n\
6477Return a titlecased version of S, i.e. words start with title case\n\
6478characters, all remaining cased characters have lower case.");
6479
6480static PyObject*
6481unicode_title(PyUnicodeObject *self)
6482{
6483    return fixup(self, fixtitle);
6484}
6485
6486PyDoc_STRVAR(capitalize__doc__,
6487"S.capitalize() -> str\n\
6488\n\
6489Return a capitalized version of S, i.e. make the first character\n\
6490have upper case.");
6491
6492static PyObject*
6493unicode_capitalize(PyUnicodeObject *self)
6494{
6495    return fixup(self, fixcapitalize);
6496}
6497
6498#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* NOTE: this function sits inside an "#if 0" block and is not compiled.

   Splits `self` on whitespace, replaces every word in the resulting list
   by fixup(word, fixcapitalize), and joins the list again with
   PyUnicode_Join().  Returns the joined string, or NULL if splitting,
   capitalizing or joining fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *list;
    PyObject *item;
    Py_ssize_t i;

    /* Split into words */
    list = split(self, NULL, -1);
    if (!list)
        return NULL;

    /* Capitalize each word */
    for (i = 0; i < PyList_GET_SIZE(list); i++) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
		     fixcapitalize);
        if (item == NULL)
            goto onError;
        /* drop the list's reference to the old word before storing the
           new one in its slot */
        Py_DECREF(PyList_GET_ITEM(list, i));
        PyList_SET_ITEM(list, i, item);
    }

    /* Join the words to form a new string */
    /* (a NULL separator presumably means a single space, as the
       docstring describes -- confirm against PyUnicode_Join) */
    item = PyUnicode_Join(NULL, list);

    /* Shared exit: reached with item == NULL when fixup() failed, or with
       the result of PyUnicode_Join() on the normal path. */
onError:
    Py_DECREF(list);
    return (PyObject *)item;
}
6534#endif
6535
6536/* Argument converter.  Coerces to a single unicode character */
6537
6538static int
6539convert_uc(PyObject *obj, void *addr)
6540{
6541	Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6542	PyObject *uniobj;
6543	Py_UNICODE *unistr;
6544
6545	uniobj = PyUnicode_FromObject(obj);
6546	if (uniobj == NULL) {
6547		PyErr_SetString(PyExc_TypeError,
6548			"The fill character cannot be converted to Unicode");
6549		return 0;
6550	}
6551	if (PyUnicode_GET_SIZE(uniobj) != 1) {
6552		PyErr_SetString(PyExc_TypeError,
6553			"The fill character must be exactly one character long");
6554		Py_DECREF(uniobj);
6555		return 0;
6556	}
6557	unistr = PyUnicode_AS_UNICODE(uniobj);
6558	*fillcharloc = unistr[0];
6559	Py_DECREF(uniobj);
6560	return 1;
6561}
6562
6563PyDoc_STRVAR(center__doc__,
6564"S.center(width[, fillchar]) -> str\n\
6565\n\
6566Return S centered in a string of length width. Padding is\n\
6567done using the specified fill character (default is a space)");
6568
6569static PyObject *
6570unicode_center(PyUnicodeObject *self, PyObject *args)
6571{
6572    Py_ssize_t marg, left;
6573    Py_ssize_t width;
6574    Py_UNICODE fillchar = ' ';
6575
6576    if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
6577        return NULL;
6578
6579    if (self->length >= width && PyUnicode_CheckExact(self)) {
6580        Py_INCREF(self);
6581        return (PyObject*) self;
6582    }
6583
6584    marg = width - self->length;
6585    left = marg / 2 + (marg & width & 1);
6586
6587    return (PyObject*) pad(self, left, marg - left, fillchar);
6588}
6589
6590#if 0
6591
6592/* This code should go into some future Unicode collation support
6593   module. The basic comparison should compare ordinals on a naive
6594   basis (this is what Java does and thus JPython too). */
6595
6596/* speedy UTF-16 code point order comparison */
6597/* gleaned from: */
6598/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6599
/* NOTE: this table and function sit inside an "#if 0" block and are not
   compiled.

   Offsets indexed by the top five bits (c >> 11) of a 16-bit code unit.
   Entry 27 covers 0xD800..0xDFFF (the UTF-16 surrogate range) and adds
   0x2000, moving those units to 0xF800..0xFFFF; entries 28..31 cover
   0xE000..0xFFFF and subtract 0x800, moving them to 0xD800..0xF7FF.  All
   other entries leave the value unchanged. */
static short utf16Fixup[32] =
{
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
};

/* Compare two unicode objects code unit by code unit after remapping each
   unit through utf16Fixup.  Returns -1, 0 or 1.
   NOTE(review): the table has 32 entries, so c >> 11 only stays in range
   for 16-bit values -- presumably this assumes a 16-bit Py_UNICODE;
   confirm before enabling. */
static int
unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
{
    Py_ssize_t len1, len2;

    Py_UNICODE *s1 = str1->str;
    Py_UNICODE *s2 = str2->str;

    len1 = str1->length;
    len2 = str2->length;

    while (len1 > 0 && len2 > 0) {
        Py_UNICODE c1, c2;

        c1 = *s1++;
        c2 = *s2++;

	/* (1<<11) * 26 == 0xD000: smaller values never need a fixup */
	if (c1 > (1<<11) * 26)
	    c1 += utf16Fixup[c1>>11];
	if (c2 > (1<<11) * 26)
            c2 += utf16Fixup[c2>>11];
        /* now c1 and c2 are in UTF-32-compatible order */

        if (c1 != c2)
            return (c1 < c2) ? -1 : 1;

        len1--; len2--;
    }

    /* common prefix is equal: the shorter string sorts first */
    return (len1 < len2) ? -1 : (len1 != len2);
}
6639
6640#else
6641
6642static int
6643unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6644{
6645    register Py_ssize_t len1, len2;
6646
6647    Py_UNICODE *s1 = str1->str;
6648    Py_UNICODE *s2 = str2->str;
6649
6650    len1 = str1->length;
6651    len2 = str2->length;
6652
6653    while (len1 > 0 && len2 > 0) {
6654        Py_UNICODE c1, c2;
6655
6656        c1 = *s1++;
6657        c2 = *s2++;
6658
6659        if (c1 != c2)
6660            return (c1 < c2) ? -1 : 1;
6661
6662        len1--; len2--;
6663    }
6664
6665    return (len1 < len2) ? -1 : (len1 != len2);
6666}
6667
6668#endif
6669
6670int PyUnicode_Compare(PyObject *left,
6671		      PyObject *right)
6672{
6673    if (PyUnicode_Check(left) && PyUnicode_Check(right))
6674        return unicode_compare((PyUnicodeObject *)left,
6675                               (PyUnicodeObject *)right);
6676    PyErr_Format(PyExc_TypeError,
6677                 "Can't compare %.100s and %.100s",
6678                 left->ob_type->tp_name,
6679                 right->ob_type->tp_name);
6680    return -1;
6681}
6682
6683int
6684PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6685{
6686    int i;
6687    Py_UNICODE *id;
6688    assert(PyUnicode_Check(uni));
6689    id = PyUnicode_AS_UNICODE(uni);
6690    /* Compare Unicode string and source character set string */
6691    for (i = 0; id[i] && str[i]; i++)
6692	if (id[i] != str[i])
6693	    return ((int)id[i] < (int)str[i]) ? -1 : 1;
6694    if (id[i])
6695	return 1; /* uni is longer */
6696    if (str[i])
6697	return -1; /* str is longer */
6698    return 0;
6699}
6700
6701
6702#define TEST_COND(cond) \
6703	((cond) ? Py_True : Py_False)
6704
6705PyObject *PyUnicode_RichCompare(PyObject *left,
6706                                PyObject *right,
6707                                int op)
6708{
6709    int result;
6710
6711    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6712        PyObject *v;
6713        if (((PyUnicodeObject *) left)->length !=
6714            ((PyUnicodeObject *) right)->length) {
6715            if (op == Py_EQ) {
6716                Py_INCREF(Py_False);
6717                return Py_False;
6718            }
6719            if (op == Py_NE) {
6720                Py_INCREF(Py_True);
6721                return Py_True;
6722            }
6723        }
6724        if (left == right)
6725            result = 0;
6726        else
6727            result = unicode_compare((PyUnicodeObject *)left,
6728                                     (PyUnicodeObject *)right);
6729
6730        /* Convert the return value to a Boolean */
6731        switch (op) {
6732        case Py_EQ:
6733            v = TEST_COND(result == 0);
6734            break;
6735        case Py_NE:
6736            v = TEST_COND(result != 0);
6737            break;
6738        case Py_LE:
6739            v = TEST_COND(result <= 0);
6740            break;
6741        case Py_GE:
6742            v = TEST_COND(result >= 0);
6743            break;
6744        case Py_LT:
6745            v = TEST_COND(result == -1);
6746            break;
6747        case Py_GT:
6748            v = TEST_COND(result == 1);
6749            break;
6750        default:
6751            PyErr_BadArgument();
6752            return NULL;
6753        }
6754        Py_INCREF(v);
6755        return v;
6756    }
6757
6758    Py_INCREF(Py_NotImplemented);
6759    return Py_NotImplemented;
6760}
6761
6762int PyUnicode_Contains(PyObject *container,
6763		       PyObject *element)
6764{
6765    PyObject *str, *sub;
6766    int result;
6767
6768    /* Coerce the two arguments */
6769    sub = PyUnicode_FromObject(element);
6770    if (!sub) {
6771	PyErr_Format(PyExc_TypeError,
6772	    "'in <string>' requires string as left operand, not %s",
6773	    element->ob_type->tp_name);
6774        return -1;
6775    }
6776
6777    str = PyUnicode_FromObject(container);
6778    if (!str) {
6779        Py_DECREF(sub);
6780        return -1;
6781    }
6782
6783    result = stringlib_contains_obj(str, sub);
6784
6785    Py_DECREF(str);
6786    Py_DECREF(sub);
6787
6788    return result;
6789}
6790
6791/* Concat to string or Unicode object giving a new Unicode object. */
6792
6793PyObject *PyUnicode_Concat(PyObject *left,
6794			   PyObject *right)
6795{
6796    PyUnicodeObject *u = NULL, *v = NULL, *w;
6797
6798    /* Coerce the two arguments */
6799    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6800    if (u == NULL)
6801	goto onError;
6802    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6803    if (v == NULL)
6804	goto onError;
6805
6806    /* Shortcuts */
6807    if (v == unicode_empty) {
6808	Py_DECREF(v);
6809	return (PyObject *)u;
6810    }
6811    if (u == unicode_empty) {
6812	Py_DECREF(u);
6813	return (PyObject *)v;
6814    }
6815
6816    /* Concat the two Unicode strings */
6817    w = _PyUnicode_New(u->length + v->length);
6818    if (w == NULL)
6819	goto onError;
6820    Py_UNICODE_COPY(w->str, u->str, u->length);
6821    Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6822
6823    Py_DECREF(u);
6824    Py_DECREF(v);
6825    return (PyObject *)w;
6826
6827onError:
6828    Py_XDECREF(u);
6829    Py_XDECREF(v);
6830    return NULL;
6831}
6832
6833void
6834PyUnicode_Append(PyObject **pleft, PyObject *right)
6835{
6836	PyObject *new;
6837	if (*pleft == NULL)
6838		return;
6839	if (right == NULL || !PyUnicode_Check(*pleft)) {
6840		Py_DECREF(*pleft);
6841		*pleft = NULL;
6842		return;
6843	}
6844	new = PyUnicode_Concat(*pleft, right);
6845	Py_DECREF(*pleft);
6846	*pleft = new;
6847}
6848
6849void
6850PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6851{
6852	PyUnicode_Append(pleft, right);
6853	Py_XDECREF(right);
6854}
6855
6856PyDoc_STRVAR(count__doc__,
6857"S.count(sub[, start[, end]]) -> int\n\
6858\n\
6859Return the number of non-overlapping occurrences of substring sub in\n\
6860string S[start:end].  Optional arguments start and end are\n\
6861interpreted as in slice notation.");
6862
6863static PyObject *
6864unicode_count(PyUnicodeObject *self, PyObject *args)
6865{
6866    PyUnicodeObject *substring;
6867    Py_ssize_t start = 0;
6868    Py_ssize_t end = PY_SSIZE_T_MAX;
6869    PyObject *result;
6870
6871    if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6872		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6873        return NULL;
6874
6875    substring = (PyUnicodeObject *)PyUnicode_FromObject(
6876        (PyObject *)substring);
6877    if (substring == NULL)
6878	return NULL;
6879
6880    FIX_START_END(self);
6881
6882    result = PyLong_FromSsize_t(
6883        stringlib_count(self->str + start, end - start,
6884                        substring->str, substring->length)
6885        );
6886
6887    Py_DECREF(substring);
6888
6889    return result;
6890}
6891
6892PyDoc_STRVAR(encode__doc__,
6893"S.encode([encoding[, errors]]) -> bytes\n\
6894\n\
6895Encode S using the codec registered for encoding. encoding defaults\n\
6896to the default encoding. errors may be given to set a different error\n\
6897handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6898a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6899'xmlcharrefreplace' as well as any other name registered with\n\
6900codecs.register_error that can handle UnicodeEncodeErrors.");
6901
6902static PyObject *
6903unicode_encode(PyUnicodeObject *self, PyObject *args)
6904{
6905    char *encoding = NULL;
6906    char *errors = NULL;
6907    PyObject *v;
6908
6909    if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6910        return NULL;
6911    v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
6912    if (v == NULL)
6913        goto onError;
6914    if (!PyBytes_Check(v)) {
6915        PyErr_Format(PyExc_TypeError,
6916                     "encoder did not return a bytes object "
6917                     "(type=%.400s)",
6918                     Py_TYPE(v)->tp_name);
6919        Py_DECREF(v);
6920        return NULL;
6921    }
6922    return v;
6923
6924 onError:
6925    return NULL;
6926}
6927
6928PyDoc_STRVAR(expandtabs__doc__,
6929"S.expandtabs([tabsize]) -> str\n\
6930\n\
6931Return a copy of S where all tab characters are expanded using spaces.\n\
6932If tabsize is not given, a tab size of 8 characters is assumed.");
6933
6934static PyObject*
6935unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6936{
6937    Py_UNICODE *e;
6938    Py_UNICODE *p;
6939    Py_UNICODE *q;
6940    Py_UNICODE *qe;
6941    Py_ssize_t i, j, incr;
6942    PyUnicodeObject *u;
6943    int tabsize = 8;
6944
6945    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6946	return NULL;
6947
6948    /* First pass: determine size of output string */
6949    i = 0; /* chars up to and including most recent \n or \r */
6950    j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6951    e = self->str + self->length; /* end of input */
6952    for (p = self->str; p < e; p++)
6953        if (*p == '\t') {
6954	    if (tabsize > 0) {
6955		incr = tabsize - (j % tabsize); /* cannot overflow */
6956		if (j > PY_SSIZE_T_MAX - incr)
6957		    goto overflow1;
6958		j += incr;
6959            }
6960	}
6961        else {
6962	    if (j > PY_SSIZE_T_MAX - 1)
6963		goto overflow1;
6964            j++;
6965            if (*p == '\n' || *p == '\r') {
6966		if (i > PY_SSIZE_T_MAX - j)
6967		    goto overflow1;
6968                i += j;
6969                j = 0;
6970            }
6971        }
6972
6973    if (i > PY_SSIZE_T_MAX - j)
6974	goto overflow1;
6975
6976    /* Second pass: create output string and fill it */
6977    u = _PyUnicode_New(i + j);
6978    if (!u)
6979        return NULL;
6980
6981    j = 0; /* same as in first pass */
6982    q = u->str; /* next output char */
6983    qe = u->str + u->length; /* end of output */
6984
6985    for (p = self->str; p < e; p++)
6986        if (*p == '\t') {
6987	    if (tabsize > 0) {
6988		i = tabsize - (j % tabsize);
6989		j += i;
6990		while (i--) {
6991		    if (q >= qe)
6992			goto overflow2;
6993		    *q++ = ' ';
6994                }
6995	    }
6996	}
6997	else {
6998	    if (q >= qe)
6999		goto overflow2;
7000	    *q++ = *p;
7001            j++;
7002            if (*p == '\n' || *p == '\r')
7003                j = 0;
7004        }
7005
7006    return (PyObject*) u;
7007
7008  overflow2:
7009    Py_DECREF(u);
7010  overflow1:
7011    PyErr_SetString(PyExc_OverflowError, "new string is too long");
7012    return NULL;
7013}
7014
7015PyDoc_STRVAR(find__doc__,
7016"S.find(sub[, start[, end]]) -> int\n\
7017\n\
7018Return the lowest index in S where substring sub is found,\n\
7019such that sub is contained within s[start:end].  Optional\n\
7020arguments start and end are interpreted as in slice notation.\n\
7021\n\
7022Return -1 on failure.");
7023
7024static PyObject *
7025unicode_find(PyUnicodeObject *self, PyObject *args)
7026{
7027    PyObject *substring;
7028    Py_ssize_t start;
7029    Py_ssize_t end;
7030    Py_ssize_t result;
7031
7032    if (!_ParseTupleFinds(args, &substring, &start, &end))
7033        return NULL;
7034
7035    result = stringlib_find_slice(
7036        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7037        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7038        start, end
7039        );
7040
7041    Py_DECREF(substring);
7042
7043    return PyLong_FromSsize_t(result);
7044}
7045
7046static PyObject *
7047unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
7048{
7049    if (index < 0 || index >= self->length) {
7050        PyErr_SetString(PyExc_IndexError, "string index out of range");
7051        return NULL;
7052    }
7053
7054    return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7055}
7056
7057/* Believe it or not, this produces the same value for ASCII strings
7058   as string_hash(). */
7059static long
7060unicode_hash(PyUnicodeObject *self)
7061{
7062    Py_ssize_t len;
7063    Py_UNICODE *p;
7064    long x;
7065
7066    if (self->hash != -1)
7067        return self->hash;
7068    len = Py_SIZE(self);
7069    p = self->str;
7070    x = *p << 7;
7071    while (--len >= 0)
7072        x = (1000003*x) ^ *p++;
7073    x ^= Py_SIZE(self);
7074    if (x == -1)
7075        x = -2;
7076    self->hash = x;
7077    return x;
7078}
7079
7080PyDoc_STRVAR(index__doc__,
7081"S.index(sub[, start[, end]]) -> int\n\
7082\n\
7083Like S.find() but raise ValueError when the substring is not found.");
7084
7085static PyObject *
7086unicode_index(PyUnicodeObject *self, PyObject *args)
7087{
7088    Py_ssize_t result;
7089    PyObject *substring;
7090    Py_ssize_t start;
7091    Py_ssize_t end;
7092
7093    if (!_ParseTupleFinds(args, &substring, &start, &end))
7094        return NULL;
7095
7096    result = stringlib_find_slice(
7097        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7098        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7099        start, end
7100        );
7101
7102    Py_DECREF(substring);
7103
7104    if (result < 0) {
7105        PyErr_SetString(PyExc_ValueError, "substring not found");
7106        return NULL;
7107    }
7108
7109    return PyLong_FromSsize_t(result);
7110}
7111
7112PyDoc_STRVAR(islower__doc__,
7113"S.islower() -> bool\n\
7114\n\
7115Return True if all cased characters in S are lowercase and there is\n\
7116at least one cased character in S, False otherwise.");
7117
7118static PyObject*
7119unicode_islower(PyUnicodeObject *self)
7120{
7121    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7122    register const Py_UNICODE *e;
7123    int cased;
7124
7125    /* Shortcut for single character strings */
7126    if (PyUnicode_GET_SIZE(self) == 1)
7127	return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
7128
7129    /* Special case for empty strings */
7130    if (PyUnicode_GET_SIZE(self) == 0)
7131	return PyBool_FromLong(0);
7132
7133    e = p + PyUnicode_GET_SIZE(self);
7134    cased = 0;
7135    for (; p < e; p++) {
7136	register const Py_UNICODE ch = *p;
7137
7138	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7139	    return PyBool_FromLong(0);
7140	else if (!cased && Py_UNICODE_ISLOWER(ch))
7141	    cased = 1;
7142    }
7143    return PyBool_FromLong(cased);
7144}
7145
7146PyDoc_STRVAR(isupper__doc__,
7147"S.isupper() -> bool\n\
7148\n\
7149Return True if all cased characters in S are uppercase and there is\n\
7150at least one cased character in S, False otherwise.");
7151
7152static PyObject*
7153unicode_isupper(PyUnicodeObject *self)
7154{
7155    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7156    register const Py_UNICODE *e;
7157    int cased;
7158
7159    /* Shortcut for single character strings */
7160    if (PyUnicode_GET_SIZE(self) == 1)
7161	return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
7162
7163    /* Special case for empty strings */
7164    if (PyUnicode_GET_SIZE(self) == 0)
7165	return PyBool_FromLong(0);
7166
7167    e = p + PyUnicode_GET_SIZE(self);
7168    cased = 0;
7169    for (; p < e; p++) {
7170	register const Py_UNICODE ch = *p;
7171
7172	if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7173	    return PyBool_FromLong(0);
7174	else if (!cased && Py_UNICODE_ISUPPER(ch))
7175	    cased = 1;
7176    }
7177    return PyBool_FromLong(cased);
7178}
7179
7180PyDoc_STRVAR(istitle__doc__,
7181"S.istitle() -> bool\n\
7182\n\
7183Return True if S is a titlecased string and there is at least one\n\
7184character in S, i.e. upper- and titlecase characters may only\n\
7185follow uncased characters and lowercase characters only cased ones.\n\
7186Return False otherwise.");
7187
7188static PyObject*
7189unicode_istitle(PyUnicodeObject *self)
7190{
7191    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7192    register const Py_UNICODE *e;
7193    int cased, previous_is_cased;
7194
7195    /* Shortcut for single character strings */
7196    if (PyUnicode_GET_SIZE(self) == 1)
7197	return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7198			       (Py_UNICODE_ISUPPER(*p) != 0));
7199
7200    /* Special case for empty strings */
7201    if (PyUnicode_GET_SIZE(self) == 0)
7202	return PyBool_FromLong(0);
7203
7204    e = p + PyUnicode_GET_SIZE(self);
7205    cased = 0;
7206    previous_is_cased = 0;
7207    for (; p < e; p++) {
7208	register const Py_UNICODE ch = *p;
7209
7210	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7211	    if (previous_is_cased)
7212		return PyBool_FromLong(0);
7213	    previous_is_cased = 1;
7214	    cased = 1;
7215	}
7216	else if (Py_UNICODE_ISLOWER(ch)) {
7217	    if (!previous_is_cased)
7218		return PyBool_FromLong(0);
7219	    previous_is_cased = 1;
7220	    cased = 1;
7221	}
7222	else
7223	    previous_is_cased = 0;
7224    }
7225    return PyBool_FromLong(cased);
7226}
7227
7228PyDoc_STRVAR(isspace__doc__,
7229"S.isspace() -> bool\n\
7230\n\
7231Return True if all characters in S are whitespace\n\
7232and there is at least one character in S, False otherwise.");
7233
7234static PyObject*
7235unicode_isspace(PyUnicodeObject *self)
7236{
7237    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7238    register const Py_UNICODE *e;
7239
7240    /* Shortcut for single character strings */
7241    if (PyUnicode_GET_SIZE(self) == 1 &&
7242	Py_UNICODE_ISSPACE(*p))
7243	return PyBool_FromLong(1);
7244
7245    /* Special case for empty strings */
7246    if (PyUnicode_GET_SIZE(self) == 0)
7247	return PyBool_FromLong(0);
7248
7249    e = p + PyUnicode_GET_SIZE(self);
7250    for (; p < e; p++) {
7251	if (!Py_UNICODE_ISSPACE(*p))
7252	    return PyBool_FromLong(0);
7253    }
7254    return PyBool_FromLong(1);
7255}
7256
7257PyDoc_STRVAR(isalpha__doc__,
7258"S.isalpha() -> bool\n\
7259\n\
7260Return True if all characters in S are alphabetic\n\
7261and there is at least one character in S, False otherwise.");
7262
7263static PyObject*
7264unicode_isalpha(PyUnicodeObject *self)
7265{
7266    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7267    register const Py_UNICODE *e;
7268
7269    /* Shortcut for single character strings */
7270    if (PyUnicode_GET_SIZE(self) == 1 &&
7271	Py_UNICODE_ISALPHA(*p))
7272	return PyBool_FromLong(1);
7273
7274    /* Special case for empty strings */
7275    if (PyUnicode_GET_SIZE(self) == 0)
7276	return PyBool_FromLong(0);
7277
7278    e = p + PyUnicode_GET_SIZE(self);
7279    for (; p < e; p++) {
7280	if (!Py_UNICODE_ISALPHA(*p))
7281	    return PyBool_FromLong(0);
7282    }
7283    return PyBool_FromLong(1);
7284}
7285
7286PyDoc_STRVAR(isalnum__doc__,
7287"S.isalnum() -> bool\n\
7288\n\
7289Return True if all characters in S are alphanumeric\n\
7290and there is at least one character in S, False otherwise.");
7291
7292static PyObject*
7293unicode_isalnum(PyUnicodeObject *self)
7294{
7295    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7296    register const Py_UNICODE *e;
7297
7298    /* Shortcut for single character strings */
7299    if (PyUnicode_GET_SIZE(self) == 1 &&
7300	Py_UNICODE_ISALNUM(*p))
7301	return PyBool_FromLong(1);
7302
7303    /* Special case for empty strings */
7304    if (PyUnicode_GET_SIZE(self) == 0)
7305	return PyBool_FromLong(0);
7306
7307    e = p + PyUnicode_GET_SIZE(self);
7308    for (; p < e; p++) {
7309	if (!Py_UNICODE_ISALNUM(*p))
7310	    return PyBool_FromLong(0);
7311    }
7312    return PyBool_FromLong(1);
7313}
7314
7315PyDoc_STRVAR(isdecimal__doc__,
7316"S.isdecimal() -> bool\n\
7317\n\
7318Return True if there are only decimal characters in S,\n\
7319False otherwise.");
7320
7321static PyObject*
7322unicode_isdecimal(PyUnicodeObject *self)
7323{
7324    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7325    register const Py_UNICODE *e;
7326
7327    /* Shortcut for single character strings */
7328    if (PyUnicode_GET_SIZE(self) == 1 &&
7329	Py_UNICODE_ISDECIMAL(*p))
7330	return PyBool_FromLong(1);
7331
7332    /* Special case for empty strings */
7333    if (PyUnicode_GET_SIZE(self) == 0)
7334	return PyBool_FromLong(0);
7335
7336    e = p + PyUnicode_GET_SIZE(self);
7337    for (; p < e; p++) {
7338	if (!Py_UNICODE_ISDECIMAL(*p))
7339	    return PyBool_FromLong(0);
7340    }
7341    return PyBool_FromLong(1);
7342}
7343
7344PyDoc_STRVAR(isdigit__doc__,
7345"S.isdigit() -> bool\n\
7346\n\
7347Return True if all characters in S are digits\n\
7348and there is at least one character in S, False otherwise.");
7349
7350static PyObject*
7351unicode_isdigit(PyUnicodeObject *self)
7352{
7353    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7354    register const Py_UNICODE *e;
7355
7356    /* Shortcut for single character strings */
7357    if (PyUnicode_GET_SIZE(self) == 1 &&
7358	Py_UNICODE_ISDIGIT(*p))
7359	return PyBool_FromLong(1);
7360
7361    /* Special case for empty strings */
7362    if (PyUnicode_GET_SIZE(self) == 0)
7363	return PyBool_FromLong(0);
7364
7365    e = p + PyUnicode_GET_SIZE(self);
7366    for (; p < e; p++) {
7367	if (!Py_UNICODE_ISDIGIT(*p))
7368	    return PyBool_FromLong(0);
7369    }
7370    return PyBool_FromLong(1);
7371}
7372
7373PyDoc_STRVAR(isnumeric__doc__,
7374"S.isnumeric() -> bool\n\
7375\n\
7376Return True if there are only numeric characters in S,\n\
7377False otherwise.");
7378
7379static PyObject*
7380unicode_isnumeric(PyUnicodeObject *self)
7381{
7382    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7383    register const Py_UNICODE *e;
7384
7385    /* Shortcut for single character strings */
7386    if (PyUnicode_GET_SIZE(self) == 1 &&
7387	Py_UNICODE_ISNUMERIC(*p))
7388	return PyBool_FromLong(1);
7389
7390    /* Special case for empty strings */
7391    if (PyUnicode_GET_SIZE(self) == 0)
7392	return PyBool_FromLong(0);
7393
7394    e = p + PyUnicode_GET_SIZE(self);
7395    for (; p < e; p++) {
7396	if (!Py_UNICODE_ISNUMERIC(*p))
7397	    return PyBool_FromLong(0);
7398    }
7399    return PyBool_FromLong(1);
7400}
7401
7402int
7403PyUnicode_IsIdentifier(PyObject *self)
7404{
7405    register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7406    register const Py_UNICODE *e;
7407
7408    /* Special case for empty strings */
7409    if (PyUnicode_GET_SIZE(self) == 0)
7410	return 0;
7411
7412    /* PEP 3131 says that the first character must be in
7413       XID_Start and subsequent characters in XID_Continue,
7414       and for the ASCII range, the 2.x rules apply (i.e
7415       start with letters and underscore, continue with
7416       letters, digits, underscore). However, given the current
7417       definition of XID_Start and XID_Continue, it is sufficient
7418       to check just for these, except that _ must be allowed
7419       as starting an identifier.  */
7420    if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7421        return 0;
7422
7423    e = p + PyUnicode_GET_SIZE(self);
7424    for (p++; p < e; p++) {
7425	if (!_PyUnicode_IsXidContinue(*p))
7426	    return 0;
7427    }
7428    return 1;
7429}
7430
7431PyDoc_STRVAR(isidentifier__doc__,
7432"S.isidentifier() -> bool\n\
7433\n\
7434Return True if S is a valid identifier according\n\
7435to the language definition.");
7436
7437static PyObject*
7438unicode_isidentifier(PyObject *self)
7439{
7440    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7441}
7442
7443PyDoc_STRVAR(isprintable__doc__,
7444"S.isprintable() -> bool\n\
7445\n\
7446Return True if all characters in S are considered\n\
7447printable in repr() or S is empty, False otherwise.");
7448
7449static PyObject*
7450unicode_isprintable(PyObject *self)
7451{
7452    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7453    register const Py_UNICODE *e;
7454
7455    /* Shortcut for single character strings */
7456    if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7457        Py_RETURN_TRUE;
7458    }
7459
7460    e = p + PyUnicode_GET_SIZE(self);
7461    for (; p < e; p++) {
7462        if (!Py_UNICODE_ISPRINTABLE(*p)) {
7463            Py_RETURN_FALSE;
7464        }
7465    }
7466    Py_RETURN_TRUE;
7467}
7468
7469PyDoc_STRVAR(join__doc__,
7470"S.join(sequence) -> str\n\
7471\n\
7472Return a string which is the concatenation of the strings in the\n\
7473sequence.  The separator between elements is S.");
7474
7475static PyObject*
7476unicode_join(PyObject *self, PyObject *data)
7477{
7478    return PyUnicode_Join(self, data);
7479}
7480
7481static Py_ssize_t
7482unicode_length(PyUnicodeObject *self)
7483{
7484    return self->length;
7485}
7486
7487PyDoc_STRVAR(ljust__doc__,
7488"S.ljust(width[, fillchar]) -> str\n\
7489\n\
7490Return S left-justified in a Unicode string of length width. Padding is\n\
7491done using the specified fill character (default is a space).");
7492
7493static PyObject *
7494unicode_ljust(PyUnicodeObject *self, PyObject *args)
7495{
7496    Py_ssize_t width;
7497    Py_UNICODE fillchar = ' ';
7498
7499    if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
7500        return NULL;
7501
7502    if (self->length >= width && PyUnicode_CheckExact(self)) {
7503        Py_INCREF(self);
7504        return (PyObject*) self;
7505    }
7506
7507    return (PyObject*) pad(self, 0, width - self->length, fillchar);
7508}
7509
7510PyDoc_STRVAR(lower__doc__,
7511"S.lower() -> str\n\
7512\n\
7513Return a copy of the string S converted to lowercase.");
7514
7515static PyObject*
7516unicode_lower(PyUnicodeObject *self)
7517{
7518    return fixup(self, fixlower);
7519}
7520
/* Strip modes understood by the strip helpers.  The values double as
   indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the strip mode above.
   Both the strings and the table itself are read-only. */
static const char * const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: skips the leading "|O:" of the format */
#define STRIPNAME(i) (stripformat[i]+3)
7529
7530/* externally visible for str.strip(unicode) */
7531PyObject *
7532_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7533{
7534	Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7535	Py_ssize_t len = PyUnicode_GET_SIZE(self);
7536	Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7537	Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7538	Py_ssize_t i, j;
7539
7540        BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7541
7542	i = 0;
7543	if (striptype != RIGHTSTRIP) {
7544            while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7545                i++;
7546            }
7547	}
7548
7549	j = len;
7550	if (striptype != LEFTSTRIP) {
7551            do {
7552                j--;
7553            } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7554            j++;
7555	}
7556
7557	if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7558            Py_INCREF(self);
7559            return (PyObject*)self;
7560	}
7561	else
7562            return PyUnicode_FromUnicode(s+i, j-i);
7563}
7564
7565
7566static PyObject *
7567do_strip(PyUnicodeObject *self, int striptype)
7568{
7569	Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7570	Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
7571
7572	i = 0;
7573	if (striptype != RIGHTSTRIP) {
7574		while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7575			i++;
7576		}
7577	}
7578
7579	j = len;
7580	if (striptype != LEFTSTRIP) {
7581		do {
7582			j--;
7583		} while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7584		j++;
7585	}
7586
7587	if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7588		Py_INCREF(self);
7589		return (PyObject*)self;
7590	}
7591	else
7592		return PyUnicode_FromUnicode(s+i, j-i);
7593}
7594
7595
7596static PyObject *
7597do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7598{
7599	PyObject *sep = NULL;
7600
7601	if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7602		return NULL;
7603
7604	if (sep != NULL && sep != Py_None) {
7605		if (PyUnicode_Check(sep))
7606			return _PyUnicode_XStrip(self, striptype, sep);
7607		else {
7608			PyErr_Format(PyExc_TypeError,
7609				     "%s arg must be None or str",
7610				     STRIPNAME(striptype));
7611			return NULL;
7612		}
7613	}
7614
7615	return do_strip(self, striptype);
7616}
7617
7618
7619PyDoc_STRVAR(strip__doc__,
7620"S.strip([chars]) -> str\n\
7621\n\
7622Return a copy of the string S with leading and trailing\n\
7623whitespace removed.\n\
7624If chars is given and not None, remove characters in chars instead.");
7625
7626static PyObject *
7627unicode_strip(PyUnicodeObject *self, PyObject *args)
7628{
7629	if (PyTuple_GET_SIZE(args) == 0)
7630		return do_strip(self, BOTHSTRIP); /* Common case */
7631	else
7632		return do_argstrip(self, BOTHSTRIP, args);
7633}
7634
7635
7636PyDoc_STRVAR(lstrip__doc__,
7637"S.lstrip([chars]) -> str\n\
7638\n\
7639Return a copy of the string S with leading whitespace removed.\n\
7640If chars is given and not None, remove characters in chars instead.");
7641
7642static PyObject *
7643unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7644{
7645	if (PyTuple_GET_SIZE(args) == 0)
7646		return do_strip(self, LEFTSTRIP); /* Common case */
7647	else
7648		return do_argstrip(self, LEFTSTRIP, args);
7649}
7650
7651
7652PyDoc_STRVAR(rstrip__doc__,
7653"S.rstrip([chars]) -> str\n\
7654\n\
7655Return a copy of the string S with trailing whitespace removed.\n\
7656If chars is given and not None, remove characters in chars instead.");
7657
7658static PyObject *
7659unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7660{
7661	if (PyTuple_GET_SIZE(args) == 0)
7662		return do_strip(self, RIGHTSTRIP); /* Common case */
7663	else
7664		return do_argstrip(self, RIGHTSTRIP, args);
7665}
7666
7667
7668static PyObject*
7669unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
7670{
7671    PyUnicodeObject *u;
7672    Py_UNICODE *p;
7673    Py_ssize_t nchars;
7674    size_t nbytes;
7675
7676    if (len < 0)
7677        len = 0;
7678
7679    if (len == 1 && PyUnicode_CheckExact(str)) {
7680        /* no repeat, return original string */
7681        Py_INCREF(str);
7682        return (PyObject*) str;
7683    }
7684
7685    /* ensure # of chars needed doesn't overflow int and # of bytes
7686     * needed doesn't overflow size_t
7687     */
7688    nchars = len * str->length;
7689    if (len && nchars / len != str->length) {
7690        PyErr_SetString(PyExc_OverflowError,
7691                        "repeated string is too long");
7692        return NULL;
7693    }
7694    nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7695    if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7696        PyErr_SetString(PyExc_OverflowError,
7697                        "repeated string is too long");
7698        return NULL;
7699    }
7700    u = _PyUnicode_New(nchars);
7701    if (!u)
7702        return NULL;
7703
7704    p = u->str;
7705
7706    if (str->length == 1 && len > 0) {
7707        Py_UNICODE_FILL(p, str->str[0], len);
7708    } else {
7709	Py_ssize_t done = 0; /* number of characters copied this far */
7710	if (done < nchars) {
7711            Py_UNICODE_COPY(p, str->str, str->length);
7712            done = str->length;
7713	}
7714	while (done < nchars) {
7715            Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
7716            Py_UNICODE_COPY(p+done, p, n);
7717            done += n;
7718	}
7719    }
7720
7721    return (PyObject*) u;
7722}
7723
7724PyObject *PyUnicode_Replace(PyObject *obj,
7725			    PyObject *subobj,
7726			    PyObject *replobj,
7727			    Py_ssize_t maxcount)
7728{
7729    PyObject *self;
7730    PyObject *str1;
7731    PyObject *str2;
7732    PyObject *result;
7733
7734    self = PyUnicode_FromObject(obj);
7735    if (self == NULL)
7736	return NULL;
7737    str1 = PyUnicode_FromObject(subobj);
7738    if (str1 == NULL) {
7739	Py_DECREF(self);
7740	return NULL;
7741    }
7742    str2 = PyUnicode_FromObject(replobj);
7743    if (str2 == NULL) {
7744	Py_DECREF(self);
7745	Py_DECREF(str1);
7746	return NULL;
7747    }
7748    result = replace((PyUnicodeObject *)self,
7749		     (PyUnicodeObject *)str1,
7750		     (PyUnicodeObject *)str2,
7751		     maxcount);
7752    Py_DECREF(self);
7753    Py_DECREF(str1);
7754    Py_DECREF(str2);
7755    return result;
7756}
7757
7758PyDoc_STRVAR(replace__doc__,
7759"S.replace (old, new[, count]) -> str\n\
7760\n\
7761Return a copy of S with all occurrences of substring\n\
7762old replaced by new.  If the optional argument count is\n\
7763given, only the first count occurrences are replaced.");
7764
7765static PyObject*
7766unicode_replace(PyUnicodeObject *self, PyObject *args)
7767{
7768    PyUnicodeObject *str1;
7769    PyUnicodeObject *str2;
7770    Py_ssize_t maxcount = -1;
7771    PyObject *result;
7772
7773    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
7774        return NULL;
7775    str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7776    if (str1 == NULL)
7777	return NULL;
7778    str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
7779    if (str2 == NULL) {
7780	Py_DECREF(str1);
7781	return NULL;
7782    }
7783
7784    result = replace(self, str1, str2, maxcount);
7785
7786    Py_DECREF(str1);
7787    Py_DECREF(str2);
7788    return result;
7789}
7790
7791static
7792PyObject *unicode_repr(PyObject *unicode)
7793{
7794    PyObject *repr;
7795    Py_UNICODE *p;
7796    Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7797    Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7798
7799    /* XXX(nnorwitz): rather than over-allocating, it would be
7800       better to choose a different scheme.  Perhaps scan the
7801       first N-chars of the string and allocate based on that size.
7802    */
7803    /* Initial allocation is based on the longest-possible unichr
7804       escape.
7805
7806       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7807       unichr, so in this case it's the longest unichr escape. In
7808       narrow (UTF-16) builds this is five chars per source unichr
7809       since there are two unichrs in the surrogate pair, so in narrow
7810       (UTF-16) builds it's not the longest unichr escape.
7811
7812       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7813       so in the narrow (UTF-16) build case it's the longest unichr
7814       escape.
7815    */
7816
7817    repr = PyUnicode_FromUnicode(NULL,
7818        2 /* quotes */
7819#ifdef Py_UNICODE_WIDE
7820        + 10*size
7821#else
7822        + 6*size
7823#endif
7824        + 1);
7825    if (repr == NULL)
7826        return NULL;
7827
7828    p = PyUnicode_AS_UNICODE(repr);
7829
7830    /* Add quote */
7831    *p++ = (findchar(s, size, '\'') &&
7832            !findchar(s, size, '"')) ? '"' : '\'';
7833    while (size-- > 0) {
7834        Py_UNICODE ch = *s++;
7835
7836        /* Escape quotes and backslashes */
7837        if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
7838            *p++ = '\\';
7839            *p++ = ch;
7840            continue;
7841        }
7842
7843	/* Map special whitespace to '\t', \n', '\r' */
7844        if (ch == '\t') {
7845            *p++ = '\\';
7846            *p++ = 't';
7847        }
7848        else if (ch == '\n') {
7849            *p++ = '\\';
7850            *p++ = 'n';
7851        }
7852        else if (ch == '\r') {
7853            *p++ = '\\';
7854            *p++ = 'r';
7855        }
7856
7857        /* Map non-printable US ASCII to '\xhh' */
7858        else if (ch < ' ' || ch == 0x7F) {
7859            *p++ = '\\';
7860            *p++ = 'x';
7861            *p++ = hexdigits[(ch >> 4) & 0x000F];
7862            *p++ = hexdigits[ch & 0x000F];
7863        }
7864
7865        /* Copy ASCII characters as-is */
7866        else if (ch < 0x7F) {
7867            *p++ = ch;
7868        }
7869
7870	/* Non-ASCII characters */
7871        else {
7872            Py_UCS4 ucs = ch;
7873
7874#ifndef Py_UNICODE_WIDE
7875            Py_UNICODE ch2 = 0;
7876            /* Get code point from surrogate pair */
7877            if (size > 0) {
7878                ch2 = *s;
7879                if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
7880                            && ch2 <= 0xDFFF) {
7881                    ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
7882                            + 0x00010000;
7883                    s++;
7884                    size--;
7885                }
7886            }
7887#endif
7888            /* Map Unicode whitespace and control characters
7889               (categories Z* and C* except ASCII space)
7890            */
7891            if (!Py_UNICODE_ISPRINTABLE(ucs)) {
7892                /* Map 8-bit characters to '\xhh' */
7893                if (ucs <= 0xff) {
7894                    *p++ = '\\';
7895                    *p++ = 'x';
7896                    *p++ = hexdigits[(ch >> 4) & 0x000F];
7897                    *p++ = hexdigits[ch & 0x000F];
7898                }
7899                /* Map 21-bit characters to '\U00xxxxxx' */
7900                else if (ucs >= 0x10000) {
7901                    *p++ = '\\';
7902                    *p++ = 'U';
7903                    *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7904                    *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7905                    *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7906                    *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7907                    *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7908                    *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7909                    *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7910                    *p++ = hexdigits[ucs & 0x0000000F];
7911                }
7912                /* Map 16-bit characters to '\uxxxx' */
7913                else {
7914                    *p++ = '\\';
7915                    *p++ = 'u';
7916                    *p++ = hexdigits[(ucs >> 12) & 0x000F];
7917                    *p++ = hexdigits[(ucs >> 8) & 0x000F];
7918                    *p++ = hexdigits[(ucs >> 4) & 0x000F];
7919                    *p++ = hexdigits[ucs & 0x000F];
7920                }
7921            }
7922            /* Copy characters as-is */
7923            else {
7924                *p++ = ch;
7925#ifndef Py_UNICODE_WIDE
7926                if (ucs >= 0x10000)
7927                    *p++ = ch2;
7928#endif
7929            }
7930        }
7931    }
7932    /* Add quote */
7933    *p++ = PyUnicode_AS_UNICODE(repr)[0];
7934
7935    *p = '\0';
7936    PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
7937    return repr;
7938}
7939
7940PyDoc_STRVAR(rfind__doc__,
7941"S.rfind(sub[, start[, end]]) -> int\n\
7942\n\
7943Return the highest index in S where substring sub is found,\n\
7944such that sub is contained within s[start:end].  Optional\n\
7945arguments start and end are interpreted as in slice notation.\n\
7946\n\
7947Return -1 on failure.");
7948
7949static PyObject *
7950unicode_rfind(PyUnicodeObject *self, PyObject *args)
7951{
7952    PyObject *substring;
7953    Py_ssize_t start;
7954    Py_ssize_t end;
7955    Py_ssize_t result;
7956
7957    if (!_ParseTupleFinds(args, &substring, &start, &end))
7958	    return NULL;
7959
7960    result = stringlib_rfind_slice(
7961        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7962        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7963        start, end
7964        );
7965
7966    Py_DECREF(substring);
7967
7968    return PyLong_FromSsize_t(result);
7969}
7970
7971PyDoc_STRVAR(rindex__doc__,
7972"S.rindex(sub[, start[, end]]) -> int\n\
7973\n\
7974Like S.rfind() but raise ValueError when the substring is not found.");
7975
7976static PyObject *
7977unicode_rindex(PyUnicodeObject *self, PyObject *args)
7978{
7979    PyObject *substring;
7980    Py_ssize_t start;
7981    Py_ssize_t end;
7982    Py_ssize_t result;
7983
7984    if (!_ParseTupleFinds(args, &substring, &start, &end))
7985	    return NULL;
7986
7987    result = stringlib_rfind_slice(
7988        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7989        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7990        start, end
7991        );
7992
7993    Py_DECREF(substring);
7994
7995    if (result < 0) {
7996        PyErr_SetString(PyExc_ValueError, "substring not found");
7997        return NULL;
7998    }
7999    return PyLong_FromSsize_t(result);
8000}
8001
8002PyDoc_STRVAR(rjust__doc__,
8003"S.rjust(width[, fillchar]) -> str\n\
8004\n\
8005Return S right-justified in a string of length width. Padding is\n\
8006done using the specified fill character (default is a space).");
8007
8008static PyObject *
8009unicode_rjust(PyUnicodeObject *self, PyObject *args)
8010{
8011    Py_ssize_t width;
8012    Py_UNICODE fillchar = ' ';
8013
8014    if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
8015        return NULL;
8016
8017    if (self->length >= width && PyUnicode_CheckExact(self)) {
8018        Py_INCREF(self);
8019        return (PyObject*) self;
8020    }
8021
8022    return (PyObject*) pad(self, width - self->length, 0, fillchar);
8023}
8024
8025PyObject *PyUnicode_Split(PyObject *s,
8026			  PyObject *sep,
8027			  Py_ssize_t maxsplit)
8028{
8029    PyObject *result;
8030
8031    s = PyUnicode_FromObject(s);
8032    if (s == NULL)
8033	return NULL;
8034    if (sep != NULL) {
8035	sep = PyUnicode_FromObject(sep);
8036	if (sep == NULL) {
8037	    Py_DECREF(s);
8038	    return NULL;
8039	}
8040    }
8041
8042    result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8043
8044    Py_DECREF(s);
8045    Py_XDECREF(sep);
8046    return result;
8047}
8048
8049PyDoc_STRVAR(split__doc__,
8050"S.split([sep[, maxsplit]]) -> list of strings\n\
8051\n\
8052Return a list of the words in S, using sep as the\n\
8053delimiter string.  If maxsplit is given, at most maxsplit\n\
8054splits are done. If sep is not specified or is None, any\n\
8055whitespace string is a separator and empty strings are\n\
8056removed from the result.");
8057
8058static PyObject*
8059unicode_split(PyUnicodeObject *self, PyObject *args)
8060{
8061    PyObject *substring = Py_None;
8062    Py_ssize_t maxcount = -1;
8063
8064    if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
8065        return NULL;
8066
8067    if (substring == Py_None)
8068	return split(self, NULL, maxcount);
8069    else if (PyUnicode_Check(substring))
8070	return split(self, (PyUnicodeObject *)substring, maxcount);
8071    else
8072	return PyUnicode_Split((PyObject *)self, substring, maxcount);
8073}
8074
8075PyObject *
8076PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8077{
8078    PyObject* str_obj;
8079    PyObject* sep_obj;
8080    PyObject* out;
8081
8082    str_obj = PyUnicode_FromObject(str_in);
8083    if (!str_obj)
8084	return NULL;
8085    sep_obj = PyUnicode_FromObject(sep_in);
8086    if (!sep_obj) {
8087        Py_DECREF(str_obj);
8088        return NULL;
8089    }
8090
8091    out = stringlib_partition(
8092        str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8093        sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8094        );
8095
8096    Py_DECREF(sep_obj);
8097    Py_DECREF(str_obj);
8098
8099    return out;
8100}
8101
8102
8103PyObject *
8104PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8105{
8106    PyObject* str_obj;
8107    PyObject* sep_obj;
8108    PyObject* out;
8109
8110    str_obj = PyUnicode_FromObject(str_in);
8111    if (!str_obj)
8112	return NULL;
8113    sep_obj = PyUnicode_FromObject(sep_in);
8114    if (!sep_obj) {
8115        Py_DECREF(str_obj);
8116        return NULL;
8117    }
8118
8119    out = stringlib_rpartition(
8120        str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8121        sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8122        );
8123
8124    Py_DECREF(sep_obj);
8125    Py_DECREF(str_obj);
8126
8127    return out;
8128}
8129
8130PyDoc_STRVAR(partition__doc__,
8131"S.partition(sep) -> (head, sep, tail)\n\
8132\n\
8133Search for the separator sep in S, and return the part before it,\n\
8134the separator itself, and the part after it.  If the separator is not\n\
8135found, return S and two empty strings.");
8136
8137static PyObject*
8138unicode_partition(PyUnicodeObject *self, PyObject *separator)
8139{
8140    return PyUnicode_Partition((PyObject *)self, separator);
8141}
8142
8143PyDoc_STRVAR(rpartition__doc__,
8144"S.rpartition(sep) -> (tail, sep, head)\n\
8145\n\
8146Search for the separator sep in S, starting at the end of S, and return\n\
8147the part before it, the separator itself, and the part after it.  If the\n\
8148separator is not found, return two empty strings and S.");
8149
8150static PyObject*
8151unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8152{
8153    return PyUnicode_RPartition((PyObject *)self, separator);
8154}
8155
8156PyObject *PyUnicode_RSplit(PyObject *s,
8157			   PyObject *sep,
8158			   Py_ssize_t maxsplit)
8159{
8160    PyObject *result;
8161
8162    s = PyUnicode_FromObject(s);
8163    if (s == NULL)
8164	return NULL;
8165    if (sep != NULL) {
8166	sep = PyUnicode_FromObject(sep);
8167	if (sep == NULL) {
8168	    Py_DECREF(s);
8169	    return NULL;
8170	}
8171    }
8172
8173    result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8174
8175    Py_DECREF(s);
8176    Py_XDECREF(sep);
8177    return result;
8178}
8179
8180PyDoc_STRVAR(rsplit__doc__,
8181"S.rsplit([sep[, maxsplit]]) -> list of strings\n\
8182\n\
8183Return a list of the words in S, using sep as the\n\
8184delimiter string, starting at the end of the string and\n\
8185working to the front.  If maxsplit is given, at most maxsplit\n\
8186splits are done. If sep is not specified, any whitespace string\n\
8187is a separator.");
8188
8189static PyObject*
8190unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8191{
8192    PyObject *substring = Py_None;
8193    Py_ssize_t maxcount = -1;
8194
8195    if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
8196        return NULL;
8197
8198    if (substring == Py_None)
8199	return rsplit(self, NULL, maxcount);
8200    else if (PyUnicode_Check(substring))
8201	return rsplit(self, (PyUnicodeObject *)substring, maxcount);
8202    else
8203	return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
8204}
8205
8206PyDoc_STRVAR(splitlines__doc__,
8207"S.splitlines([keepends]) -> list of strings\n\
8208\n\
8209Return a list of the lines in S, breaking at line boundaries.\n\
8210Line breaks are not included in the resulting list unless keepends\n\
8211is given and true.");
8212
8213static PyObject*
8214unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8215{
8216    int keepends = 0;
8217
8218    if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
8219        return NULL;
8220
8221    return PyUnicode_Splitlines((PyObject *)self, keepends);
8222}
8223
8224static
8225PyObject *unicode_str(PyObject *self)
8226{
8227    if (PyUnicode_CheckExact(self)) {
8228        Py_INCREF(self);
8229        return self;
8230    } else
8231        /* Subtype -- return genuine unicode string with the same value. */
8232        return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8233                                     PyUnicode_GET_SIZE(self));
8234}
8235
8236PyDoc_STRVAR(swapcase__doc__,
8237"S.swapcase() -> str\n\
8238\n\
8239Return a copy of S with uppercase characters converted to lowercase\n\
8240and vice versa.");
8241
8242static PyObject*
8243unicode_swapcase(PyUnicodeObject *self)
8244{
8245    return fixup(self, fixswapcase);
8246}
8247
8248PyDoc_STRVAR(maketrans__doc__,
8249"str.maketrans(x[, y[, z]]) -> dict (static method)\n\
8250\n\
8251Return a translation table usable for str.translate().\n\
8252If there is only one argument, it must be a dictionary mapping Unicode\n\
8253ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
8254Character keys will be then converted to ordinals.\n\
8255If there are two arguments, they must be strings of equal length, and\n\
8256in the resulting dictionary, each character in x will be mapped to the\n\
8257character at the same position in y. If there is a third argument, it\n\
8258must be a string, whose characters will be mapped to None in the result.");
8259
8260static PyObject*
8261unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8262{
8263    PyObject *x, *y = NULL, *z = NULL;
8264    PyObject *new = NULL, *key, *value;
8265    Py_ssize_t i = 0;
8266    int res;
8267
8268    if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8269        return NULL;
8270    new = PyDict_New();
8271    if (!new)
8272        return NULL;
8273    if (y != NULL) {
8274        /* x must be a string too, of equal length */
8275        Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8276        if (!PyUnicode_Check(x)) {
8277            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8278                            "be a string if there is a second argument");
8279            goto err;
8280        }
8281        if (PyUnicode_GET_SIZE(x) != ylen) {
8282            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8283                            "arguments must have equal length");
8284            goto err;
8285        }
8286        /* create entries for translating chars in x to those in y */
8287        for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
8288            key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8289            value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
8290            if (!key || !value)
8291                goto err;
8292            res = PyDict_SetItem(new, key, value);
8293            Py_DECREF(key);
8294            Py_DECREF(value);
8295            if (res < 0)
8296                goto err;
8297        }
8298        /* create entries for deleting chars in z */
8299        if (z != NULL) {
8300            for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
8301                key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
8302                if (!key)
8303                    goto err;
8304                res = PyDict_SetItem(new, key, Py_None);
8305                Py_DECREF(key);
8306                if (res < 0)
8307                    goto err;
8308            }
8309        }
8310    } else {
8311        /* x must be a dict */
8312        if (!PyDict_Check(x)) {
8313            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8314                            "to maketrans it must be a dict");
8315            goto err;
8316        }
8317        /* copy entries into the new dict, converting string keys to int keys */
8318        while (PyDict_Next(x, &i, &key, &value)) {
8319            if (PyUnicode_Check(key)) {
8320                /* convert string keys to integer keys */
8321                PyObject *newkey;
8322                if (PyUnicode_GET_SIZE(key) != 1) {
8323                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
8324                                    "table must be of length 1");
8325                    goto err;
8326                }
8327                newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
8328                if (!newkey)
8329                    goto err;
8330                res = PyDict_SetItem(new, newkey, value);
8331                Py_DECREF(newkey);
8332                if (res < 0)
8333                    goto err;
8334            } else if (PyLong_Check(key)) {
8335                /* just keep integer keys */
8336                if (PyDict_SetItem(new, key, value) < 0)
8337                    goto err;
8338            } else {
8339                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8340                                "be strings or integers");
8341                goto err;
8342            }
8343        }
8344    }
8345    return new;
8346  err:
8347    Py_DECREF(new);
8348    return NULL;
8349}
8350
8351PyDoc_STRVAR(translate__doc__,
8352"S.translate(table) -> str\n\
8353\n\
8354Return a copy of the string S, where all characters have been mapped\n\
8355through the given translation table, which must be a mapping of\n\
8356Unicode ordinals to Unicode ordinals, strings, or None.\n\
8357Unmapped characters are left untouched. Characters mapped to None\n\
8358are deleted.");
8359
8360static PyObject*
8361unicode_translate(PyUnicodeObject *self, PyObject *table)
8362{
8363    return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
8364}
8365
8366PyDoc_STRVAR(upper__doc__,
8367"S.upper() -> str\n\
8368\n\
8369Return a copy of S converted to uppercase.");
8370
8371static PyObject*
8372unicode_upper(PyUnicodeObject *self)
8373{
8374    return fixup(self, fixupper);
8375}
8376
8377PyDoc_STRVAR(zfill__doc__,
8378"S.zfill(width) -> str\n\
8379\n\
8380Pad a numeric string S with zeros on the left, to fill a field\n\
8381of the specified width. The string S is never truncated.");
8382
8383static PyObject *
8384unicode_zfill(PyUnicodeObject *self, PyObject *args)
8385{
8386    Py_ssize_t fill;
8387    PyUnicodeObject *u;
8388
8389    Py_ssize_t width;
8390    if (!PyArg_ParseTuple(args, "n:zfill", &width))
8391        return NULL;
8392
8393    if (self->length >= width) {
8394        if (PyUnicode_CheckExact(self)) {
8395            Py_INCREF(self);
8396            return (PyObject*) self;
8397        }
8398        else
8399            return PyUnicode_FromUnicode(
8400                PyUnicode_AS_UNICODE(self),
8401                PyUnicode_GET_SIZE(self)
8402            );
8403    }
8404
8405    fill = width - self->length;
8406
8407    u = pad(self, fill, 0, '0');
8408
8409    if (u == NULL)
8410        return NULL;
8411
8412    if (u->str[fill] == '+' || u->str[fill] == '-') {
8413        /* move sign to beginning of string */
8414        u->str[0] = u->str[fill];
8415        u->str[fill] = '0';
8416    }
8417
8418    return (PyObject*) u;
8419}
8420
/* Debugging aid, compiled out with #if 0: returns the value of `numfree`
   (defined outside this block) as an int object.  The matching
   "freelistsize" entry in unicode_methods is disabled the same way. */
#if 0
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyLong_FromLong(numfree);
}
#endif
8428
8429PyDoc_STRVAR(startswith__doc__,
8430"S.startswith(prefix[, start[, end]]) -> bool\n\
8431\n\
8432Return True if S starts with the specified prefix, False otherwise.\n\
8433With optional start, test S beginning at that position.\n\
8434With optional end, stop comparing S at that position.\n\
8435prefix can also be a tuple of strings to try.");
8436
8437static PyObject *
8438unicode_startswith(PyUnicodeObject *self,
8439		   PyObject *args)
8440{
8441    PyObject *subobj;
8442    PyUnicodeObject *substring;
8443    Py_ssize_t start = 0;
8444    Py_ssize_t end = PY_SSIZE_T_MAX;
8445    int result;
8446
8447    if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
8448		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8449	return NULL;
8450    if (PyTuple_Check(subobj)) {
8451        Py_ssize_t i;
8452        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8453            substring = (PyUnicodeObject *)PyUnicode_FromObject(
8454                            PyTuple_GET_ITEM(subobj, i));
8455            if (substring == NULL)
8456                return NULL;
8457            result = tailmatch(self, substring, start, end, -1);
8458            Py_DECREF(substring);
8459            if (result) {
8460                Py_RETURN_TRUE;
8461            }
8462        }
8463        /* nothing matched */
8464        Py_RETURN_FALSE;
8465    }
8466    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
8467    if (substring == NULL)
8468         return NULL;
8469    result = tailmatch(self, substring, start, end, -1);
8470    Py_DECREF(substring);
8471    return PyBool_FromLong(result);
8472}
8473
8474
8475PyDoc_STRVAR(endswith__doc__,
8476"S.endswith(suffix[, start[, end]]) -> bool\n\
8477\n\
8478Return True if S ends with the specified suffix, False otherwise.\n\
8479With optional start, test S beginning at that position.\n\
8480With optional end, stop comparing S at that position.\n\
8481suffix can also be a tuple of strings to try.");
8482
8483static PyObject *
8484unicode_endswith(PyUnicodeObject *self,
8485		 PyObject *args)
8486{
8487    PyObject *subobj;
8488    PyUnicodeObject *substring;
8489    Py_ssize_t start = 0;
8490    Py_ssize_t end = PY_SSIZE_T_MAX;
8491    int result;
8492
8493    if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
8494        _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8495	return NULL;
8496    if (PyTuple_Check(subobj)) {
8497        Py_ssize_t i;
8498        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8499            substring = (PyUnicodeObject *)PyUnicode_FromObject(
8500                            PyTuple_GET_ITEM(subobj, i));
8501            if (substring == NULL)
8502            return NULL;
8503            result = tailmatch(self, substring, start, end, +1);
8504            Py_DECREF(substring);
8505            if (result) {
8506                Py_RETURN_TRUE;
8507            }
8508        }
8509        Py_RETURN_FALSE;
8510    }
8511    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
8512    if (substring == NULL)
8513    return NULL;
8514
8515    result = tailmatch(self, substring, start, end, +1);
8516    Py_DECREF(substring);
8517    return PyBool_FromLong(result);
8518}
8519
8520#include "stringlib/string_format.h"
8521
8522PyDoc_STRVAR(format__doc__,
8523"S.format(*args, **kwargs) -> str\n\
8524\n\
8525");
8526
8527static PyObject *
8528unicode__format__(PyObject* self, PyObject* args)
8529{
8530    PyObject *format_spec;
8531
8532    if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8533        return NULL;
8534
8535    return _PyUnicode_FormatAdvanced(self,
8536                                     PyUnicode_AS_UNICODE(format_spec),
8537                                     PyUnicode_GET_SIZE(format_spec));
8538}
8539
8540PyDoc_STRVAR(p_format__doc__,
8541"S.__format__(format_spec) -> str\n\
8542\n\
8543");
8544
8545static PyObject *
8546unicode__sizeof__(PyUnicodeObject *v)
8547{
8548    return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8549                              sizeof(Py_UNICODE) * (v->length + 1));
8550}
8551
8552PyDoc_STRVAR(sizeof__doc__,
8553"S.__sizeof__() -> size of S in memory, in bytes");
8554
8555static PyObject *
8556unicode_getnewargs(PyUnicodeObject *v)
8557{
8558	return Py_BuildValue("(u#)", v->str, v->length);
8559}
8560
8561
/* Method table built from the unicode_* functions of this file.  Each
   entry lists the Python-visible name, the C implementation, the calling
   convention flags and, where one is supplied, the docstring. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    /* "format" is the only entry here that also accepts keyword arguments. */
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* The two _formatter_* entries are listed without a docstring. */
    {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
    {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
    /* maketrans is a static method: unicode_maketrans never reads its
       first parameter. */
    {"maketrans", (PyCFunction) unicode_maketrans,
     METH_VARARGS | METH_STATIC, maketrans__doc__},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
#endif

    {"__getnewargs__",	(PyCFunction)unicode_getnewargs, METH_NOARGS},
    /* Terminating sentinel entry. */
    {NULL, NULL}
};
8626
8627static PyObject *
8628unicode_mod(PyObject *v, PyObject *w)
8629{
8630       if (!PyUnicode_Check(v)) {
8631               Py_INCREF(Py_NotImplemented);
8632               return Py_NotImplemented;
8633       }
8634       return PyUnicode_Format(v, w);
8635}
8636
/* Number slots: only nb_remainder is filled in (with unicode_mod, which
   performs % formatting); the slots listed before it are 0 and the
   initializer stops after it. */
static PyNumberMethods unicode_as_number = {
	0,				/*nb_add*/
	0,				/*nb_subtract*/
	0,				/*nb_multiply*/
	unicode_mod,			/*nb_remainder*/
};
8643
/* Sequence slots: length, concatenation, repetition, item access and
   containment are provided; the slice and assignment slots are 0. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length, 		/* sq_length */
    PyUnicode_Concat,		 	/* sq_concat */
    (ssizeargfunc) unicode_repeat, 	/* sq_repeat */
    (ssizeargfunc) unicode_getitem, 	/* sq_item */
    0,				 	/* sq_slice */
    0, 					/* sq_ass_item */
    0, 					/* sq_ass_slice */
    PyUnicode_Contains, 		/* sq_contains */
};
8654
8655static PyObject*
8656unicode_subscript(PyUnicodeObject* self, PyObject* item)
8657{
8658    if (PyIndex_Check(item)) {
8659        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
8660        if (i == -1 && PyErr_Occurred())
8661            return NULL;
8662        if (i < 0)
8663            i += PyUnicode_GET_SIZE(self);
8664        return unicode_getitem(self, i);
8665    } else if (PySlice_Check(item)) {
8666        Py_ssize_t start, stop, step, slicelength, cur, i;
8667        Py_UNICODE* source_buf;
8668        Py_UNICODE* result_buf;
8669        PyObject* result;
8670
8671        if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
8672				 &start, &stop, &step, &slicelength) < 0) {
8673            return NULL;
8674        }
8675
8676        if (slicelength <= 0) {
8677            return PyUnicode_FromUnicode(NULL, 0);
8678        } else if (start == 0 && step == 1 && slicelength == self->length &&
8679                   PyUnicode_CheckExact(self)) {
8680            Py_INCREF(self);
8681            return (PyObject *)self;
8682        } else if (step == 1) {
8683            return PyUnicode_FromUnicode(self->str + start, slicelength);
8684        } else {
8685            source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
8686            result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8687                                                       sizeof(Py_UNICODE));
8688
8689	    if (result_buf == NULL)
8690		    return PyErr_NoMemory();
8691
8692            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8693                result_buf[i] = source_buf[cur];
8694            }
8695
8696            result = PyUnicode_FromUnicode(result_buf, slicelength);
8697            PyObject_FREE(result_buf);
8698            return result;
8699        }
8700    } else {
8701        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8702        return NULL;
8703    }
8704}
8705
/* Mapping slots: length and subscript (index or slice, see
   unicode_subscript) are provided; item assignment is not. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,		/* mp_length */
    (binaryfunc)unicode_subscript,	/* mp_subscript */
    (objobjargproc)0,			/* mp_ass_subscript */
};
8711
8712
8713/* Helpers for PyUnicode_Format() */
8714
8715static PyObject *
8716getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
8717{
8718    Py_ssize_t argidx = *p_argidx;
8719    if (argidx < arglen) {
8720	(*p_argidx)++;
8721	if (arglen < 0)
8722	    return args;
8723	else
8724	    return PyTuple_GetItem(args, argidx);
8725    }
8726    PyErr_SetString(PyExc_TypeError,
8727		    "not enough arguments for format string");
8728    return NULL;
8729}
8730
8731static Py_ssize_t
8732strtounicode(Py_UNICODE *buffer, const char *charbuffer)
8733{
8734    register Py_ssize_t i;
8735    Py_ssize_t len = strlen(charbuffer);
8736    for (i = len - 1; i >= 0; i--)
8737	buffer[i] = (Py_UNICODE) charbuffer[i];
8738
8739    return len;
8740}
8741
8742static int
8743doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8744{
8745    Py_ssize_t result;
8746
8747    PyOS_ascii_formatd((char *)buffer, len, format, x);
8748    result = strtounicode(buffer, (char *)buffer);
8749    return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8750}
8751
/* Compiled out with #if 0.  Counterpart of doubletounicode() for a long:
   PyOS_snprintf() writes chars into the start of `buffer`, which are then
   widened in place by strtounicode(); the character count is returned.
   Its only caller in this part of the file is formatint(), which is
   disabled the same way. */
#if 0
static int
longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
{
    Py_ssize_t result;

    PyOS_snprintf((char *)buffer, len, format, x);
    result = strtounicode(buffer, (char *)buffer);
    return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
}
#endif
8763
8764/* XXX To save some code duplication, formatfloat/long/int could have been
8765   shared with stringobject.c, converting from 8-bit to Unicode after the
8766   formatting is done. */
8767
8768static int
8769formatfloat(Py_UNICODE *buf,
8770	    size_t buflen,
8771	    int flags,
8772	    int prec,
8773	    int type,
8774	    PyObject *v)
8775{
8776    /* fmt = '%#.' + `prec` + `type`
8777       worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
8778    char fmt[20];
8779    double x;
8780
8781    x = PyFloat_AsDouble(v);
8782    if (x == -1.0 && PyErr_Occurred())
8783	return -1;
8784    if (prec < 0)
8785	prec = 6;
8786    if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8787	type = 'g';
8788    /* Worst case length calc to ensure no buffer overrun:
8789
8790       'g' formats:
8791	 fmt = %#.<prec>g
8792	 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8793	    for any double rep.)
8794	 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8795
8796       'f' formats:
8797	 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8798	 len = 1 + 50 + 1 + prec = 52 + prec
8799
8800       If prec=0 the effective precision is 1 (the leading digit is
8801       always given), therefore increase the length by one.
8802
8803    */
8804    if (((type == 'g' || type == 'G') &&
8805          buflen <= (size_t)10 + (size_t)prec) ||
8806	(type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
8807	PyErr_SetString(PyExc_OverflowError,
8808			"formatted float is too long (precision too large?)");
8809	return -1;
8810    }
8811    PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8812		  (flags&F_ALT) ? "#" : "",
8813		  prec, type);
8814    return doubletounicode(buf, buflen, fmt, x);
8815}
8816
8817static PyObject*
8818formatlong(PyObject *val, int flags, int prec, int type)
8819{
8820	char *buf;
8821	int len;
8822	PyObject *str; /* temporary string object. */
8823	PyObject *result;
8824
8825	str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
8826	if (!str)
8827		return NULL;
8828	result = PyUnicode_FromStringAndSize(buf, len);
8829	Py_DECREF(str);
8830	return result;
8831}
8832
/* formatint() is compiled out with #if 0, as is its helper
   longtounicode(); the integer conversions in PyUnicode_Format() call
   formatlong() instead.

   As written, it formats the C long value of `v` into `buf` for the
   conversion `type`, building its own printf format string, and returns
   the number of characters produced or -1 with an exception set. */
#if 0
static int
formatint(Py_UNICODE *buf,
	  size_t buflen,
	  int flags,
	  int prec,
	  int type,
	  PyObject *v)
{
    /* fmt = '%#.' + `prec` + 'l' + `type`
     * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
     *                     + 1 + 1
     *                   = 24
     */
    char fmt[64]; /* plenty big enough! */
    char *sign;
    long x;

    x = PyLong_AsLong(v);
    if (x == -1 && PyErr_Occurred())
        return -1;
    /* A negative value under %u is formatted as signed decimal. */
    if (x < 0 && type == 'u') {
        type = 'd';
    }
    /* For negative hex/octal the '-' is emitted by hand and the magnitude
       (-x) is what gets formatted below. */
    if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
        sign = "-";
    else
        sign = "";
    if (prec < 0)
        prec = 1;

    /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
     * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
     */
    if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
        PyErr_SetString(PyExc_OverflowError,
    	        "formatted integer is too long (precision too large?)");
        return -1;
    }

    if ((flags & F_ALT) &&
        (type == 'x' || type == 'X' || type == 'o')) {
        /* When converting under %#o, %#x or %#X, there are a number
         * of issues that cause pain:
	 * - for %#o, we want a different base marker than C
         * - when 0 is being converted, the C standard leaves off
         *   the '0x' or '0X', which is inconsistent with other
         *   %#x/%#X conversions and inconsistent with Python's
         *   hex() function
         * - there are platforms that violate the standard and
         *   convert 0 with the '0x' or '0X'
         *   (Metrowerks, Compaq Tru64)
         * - there are platforms that give '0x' when converting
         *   under %#X, but convert 0 in accordance with the
         *   standard (OS/2 EMX)
         *
         * We can achieve the desired consistency by inserting our
         * own '0x' or '0X' prefix, and substituting %x/%X in place
         * of %#x/%#X.
         *
         * Note that this is the same approach as used in
         * formatint() in stringobject.c
         */
        PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
                      sign, type, prec, type);
    }
    else {
        PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
                      sign, (flags&F_ALT) ? "#" : "",
                      prec, type);
    }
    if (sign[0])
        return longtounicode(buf, buflen, fmt, -x);
    else
        return longtounicode(buf, buflen, fmt, x);
}
#endif
8910
8911static int
8912formatchar(Py_UNICODE *buf,
8913           size_t buflen,
8914           PyObject *v)
8915{
8916    /* presume that the buffer is at least 3 characters long */
8917    if (PyUnicode_Check(v)) {
8918	if (PyUnicode_GET_SIZE(v) == 1) {
8919	    buf[0] = PyUnicode_AS_UNICODE(v)[0];
8920	    buf[1] = '\0';
8921	    return 1;
8922	}
8923#ifndef Py_UNICODE_WIDE
8924	if (PyUnicode_GET_SIZE(v) == 2) {
8925	    /* Decode a valid surrogate pair */
8926	    int c0 = PyUnicode_AS_UNICODE(v)[0];
8927	    int c1 = PyUnicode_AS_UNICODE(v)[1];
8928	    if (0xD800 <= c0 && c0 <= 0xDBFF &&
8929		0xDC00 <= c1 && c1 <= 0xDFFF) {
8930		buf[0] = c0;
8931		buf[1] = c1;
8932		buf[2] = '\0';
8933		return 2;
8934	    }
8935	}
8936#endif
8937	goto onError;
8938    }
8939    else {
8940	/* Integer input truncated to a character */
8941        long x;
8942	x = PyLong_AsLong(v);
8943	if (x == -1 && PyErr_Occurred())
8944	    goto onError;
8945
8946	if (x < 0 || x > 0x10ffff) {
8947	    PyErr_SetString(PyExc_OverflowError,
8948			    "%c arg not in range(0x110000)");
8949	    return -1;
8950	}
8951
8952#ifndef Py_UNICODE_WIDE
8953	if (x > 0xffff) {
8954	    x -= 0x10000;
8955	    buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
8956	    buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
8957	    return 2;
8958	}
8959#endif
8960	buf[0] = (Py_UNICODE) x;
8961	buf[1] = '\0';
8962	return 1;
8963    }
8964
8965 onError:
8966    PyErr_SetString(PyExc_TypeError,
8967		    "%c requires int or char");
8968    return -1;
8969}
8970
8971/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8972
8973   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
8974   chars are formatted. XXX This is a magic number. Each formatting
8975   routine does bounds checking to ensure no overflow, but a better
8976   solution may be to malloc a buffer of appropriate size for each
8977   format. For now, the current solution is sufficient.
8978*/
8979#define FORMATBUFLEN (size_t)120
8980
8981PyObject *PyUnicode_Format(PyObject *format,
8982			   PyObject *args)
8983{
8984    Py_UNICODE *fmt, *res;
8985    Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
8986    int args_owned = 0;
8987    PyUnicodeObject *result = NULL;
8988    PyObject *dict = NULL;
8989    PyObject *uformat;
8990
8991    if (format == NULL || args == NULL) {
8992	PyErr_BadInternalCall();
8993	return NULL;
8994    }
8995    uformat = PyUnicode_FromObject(format);
8996    if (uformat == NULL)
8997	return NULL;
8998    fmt = PyUnicode_AS_UNICODE(uformat);
8999    fmtcnt = PyUnicode_GET_SIZE(uformat);
9000
9001    reslen = rescnt = fmtcnt + 100;
9002    result = _PyUnicode_New(reslen);
9003    if (result == NULL)
9004	goto onError;
9005    res = PyUnicode_AS_UNICODE(result);
9006
9007    if (PyTuple_Check(args)) {
9008	arglen = PyTuple_Size(args);
9009	argidx = 0;
9010    }
9011    else {
9012	arglen = -1;
9013	argidx = -2;
9014    }
9015    if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
9016        !PyUnicode_Check(args))
9017	dict = args;
9018
9019    while (--fmtcnt >= 0) {
9020	if (*fmt != '%') {
9021	    if (--rescnt < 0) {
9022		rescnt = fmtcnt + 100;
9023		reslen += rescnt;
9024		if (_PyUnicode_Resize(&result, reslen) < 0)
9025		    goto onError;
9026		res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9027		--rescnt;
9028	    }
9029	    *res++ = *fmt++;
9030	}
9031	else {
9032	    /* Got a format specifier */
9033	    int flags = 0;
9034	    Py_ssize_t width = -1;
9035	    int prec = -1;
9036	    Py_UNICODE c = '\0';
9037	    Py_UNICODE fill;
9038	    int isnumok;
9039	    PyObject *v = NULL;
9040	    PyObject *temp = NULL;
9041	    Py_UNICODE *pbuf;
9042	    Py_UNICODE sign;
9043	    Py_ssize_t len;
9044	    Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
9045
9046	    fmt++;
9047	    if (*fmt == '(') {
9048		Py_UNICODE *keystart;
9049		Py_ssize_t keylen;
9050		PyObject *key;
9051		int pcount = 1;
9052
9053		if (dict == NULL) {
9054		    PyErr_SetString(PyExc_TypeError,
9055				    "format requires a mapping");
9056		    goto onError;
9057		}
9058		++fmt;
9059		--fmtcnt;
9060		keystart = fmt;
9061		/* Skip over balanced parentheses */
9062		while (pcount > 0 && --fmtcnt >= 0) {
9063		    if (*fmt == ')')
9064			--pcount;
9065		    else if (*fmt == '(')
9066			++pcount;
9067		    fmt++;
9068		}
9069		keylen = fmt - keystart - 1;
9070		if (fmtcnt < 0 || pcount > 0) {
9071		    PyErr_SetString(PyExc_ValueError,
9072				    "incomplete format key");
9073		    goto onError;
9074		}
9075#if 0
9076		/* keys are converted to strings using UTF-8 and
9077		   then looked up since Python uses strings to hold
9078		   variables names etc. in its namespaces and we
9079		   wouldn't want to break common idioms. */
9080		key = PyUnicode_EncodeUTF8(keystart,
9081					   keylen,
9082					   NULL);
9083#else
9084		key = PyUnicode_FromUnicode(keystart, keylen);
9085#endif
9086		if (key == NULL)
9087		    goto onError;
9088		if (args_owned) {
9089		    Py_DECREF(args);
9090		    args_owned = 0;
9091		}
9092		args = PyObject_GetItem(dict, key);
9093		Py_DECREF(key);
9094		if (args == NULL) {
9095		    goto onError;
9096		}
9097		args_owned = 1;
9098		arglen = -1;
9099		argidx = -2;
9100	    }
9101	    while (--fmtcnt >= 0) {
9102		switch (c = *fmt++) {
9103		case '-': flags |= F_LJUST; continue;
9104		case '+': flags |= F_SIGN; continue;
9105		case ' ': flags |= F_BLANK; continue;
9106		case '#': flags |= F_ALT; continue;
9107		case '0': flags |= F_ZERO; continue;
9108		}
9109		break;
9110	    }
9111	    if (c == '*') {
9112		v = getnextarg(args, arglen, &argidx);
9113		if (v == NULL)
9114		    goto onError;
9115		if (!PyLong_Check(v)) {
9116		    PyErr_SetString(PyExc_TypeError,
9117				    "* wants int");
9118		    goto onError;
9119		}
9120		width = PyLong_AsLong(v);
9121		if (width == -1 && PyErr_Occurred())
9122			goto onError;
9123		if (width < 0) {
9124		    flags |= F_LJUST;
9125		    width = -width;
9126		}
9127		if (--fmtcnt >= 0)
9128		    c = *fmt++;
9129	    }
9130	    else if (c >= '0' && c <= '9') {
9131		width = c - '0';
9132		while (--fmtcnt >= 0) {
9133		    c = *fmt++;
9134		    if (c < '0' || c > '9')
9135			break;
9136		    if ((width*10) / 10 != width) {
9137			PyErr_SetString(PyExc_ValueError,
9138					"width too big");
9139			goto onError;
9140		    }
9141		    width = width*10 + (c - '0');
9142		}
9143	    }
9144	    if (c == '.') {
9145		prec = 0;
9146		if (--fmtcnt >= 0)
9147		    c = *fmt++;
9148		if (c == '*') {
9149		    v = getnextarg(args, arglen, &argidx);
9150		    if (v == NULL)
9151			goto onError;
9152		    if (!PyLong_Check(v)) {
9153			PyErr_SetString(PyExc_TypeError,
9154					"* wants int");
9155			goto onError;
9156		    }
9157		    prec = PyLong_AsLong(v);
9158		    if (prec == -1 && PyErr_Occurred())
9159			goto onError;
9160		    if (prec < 0)
9161			prec = 0;
9162		    if (--fmtcnt >= 0)
9163			c = *fmt++;
9164		}
9165		else if (c >= '0' && c <= '9') {
9166		    prec = c - '0';
9167		    while (--fmtcnt >= 0) {
9168			c = Py_CHARMASK(*fmt++);
9169			if (c < '0' || c > '9')
9170			    break;
9171			if ((prec*10) / 10 != prec) {
9172			    PyErr_SetString(PyExc_ValueError,
9173					    "prec too big");
9174			    goto onError;
9175			}
9176			prec = prec*10 + (c - '0');
9177		    }
9178		}
9179	    } /* prec */
9180	    if (fmtcnt >= 0) {
9181		if (c == 'h' || c == 'l' || c == 'L') {
9182		    if (--fmtcnt >= 0)
9183			c = *fmt++;
9184		}
9185	    }
9186	    if (fmtcnt < 0) {
9187		PyErr_SetString(PyExc_ValueError,
9188				"incomplete format");
9189		goto onError;
9190	    }
9191	    if (c != '%') {
9192		v = getnextarg(args, arglen, &argidx);
9193		if (v == NULL)
9194		    goto onError;
9195	    }
9196	    sign = 0;
9197	    fill = ' ';
9198	    switch (c) {
9199
9200	    case '%':
9201		pbuf = formatbuf;
9202		/* presume that buffer length is at least 1 */
9203		pbuf[0] = '%';
9204		len = 1;
9205		break;
9206
9207	    case 's':
9208	    case 'r':
9209	    case 'a':
9210		if (PyUnicode_Check(v) && c == 's') {
9211		    temp = v;
9212		    Py_INCREF(temp);
9213		}
9214		else {
9215		    if (c == 's')
9216			temp = PyObject_Str(v);
9217		    else if (c == 'r')
9218			temp = PyObject_Repr(v);
9219		    else
9220			temp = PyObject_ASCII(v);
9221		    if (temp == NULL)
9222			goto onError;
9223                    if (PyUnicode_Check(temp))
9224                        /* nothing to do */;
9225		    else {
9226			Py_DECREF(temp);
9227			PyErr_SetString(PyExc_TypeError,
9228					"%s argument has non-string str()");
9229			goto onError;
9230		    }
9231		}
9232		pbuf = PyUnicode_AS_UNICODE(temp);
9233		len = PyUnicode_GET_SIZE(temp);
9234		if (prec >= 0 && len > prec)
9235		    len = prec;
9236		break;
9237
9238	    case 'i':
9239	    case 'd':
9240	    case 'u':
9241	    case 'o':
9242	    case 'x':
9243	    case 'X':
9244		if (c == 'i')
9245		    c = 'd';
9246		isnumok = 0;
9247		if (PyNumber_Check(v)) {
9248			PyObject *iobj=NULL;
9249
9250			if (PyLong_Check(v)) {
9251				iobj = v;
9252				Py_INCREF(iobj);
9253			}
9254			else {
9255				iobj = PyNumber_Long(v);
9256			}
9257			if (iobj!=NULL) {
9258				if (PyLong_Check(iobj)) {
9259					isnumok = 1;
9260					temp = formatlong(iobj, flags, prec, c);
9261					Py_DECREF(iobj);
9262					if (!temp)
9263					    goto onError;
9264					pbuf = PyUnicode_AS_UNICODE(temp);
9265					len = PyUnicode_GET_SIZE(temp);
9266					sign = 1;
9267				}
9268				else {
9269					Py_DECREF(iobj);
9270				}
9271			}
9272		}
9273		if (!isnumok) {
9274			PyErr_Format(PyExc_TypeError,
9275			    "%%%c format: a number is required, "
9276                                     "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9277			goto onError;
9278		}
9279		if (flags & F_ZERO)
9280		    fill = '0';
9281		break;
9282
9283	    case 'e':
9284	    case 'E':
9285	    case 'f':
9286	    case 'F':
9287	    case 'g':
9288	    case 'G':
9289		if (c == 'F')
9290			c = 'f';
9291		pbuf = formatbuf;
9292		len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
9293			flags, prec, c, v);
9294		if (len < 0)
9295		    goto onError;
9296		sign = 1;
9297		if (flags & F_ZERO)
9298		    fill = '0';
9299		break;
9300
9301	    case 'c':
9302		pbuf = formatbuf;
9303		len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9304		if (len < 0)
9305		    goto onError;
9306		break;
9307
9308	    default:
9309		PyErr_Format(PyExc_ValueError,
9310			     "unsupported format character '%c' (0x%x) "
9311			     "at index %zd",
9312			     (31<=c && c<=126) ? (char)c : '?',
9313                             (int)c,
9314			     (Py_ssize_t)(fmt - 1 -
9315					  PyUnicode_AS_UNICODE(uformat)));
9316		goto onError;
9317	    }
9318	    if (sign) {
9319		if (*pbuf == '-' || *pbuf == '+') {
9320		    sign = *pbuf++;
9321		    len--;
9322		}
9323		else if (flags & F_SIGN)
9324		    sign = '+';
9325		else if (flags & F_BLANK)
9326		    sign = ' ';
9327		else
9328		    sign = 0;
9329	    }
9330	    if (width < len)
9331		width = len;
9332	    if (rescnt - (sign != 0) < width) {
9333		reslen -= rescnt;
9334		rescnt = width + fmtcnt + 100;
9335		reslen += rescnt;
9336		if (reslen < 0) {
9337		    Py_XDECREF(temp);
9338		    PyErr_NoMemory();
9339		    goto onError;
9340		}
9341		if (_PyUnicode_Resize(&result, reslen) < 0) {
9342		    Py_XDECREF(temp);
9343		    goto onError;
9344		}
9345		res = PyUnicode_AS_UNICODE(result)
9346		    + reslen - rescnt;
9347	    }
9348	    if (sign) {
9349		if (fill != ' ')
9350		    *res++ = sign;
9351		rescnt--;
9352		if (width > len)
9353		    width--;
9354	    }
9355	    if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9356		assert(pbuf[0] == '0');
9357		assert(pbuf[1] == c);
9358		if (fill != ' ') {
9359		    *res++ = *pbuf++;
9360		    *res++ = *pbuf++;
9361		}
9362		rescnt -= 2;
9363		width -= 2;
9364		if (width < 0)
9365		    width = 0;
9366		len -= 2;
9367	    }
9368	    if (width > len && !(flags & F_LJUST)) {
9369		do {
9370		    --rescnt;
9371		    *res++ = fill;
9372		} while (--width > len);
9373	    }
9374	    if (fill == ' ') {
9375		if (sign)
9376		    *res++ = sign;
9377		if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9378		    assert(pbuf[0] == '0');
9379		    assert(pbuf[1] == c);
9380		    *res++ = *pbuf++;
9381		    *res++ = *pbuf++;
9382		}
9383	    }
9384	    Py_UNICODE_COPY(res, pbuf, len);
9385	    res += len;
9386	    rescnt -= len;
9387	    while (--width >= len) {
9388		--rescnt;
9389		*res++ = ' ';
9390	    }
9391	    if (dict && (argidx < arglen) && c != '%') {
9392		PyErr_SetString(PyExc_TypeError,
9393				"not all arguments converted during string formatting");
9394                Py_XDECREF(temp);
9395		goto onError;
9396	    }
9397	    Py_XDECREF(temp);
9398	} /* '%' */
9399    } /* until end */
9400    if (argidx < arglen && !dict) {
9401	PyErr_SetString(PyExc_TypeError,
9402			"not all arguments converted during string formatting");
9403	goto onError;
9404    }
9405
9406    if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
9407	goto onError;
9408    if (args_owned) {
9409	Py_DECREF(args);
9410    }
9411    Py_DECREF(uformat);
9412    return (PyObject *)result;
9413
9414 onError:
9415    Py_XDECREF(result);
9416    Py_DECREF(uformat);
9417    if (args_owned) {
9418	Py_DECREF(args);
9419    }
9420    return NULL;
9421}
9422
9423static PyObject *
9424unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9425
9426static PyObject *
9427unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9428{
9429        PyObject *x = NULL;
9430	static char *kwlist[] = {"object", "encoding", "errors", 0};
9431	char *encoding = NULL;
9432	char *errors = NULL;
9433
9434	if (type != &PyUnicode_Type)
9435		return unicode_subtype_new(type, args, kwds);
9436	if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
9437					  kwlist, &x, &encoding, &errors))
9438	    return NULL;
9439	if (x == NULL)
9440		return (PyObject *)_PyUnicode_New(0);
9441	if (encoding == NULL && errors == NULL)
9442	    return PyObject_Str(x);
9443	else
9444	return PyUnicode_FromEncodedObject(x, encoding, errors);
9445}
9446
9447static PyObject *
9448unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9449{
9450	PyUnicodeObject *tmp, *pnew;
9451	Py_ssize_t n;
9452
9453	assert(PyType_IsSubtype(type, &PyUnicode_Type));
9454	tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9455	if (tmp == NULL)
9456		return NULL;
9457	assert(PyUnicode_Check(tmp));
9458	pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9459	if (pnew == NULL) {
9460		Py_DECREF(tmp);
9461		return NULL;
9462	}
9463	pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9464	if (pnew->str == NULL) {
9465		_Py_ForgetReference((PyObject *)pnew);
9466		PyObject_Del(pnew);
9467		Py_DECREF(tmp);
9468		return PyErr_NoMemory();
9469	}
9470	Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9471	pnew->length = n;
9472	pnew->hash = tmp->hash;
9473	Py_DECREF(tmp);
9474	return (PyObject *)pnew;
9475}
9476
9477PyDoc_STRVAR(unicode_doc,
9478"str(string[, encoding[, errors]]) -> str\n\
9479\n\
9480Create a new string object from the given encoded string.\n\
9481encoding defaults to the current default string encoding.\n\
9482errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
9483
9484static PyObject *unicode_iter(PyObject *seq);
9485
9486PyTypeObject PyUnicode_Type = {
9487    PyVarObject_HEAD_INIT(&PyType_Type, 0)
9488    "str", 				/* tp_name */
9489    sizeof(PyUnicodeObject), 		/* tp_size */
9490    0, 					/* tp_itemsize */
9491    /* Slots */
9492    (destructor)unicode_dealloc, 	/* tp_dealloc */
9493    0, 					/* tp_print */
9494    0,				 	/* tp_getattr */
9495    0, 					/* tp_setattr */
9496    0, 					/* tp_compare */
9497    unicode_repr, 			/* tp_repr */
9498    &unicode_as_number, 		/* tp_as_number */
9499    &unicode_as_sequence, 		/* tp_as_sequence */
9500    &unicode_as_mapping, 		/* tp_as_mapping */
9501    (hashfunc) unicode_hash, 		/* tp_hash*/
9502    0, 					/* tp_call*/
9503    (reprfunc) unicode_str,	 	/* tp_str */
9504    PyObject_GenericGetAttr, 		/* tp_getattro */
9505    0,			 		/* tp_setattro */
9506    0, 					/* tp_as_buffer */
9507    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
9508        Py_TPFLAGS_UNICODE_SUBCLASS,	/* tp_flags */
9509    unicode_doc,			/* tp_doc */
9510    0,					/* tp_traverse */
9511    0,					/* tp_clear */
9512    PyUnicode_RichCompare,		/* tp_richcompare */
9513    0,					/* tp_weaklistoffset */
9514    unicode_iter,			/* tp_iter */
9515    0,					/* tp_iternext */
9516    unicode_methods,			/* tp_methods */
9517    0,					/* tp_members */
9518    0,					/* tp_getset */
9519    &PyBaseObject_Type,			/* tp_base */
9520    0,					/* tp_dict */
9521    0,					/* tp_descr_get */
9522    0,					/* tp_descr_set */
9523    0,					/* tp_dictoffset */
9524    0,					/* tp_init */
9525    0,					/* tp_alloc */
9526    unicode_new,			/* tp_new */
9527    PyObject_Del,      		/* tp_free */
9528};
9529
9530/* Initialize the Unicode implementation */
9531
9532void _PyUnicode_Init(void)
9533{
9534    int i;
9535
9536    /* XXX - move this array to unicodectype.c ? */
9537    Py_UNICODE linebreak[] = {
9538        0x000A, /* LINE FEED */
9539        0x000D, /* CARRIAGE RETURN */
9540        0x001C, /* FILE SEPARATOR */
9541        0x001D, /* GROUP SEPARATOR */
9542        0x001E, /* RECORD SEPARATOR */
9543        0x0085, /* NEXT LINE */
9544        0x2028, /* LINE SEPARATOR */
9545        0x2029, /* PARAGRAPH SEPARATOR */
9546    };
9547
9548    /* Init the implementation */
9549    free_list = NULL;
9550    numfree = 0;
9551    unicode_empty = _PyUnicode_New(0);
9552    if (!unicode_empty)
9553	return;
9554
9555    for (i = 0; i < 256; i++)
9556	unicode_latin1[i] = NULL;
9557    if (PyType_Ready(&PyUnicode_Type) < 0)
9558	Py_FatalError("Can't initialize 'unicode'");
9559
9560    /* initialize the linebreak bloom filter */
9561    bloom_linebreak = make_bloom_mask(
9562        linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9563        );
9564
9565    PyType_Ready(&EncodingMapType);
9566}
9567
9568/* Finalize the Unicode implementation */
9569
9570int
9571PyUnicode_ClearFreeList(void)
9572{
9573    int freelist_size = numfree;
9574    PyUnicodeObject *u;
9575
9576    for (u = free_list; u != NULL;) {
9577	PyUnicodeObject *v = u;
9578	u = *(PyUnicodeObject **)u;
9579	if (v->str)
9580	    PyObject_DEL(v->str);
9581	Py_XDECREF(v->defenc);
9582	PyObject_Del(v);
9583	numfree--;
9584    }
9585    free_list = NULL;
9586    assert(numfree == 0);
9587    return freelist_size;
9588}
9589
9590void
9591_PyUnicode_Fini(void)
9592{
9593    int i;
9594
9595    Py_XDECREF(unicode_empty);
9596    unicode_empty = NULL;
9597
9598    for (i = 0; i < 256; i++) {
9599	if (unicode_latin1[i]) {
9600	    Py_DECREF(unicode_latin1[i]);
9601	    unicode_latin1[i] = NULL;
9602	}
9603    }
9604    (void)PyUnicode_ClearFreeList();
9605}
9606
9607void
9608PyUnicode_InternInPlace(PyObject **p)
9609{
9610	register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9611	PyObject *t;
9612	if (s == NULL || !PyUnicode_Check(s))
9613		Py_FatalError(
9614		    "PyUnicode_InternInPlace: unicode strings only please!");
9615	/* If it's a subclass, we don't really know what putting
9616	   it in the interned dict might do. */
9617	if (!PyUnicode_CheckExact(s))
9618		return;
9619	if (PyUnicode_CHECK_INTERNED(s))
9620		return;
9621	if (interned == NULL) {
9622		interned = PyDict_New();
9623		if (interned == NULL) {
9624			PyErr_Clear(); /* Don't leave an exception */
9625			return;
9626		}
9627	}
9628	/* It might be that the GetItem call fails even
9629	   though the key is present in the dictionary,
9630	   namely when this happens during a stack overflow. */
9631	Py_ALLOW_RECURSION
9632	t = PyDict_GetItem(interned, (PyObject *)s);
9633	Py_END_ALLOW_RECURSION
9634
9635	if (t) {
9636		Py_INCREF(t);
9637		Py_DECREF(*p);
9638		*p = t;
9639		return;
9640	}
9641
9642	PyThreadState_GET()->recursion_critical = 1;
9643	if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9644		PyErr_Clear();
9645		PyThreadState_GET()->recursion_critical = 0;
9646		return;
9647	}
9648	PyThreadState_GET()->recursion_critical = 0;
9649	/* The two references in interned are not counted by refcnt.
9650	   The deallocator will take care of this */
9651	Py_REFCNT(s) -= 2;
9652	PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
9653}
9654
9655void
9656PyUnicode_InternImmortal(PyObject **p)
9657{
9658	PyUnicode_InternInPlace(p);
9659	if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9660		PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9661		Py_INCREF(*p);
9662	}
9663}
9664
9665PyObject *
9666PyUnicode_InternFromString(const char *cp)
9667{
9668	PyObject *s = PyUnicode_FromString(cp);
9669	if (s == NULL)
9670		return NULL;
9671	PyUnicode_InternInPlace(&s);
9672	return s;
9673}
9674
9675void _Py_ReleaseInternedUnicodeStrings(void)
9676{
9677	PyObject *keys;
9678	PyUnicodeObject *s;
9679	Py_ssize_t i, n;
9680	Py_ssize_t immortal_size = 0, mortal_size = 0;
9681
9682	if (interned == NULL || !PyDict_Check(interned))
9683		return;
9684	keys = PyDict_Keys(interned);
9685	if (keys == NULL || !PyList_Check(keys)) {
9686		PyErr_Clear();
9687		return;
9688	}
9689
9690	/* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9691	   detector, interned unicode strings are not forcibly deallocated;
9692	   rather, we give them their stolen references back, and then clear
9693	   and DECREF the interned dict. */
9694
9695	n = PyList_GET_SIZE(keys);
9696	fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9697		n);
9698	for (i = 0; i < n; i++) {
9699		s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9700		switch (s->state) {
9701		case SSTATE_NOT_INTERNED:
9702			/* XXX Shouldn't happen */
9703			break;
9704		case SSTATE_INTERNED_IMMORTAL:
9705			Py_REFCNT(s) += 1;
9706			immortal_size += s->length;
9707			break;
9708		case SSTATE_INTERNED_MORTAL:
9709			Py_REFCNT(s) += 2;
9710			mortal_size += s->length;
9711			break;
9712		default:
9713			Py_FatalError("Inconsistent interned string state.");
9714		}
9715		s->state = SSTATE_NOT_INTERNED;
9716	}
9717	fprintf(stderr, "total size of all interned strings: "
9718			"%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9719			"mortal/immortal\n", mortal_size, immortal_size);
9720	Py_DECREF(keys);
9721	PyDict_Clear(interned);
9722	Py_DECREF(interned);
9723	interned = NULL;
9724}
9725
9726
9727/********************* Unicode Iterator **************************/
9728
9729typedef struct {
9730	PyObject_HEAD
9731	Py_ssize_t it_index;
9732	PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
9733} unicodeiterobject;
9734
9735static void
9736unicodeiter_dealloc(unicodeiterobject *it)
9737{
9738	_PyObject_GC_UNTRACK(it);
9739	Py_XDECREF(it->it_seq);
9740	PyObject_GC_Del(it);
9741}
9742
9743static int
9744unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9745{
9746	Py_VISIT(it->it_seq);
9747	return 0;
9748}
9749
9750static PyObject *
9751unicodeiter_next(unicodeiterobject *it)
9752{
9753	PyUnicodeObject *seq;
9754	PyObject *item;
9755
9756	assert(it != NULL);
9757	seq = it->it_seq;
9758	if (seq == NULL)
9759		return NULL;
9760	assert(PyUnicode_Check(seq));
9761
9762	if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9763		item = PyUnicode_FromUnicode(
9764                    PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
9765		if (item != NULL)
9766			++it->it_index;
9767		return item;
9768	}
9769
9770	Py_DECREF(seq);
9771	it->it_seq = NULL;
9772	return NULL;
9773}
9774
9775static PyObject *
9776unicodeiter_len(unicodeiterobject *it)
9777{
9778	Py_ssize_t len = 0;
9779	if (it->it_seq)
9780		len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9781	return PyLong_FromSsize_t(len);
9782}
9783
9784PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9785
9786static PyMethodDef unicodeiter_methods[] = {
9787	{"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
9788         length_hint_doc},
9789 	{NULL,		NULL}		/* sentinel */
9790};
9791
9792PyTypeObject PyUnicodeIter_Type = {
9793	PyVarObject_HEAD_INIT(&PyType_Type, 0)
9794	"str_iterator",			/* tp_name */
9795	sizeof(unicodeiterobject),		/* tp_basicsize */
9796	0,					/* tp_itemsize */
9797	/* methods */
9798	(destructor)unicodeiter_dealloc,	/* tp_dealloc */
9799	0,					/* tp_print */
9800	0,					/* tp_getattr */
9801	0,					/* tp_setattr */
9802	0,					/* tp_compare */
9803	0,					/* tp_repr */
9804	0,					/* tp_as_number */
9805	0,					/* tp_as_sequence */
9806	0,					/* tp_as_mapping */
9807	0,					/* tp_hash */
9808	0,					/* tp_call */
9809	0,					/* tp_str */
9810	PyObject_GenericGetAttr,		/* tp_getattro */
9811	0,					/* tp_setattro */
9812	0,					/* tp_as_buffer */
9813	Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9814	0,					/* tp_doc */
9815	(traverseproc)unicodeiter_traverse,	/* tp_traverse */
9816	0,					/* tp_clear */
9817	0,					/* tp_richcompare */
9818	0,					/* tp_weaklistoffset */
9819	PyObject_SelfIter,			/* tp_iter */
9820	(iternextfunc)unicodeiter_next,		/* tp_iternext */
9821	unicodeiter_methods,			/* tp_methods */
9822	0,
9823};
9824
9825static PyObject *
9826unicode_iter(PyObject *seq)
9827{
9828	unicodeiterobject *it;
9829
9830	if (!PyUnicode_Check(seq)) {
9831		PyErr_BadInternalCall();
9832		return NULL;
9833	}
9834	it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9835	if (it == NULL)
9836		return NULL;
9837	it->it_index = 0;
9838	Py_INCREF(seq);
9839	it->it_seq = (PyUnicodeObject *)seq;
9840	_PyObject_GC_TRACK(it);
9841	return (PyObject *)it;
9842}
9843
9844size_t
9845Py_UNICODE_strlen(const Py_UNICODE *u)
9846{
9847    int res = 0;
9848    while(*u++)
9849        res++;
9850    return res;
9851}
9852
9853Py_UNICODE*
9854Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9855{
9856    Py_UNICODE *u = s1;
9857    while ((*u++ = *s2++));
9858    return s1;
9859}
9860
9861Py_UNICODE*
9862Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9863{
9864    Py_UNICODE *u = s1;
9865    while ((*u++ = *s2++))
9866        if (n-- == 0)
9867            break;
9868    return s1;
9869}
9870
9871int
9872Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9873{
9874    while (*s1 && *s2 && *s1 == *s2)
9875        s1++, s2++;
9876    if (*s1 && *s2)
9877        return (*s1 < *s2) ? -1 : +1;
9878    if (*s1)
9879        return 1;
9880    if (*s2)
9881        return -1;
9882    return 0;
9883}
9884
9885Py_UNICODE*
9886Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9887{
9888    const Py_UNICODE *p;
9889    for (p = s; *p; p++)
9890        if (*p == c)
9891            return (Py_UNICODE*)p;
9892    return NULL;
9893}
9894
9895
9896#ifdef __cplusplus
9897}
9898#endif
9899
9900
9901/*
9902Local variables:
9903c-basic-offset: 4
9904indent-tabs-mode: nil
9905End:
9906*/
9907