unicodeobject.c revision c2504931ee6bb19b4d38d0d654b02a6fbc797ebd
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5Unicode Integration Proposal (see file Misc/unicode.txt).
6
7Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
10Copyright (c) Corporation for National Research Initiatives.
11
12--------------------------------------------------------------------
13The original string type implementation is:
14
15    Copyright (c) 1999 by Secret Labs AB
16    Copyright (c) 1999 by Fredrik Lundh
17
18By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
41
42#define PY_SSIZE_T_CLEAN
43#include "Python.h"
44
45#include "unicodeobject.h"
46#include "ucnhash.h"
47
48#include "formatter_unicode.h"
49
50#ifdef MS_WINDOWS
51#include <windows.h>
52#endif
53
54/* Limit for the Unicode object free list */
55
56#define MAX_UNICODE_FREELIST_SIZE       1024
57
58/* Limit for the Unicode object free list stay alive optimization.
59
60   The implementation will keep allocated Unicode memory intact for
61   all objects on the free list having a size less than this
62   limit. This reduces malloc() overhead for small Unicode objects.
63
64   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
65   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
66   malloc()-overhead) bytes of unused garbage.
67
68   Setting the limit to 0 effectively turns the feature off.
69
70   Note: This is an experimental feature ! If you get core dumps when
71   using Unicode objects, turn this feature off.
72
73*/
74
75#define KEEPALIVE_SIZE_LIMIT       9
76
77/* Endianness switches; defaults to little endian */
78
79#ifdef WORDS_BIGENDIAN
80# define BYTEORDER_IS_BIG_ENDIAN
81#else
82# define BYTEORDER_IS_LITTLE_ENDIAN
83#endif
84
85/* --- Globals ------------------------------------------------------------
86
87   The globals are initialized by the _PyUnicode_Init() API and should
88   not be used before calling that API.
89
90*/
91
92
93#ifdef __cplusplus
94extern "C" {
95#endif
96
97/* This dictionary holds all interned unicode strings.  Note that references
98   to strings in this dictionary are *not* counted in the string's ob_refcnt.
99   When the interned string reaches a refcnt of 0 the string deallocation
100   function will delete the reference from this dictionary.
101
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->ob_sstate?2:0)
104*/
105static PyObject *interned;
106
107/* Free list for Unicode objects */
108static PyUnicodeObject *unicode_freelist;
109static int unicode_freelist_size;
110
111/* The empty Unicode object is shared to improve performance. */
112static PyUnicodeObject *unicode_empty;
113
114/* Single character Unicode strings in the Latin-1 range are being
115   shared as well. */
116static PyUnicodeObject *unicode_latin1[256];
117
118/* Default encoding to use and assume when NULL is passed as encoding
119   parameter; it is fixed to "utf-8".  Always use the
120   PyUnicode_GetDefaultEncoding() API to access this global. */
121static const char unicode_default_encoding[] = "utf-8";
122
123Py_UNICODE
124PyUnicode_GetMax(void)
125{
126#ifdef Py_UNICODE_WIDE
127	return 0x10FFFF;
128#else
129	/* This is actually an illegal character, so it should
130	   not be passed to unichr. */
131	return 0xFFFF;
132#endif
133}
134
135/* --- Bloom Filters ----------------------------------------------------- */
136
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the 5 least
   significant bits of each Unicode character as the bit index. */
140
/* the linebreak mask is set up by _PyUnicode_Init() */
142
143#define BLOOM_MASK unsigned long
144
145static BLOOM_MASK bloom_linebreak;
146
/* Test whether the bit selected by the low 5 bits of ch is set in mask.
   The shifted constant is unsigned long so that bit 31 can be selected
   without overflowing a signed int, and mask is parenthesized so that
   expression arguments bind as expected. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))
148
149#define BLOOM_LINEBREAK(ch)\
150    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
151
152Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
153{
154    /* calculate simple bloom-style bitmask for a given unicode string */
155
156    long mask;
157    Py_ssize_t i;
158
159    mask = 0;
160    for (i = 0; i < len; i++)
161        mask |= (1 << (ptr[i] & 0x1F));
162
163    return mask;
164}
165
166Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
167{
168    Py_ssize_t i;
169
170    for (i = 0; i < setlen; i++)
171        if (set[i] == chr)
172            return 1;
173
174    return 0;
175}
176
/* Membership test with a bloom-filter fast path: unicode_member() is
   only called when the mask says chr might be in set.  The whole
   expansion is parenthesized so the macro can be negated or combined
   with other operators safely. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
179
180/* --- Unicode Object ----------------------------------------------------- */
181
182static
183int unicode_resize(register PyUnicodeObject *unicode,
184                      Py_ssize_t length)
185{
186    void *oldstr;
187
188    /* Shortcut if there's nothing much to do. */
189    if (unicode->length == length)
190	goto reset;
191
192    /* Resizing shared object (unicode_empty or single character
193       objects) in-place is not allowed. Use PyUnicode_Resize()
194       instead ! */
195
196    if (unicode == unicode_empty ||
197	(unicode->length == 1 &&
198	 unicode->str[0] < 256U &&
199	 unicode_latin1[unicode->str[0]] == unicode)) {
200        PyErr_SetString(PyExc_SystemError,
201                        "can't resize shared unicode objects");
202        return -1;
203    }
204
205    /* We allocate one more byte to make sure the string is Ux0000 terminated.
206       The overallocation is also used by fastsearch, which assumes that it's
207       safe to look at str[length] (without making any assumptions about what
208       it contains). */
209
210    oldstr = unicode->str;
211    PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
212    if (!unicode->str) {
213	unicode->str = (Py_UNICODE *)oldstr;
214        PyErr_NoMemory();
215        return -1;
216    }
217    unicode->str[length] = 0;
218    unicode->length = length;
219
220 reset:
221    /* Reset the object caches */
222    if (unicode->defenc) {
223        Py_DECREF(unicode->defenc);
224        unicode->defenc = NULL;
225    }
226    unicode->hash = -1;
227
228    return 0;
229}
230
231/* We allocate one more byte to make sure the string is
232   Ux0000 terminated; some code (e.g. new_identifier)
233   relies on that.
234
235   XXX This allocator could further be enhanced by assuring that the
236       free list never reduces its size below 1.
237
238*/
239
240static
241PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
242{
243    register PyUnicodeObject *unicode;
244
245    /* Optimization for empty strings */
246    if (length == 0 && unicode_empty != NULL) {
247        Py_INCREF(unicode_empty);
248        return unicode_empty;
249    }
250
251    /* Unicode freelist & memory allocation */
252    if (unicode_freelist) {
253        unicode = unicode_freelist;
254        unicode_freelist = *(PyUnicodeObject **)unicode;
255        unicode_freelist_size--;
256	if (unicode->str) {
257	    /* Keep-Alive optimization: we only upsize the buffer,
258	       never downsize it. */
259	    if ((unicode->length < length) &&
260                unicode_resize(unicode, length) < 0) {
261		PyMem_DEL(unicode->str);
262		goto onError;
263	    }
264	}
265        else {
266	    unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
267        }
268        PyObject_INIT(unicode, &PyUnicode_Type);
269    }
270    else {
271        unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
272        if (unicode == NULL)
273            return NULL;
274	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
275    }
276
277    if (!unicode->str) {
278	PyErr_NoMemory();
279	goto onError;
280    }
281    /* Initialize the first element to guard against cases where
282     * the caller fails before initializing str -- unicode_resize()
283     * reads str[0], and the Keep-Alive optimization can keep memory
284     * allocated for str alive across a call to unicode_dealloc(unicode).
285     * We don't want unicode_resize to read uninitialized memory in
286     * that case.
287     */
288    unicode->str[0] = 0;
289    unicode->str[length] = 0;
290    unicode->length = length;
291    unicode->hash = -1;
292    unicode->state = 0;
293    unicode->defenc = NULL;
294    return unicode;
295
296 onError:
297    _Py_ForgetReference((PyObject *)unicode);
298    PyObject_Del(unicode);
299    return NULL;
300}
301
302static
303void unicode_dealloc(register PyUnicodeObject *unicode)
304{
305    switch (PyUnicode_CHECK_INTERNED(unicode)) {
306        case SSTATE_NOT_INTERNED:
307            break;
308
309        case SSTATE_INTERNED_MORTAL:
310            /* revive dead object temporarily for DelItem */
311            Py_Refcnt(unicode) = 3;
312            if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
313                Py_FatalError(
314                    "deletion of interned unicode string failed");
315            break;
316
317        case SSTATE_INTERNED_IMMORTAL:
318            Py_FatalError("Immortal interned unicode string died.");
319
320        default:
321            Py_FatalError("Inconsistent interned unicode string state.");
322    }
323
324    if (PyUnicode_CheckExact(unicode) &&
325	unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
326        /* Keep-Alive optimization */
327	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
328	    PyMem_DEL(unicode->str);
329	    unicode->str = NULL;
330	    unicode->length = 0;
331	}
332	if (unicode->defenc) {
333	    Py_DECREF(unicode->defenc);
334	    unicode->defenc = NULL;
335	}
336	/* Add to free list */
337        *(PyUnicodeObject **)unicode = unicode_freelist;
338        unicode_freelist = unicode;
339        unicode_freelist_size++;
340    }
341    else {
342	PyMem_DEL(unicode->str);
343	Py_XDECREF(unicode->defenc);
344	Py_Type(unicode)->tp_free((PyObject *)unicode);
345    }
346}
347
348int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
349{
350    register PyUnicodeObject *v;
351
352    /* Argument checks */
353    if (unicode == NULL) {
354	PyErr_BadInternalCall();
355	return -1;
356    }
357    v = (PyUnicodeObject *)*unicode;
358    if (v == NULL || !PyUnicode_Check(v) || Py_Refcnt(v) != 1 || length < 0) {
359	PyErr_BadInternalCall();
360	return -1;
361    }
362
363    /* Resizing unicode_empty and single character objects is not
364       possible since these are being shared. We simply return a fresh
365       copy with the same Unicode content. */
366    if (v->length != length &&
367	(v == unicode_empty || v->length == 1)) {
368	PyUnicodeObject *w = _PyUnicode_New(length);
369	if (w == NULL)
370	    return -1;
371	Py_UNICODE_COPY(w->str, v->str,
372			length < v->length ? length : v->length);
373	Py_DECREF(*unicode);
374	*unicode = (PyObject *)w;
375	return 0;
376    }
377
378    /* Note that we don't have to modify *unicode for unshared Unicode
379       objects, since we can modify them in-place. */
380    return unicode_resize(v, length);
381}
382
/* Internal API for use in unicodeobject.c only !
   Forwards to PyUnicode_Resize(), casting the address of the caller's
   object pointer variable to PyObject **. */
#define _PyUnicode_Resize(unicodevar, length) \
        PyUnicode_Resize((PyObject **)(unicodevar), (length))
386
387PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
388				Py_ssize_t size)
389{
390    PyUnicodeObject *unicode;
391
392    /* If the Unicode data is known at construction time, we can apply
393       some optimizations which share commonly used objects. */
394    if (u != NULL) {
395
396	/* Optimization for empty strings */
397	if (size == 0 && unicode_empty != NULL) {
398	    Py_INCREF(unicode_empty);
399	    return (PyObject *)unicode_empty;
400	}
401
402	/* Single character Unicode objects in the Latin-1 range are
403	   shared when using this constructor */
404	if (size == 1 && *u < 256) {
405	    unicode = unicode_latin1[*u];
406	    if (!unicode) {
407		unicode = _PyUnicode_New(1);
408		if (!unicode)
409		    return NULL;
410		unicode->str[0] = *u;
411		unicode_latin1[*u] = unicode;
412	    }
413	    Py_INCREF(unicode);
414	    return (PyObject *)unicode;
415	}
416    }
417
418    unicode = _PyUnicode_New(size);
419    if (!unicode)
420        return NULL;
421
422    /* Copy the Unicode data into the new object */
423    if (u != NULL)
424	Py_UNICODE_COPY(unicode->str, u, size);
425
426    return (PyObject *)unicode;
427}
428
429PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
430{
431    PyUnicodeObject *unicode;
432    /* If the Unicode data is known at construction time, we can apply
433       some optimizations which share commonly used objects.
434       Also, this means the input must be UTF-8, so fall back to the
435       UTF-8 decoder at the end. */
436    if (u != NULL) {
437
438	/* Optimization for empty strings */
439	if (size == 0 && unicode_empty != NULL) {
440	    Py_INCREF(unicode_empty);
441	    return (PyObject *)unicode_empty;
442	}
443
444	/* Single characters are shared when using this constructor.
445           Restrict to ASCII, since the input must be UTF-8. */
446	if (size == 1 && Py_CHARMASK(*u) < 128) {
447	    unicode = unicode_latin1[Py_CHARMASK(*u)];
448	    if (!unicode) {
449		unicode = _PyUnicode_New(1);
450		if (!unicode)
451		    return NULL;
452		unicode->str[0] = Py_CHARMASK(*u);
453		unicode_latin1[Py_CHARMASK(*u)] = unicode;
454	    }
455	    Py_INCREF(unicode);
456	    return (PyObject *)unicode;
457	}
458
459        return PyUnicode_DecodeUTF8(u, size, NULL);
460    }
461
462    unicode = _PyUnicode_New(size);
463    if (!unicode)
464        return NULL;
465
466    return (PyObject *)unicode;
467}
468
469PyObject *PyUnicode_FromString(const char *u)
470{
471    size_t size = strlen(u);
472    if (size > PY_SSIZE_T_MAX) {
473        PyErr_SetString(PyExc_OverflowError, "input too long");
474        return NULL;
475    }
476
477    return PyUnicode_FromStringAndSize(u, size);
478}
479
480#ifdef HAVE_WCHAR_H
481
482PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
483				 Py_ssize_t size)
484{
485    PyUnicodeObject *unicode;
486
487    if (w == NULL) {
488	PyErr_BadInternalCall();
489	return NULL;
490    }
491
492    unicode = _PyUnicode_New(size);
493    if (!unicode)
494        return NULL;
495
496    /* Copy the wchar_t data into the new object */
497#ifdef HAVE_USABLE_WCHAR_T
498    memcpy(unicode->str, w, size * sizeof(wchar_t));
499#else
500    {
501	register Py_UNICODE *u;
502	register Py_ssize_t i;
503	u = PyUnicode_AS_UNICODE(unicode);
504	for (i = size; i > 0; i--)
505	    *u++ = *w++;
506    }
507#endif
508
509    return (PyObject *)unicode;
510}
511
512static void
513makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
514{
515	*fmt++ = '%';
516	if (width) {
517		if (zeropad)
518			*fmt++ = '0';
519		fmt += sprintf(fmt, "%d", width);
520	}
521	if (precision)
522		fmt += sprintf(fmt, ".%d", precision);
523	if (longflag)
524		*fmt++ = 'l';
525	else if (size_tflag) {
526		char *f = PY_FORMAT_SIZE_T;
527		while (*f)
528			*fmt++ = *f++;
529	}
530	*fmt++ = c;
531	*fmt = '\0';
532}
533
534#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
535
536PyObject *
537PyUnicode_FromFormatV(const char *format, va_list vargs)
538{
539	va_list count;
540	Py_ssize_t callcount = 0;
541	PyObject **callresults = NULL;
542	PyObject **callresult = NULL;
543	Py_ssize_t n = 0;
544	int width = 0;
545	int precision = 0;
546	int zeropad;
547	const char* f;
548	Py_UNICODE *s;
549	PyObject *string;
550	/* used by sprintf */
551	char buffer[21];
552	/* use abuffer instead of buffer, if we need more space
553	 * (which can happen if there's a format specifier with width). */
554	char *abuffer = NULL;
555	char *realbuffer;
556	Py_ssize_t abuffersize = 0;
557	char fmt[60]; /* should be enough for %0width.precisionld */
558	const char *copy;
559
560#ifdef VA_LIST_IS_ARRAY
561	Py_MEMCPY(count, vargs, sizeof(va_list));
562#else
563#ifdef  __va_copy
564	__va_copy(count, vargs);
565#else
566	count = vargs;
567#endif
568#endif
569	/* step 1: count the number of %S/%R format specifications
570	 * (we call PyObject_Unicode()/PyObject_Repr() for these objects
571	 * once during step 3 and put the result in an array) */
572	for (f = format; *f; f++) {
573		if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
574			++callcount;
575	}
576	/* step 2: allocate memory for the results of
577	 * PyObject_Unicode()/PyObject_Repr() calls */
578	if (callcount) {
579		callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
580		if (!callresults) {
581			PyErr_NoMemory();
582			return NULL;
583		}
584		callresult = callresults;
585	}
586	/* step 3: figure out how large a buffer we need */
587	for (f = format; *f; f++) {
588		if (*f == '%') {
589			const char* p = f;
590			width = 0;
591			while (isdigit(Py_CHARMASK(*f)))
592				width = (width*10) + *f++ - '0';
593			while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
594				;
595
596			/* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
597			 * they don't affect the amount of space we reserve.
598			 */
599			if ((*f == 'l' || *f == 'z') &&
600					(f[1] == 'd' || f[1] == 'u'))
601                                ++f;
602
603			switch (*f) {
604			case 'c':
605				(void)va_arg(count, int);
606				/* fall through... */
607			case '%':
608				n++;
609				break;
610			case 'd': case 'u': case 'i': case 'x':
611				(void) va_arg(count, int);
612				/* 20 bytes is enough to hold a 64-bit
613				   integer.  Decimal takes the most space.
614				   This isn't enough for octal.
615				   If a width is specified we need more
616				   (which we allocate later). */
617				if (width < 20)
618					width = 20;
619				n += width;
620				if (abuffersize < width)
621					abuffersize = width;
622				break;
623			case 's':
624			{
625				/* UTF-8 */
626				unsigned char*s;
627				s = va_arg(count, unsigned char*);
628				while (*s) {
629					if (*s < 128) {
630						n++; s++;
631					} else if (*s < 0xc0) {
632						/* invalid UTF-8 */
633						n++; s++;
634					} else if (*s < 0xc0) {
635						n++;
636						s++; if(!*s)break;
637						s++;
638					} else if (*s < 0xe0) {
639						n++;
640						s++; if(!*s)break;
641						s++; if(!*s)break;
642						s++;
643					} else {
644						#ifdef Py_UNICODE_WIDE
645						n++;
646						#else
647						n+=2;
648						#endif
649						s++; if(!*s)break;
650						s++; if(!*s)break;
651						s++; if(!*s)break;
652						s++;
653					}
654				}
655				break;
656			}
657			case 'U':
658			{
659				PyObject *obj = va_arg(count, PyObject *);
660				assert(obj && PyUnicode_Check(obj));
661				n += PyUnicode_GET_SIZE(obj);
662				break;
663			}
664			case 'V':
665			{
666				PyObject *obj = va_arg(count, PyObject *);
667				const char *str = va_arg(count, const char *);
668				assert(obj || str);
669				assert(!obj || PyUnicode_Check(obj));
670				if (obj)
671					n += PyUnicode_GET_SIZE(obj);
672				else
673					n += strlen(str);
674				break;
675			}
676			case 'S':
677			{
678				PyObject *obj = va_arg(count, PyObject *);
679				PyObject *str;
680				assert(obj);
681				str = PyObject_Unicode(obj);
682				if (!str)
683					goto fail;
684				n += PyUnicode_GET_SIZE(str);
685				/* Remember the str and switch to the next slot */
686				*callresult++ = str;
687				break;
688			}
689			case 'R':
690			{
691				PyObject *obj = va_arg(count, PyObject *);
692				PyObject *repr;
693				assert(obj);
694				repr = PyObject_Repr(obj);
695				if (!repr)
696					goto fail;
697				n += PyUnicode_GET_SIZE(repr);
698				/* Remember the repr and switch to the next slot */
699				*callresult++ = repr;
700				break;
701			}
702			case 'p':
703				(void) va_arg(count, int);
704				/* maximum 64-bit pointer representation:
705				 * 0xffffffffffffffff
706				 * so 19 characters is enough.
707				 * XXX I count 18 -- what's the extra for?
708				 */
709				n += 19;
710				break;
711			default:
712				/* if we stumble upon an unknown
713				   formatting code, copy the rest of
714				   the format string to the output
715				   string. (we cannot just skip the
716				   code, since there's no way to know
717				   what's in the argument list) */
718				n += strlen(p);
719				goto expand;
720			}
721		} else
722			n++;
723	}
724 expand:
725	if (abuffersize > 20) {
726		abuffer = PyMem_Malloc(abuffersize);
727		if (!abuffer) {
728			PyErr_NoMemory();
729			goto fail;
730		}
731		realbuffer = abuffer;
732	}
733	else
734		realbuffer = buffer;
735	/* step 4: fill the buffer */
736	/* Since we've analyzed how much space we need for the worst case,
737	   we don't have to resize the string.
738	   There can be no errors beyond this point. */
739	string = PyUnicode_FromUnicode(NULL, n);
740	if (!string)
741		goto fail;
742
743	s = PyUnicode_AS_UNICODE(string);
744	callresult = callresults;
745
746	for (f = format; *f; f++) {
747		if (*f == '%') {
748			const char* p = f++;
749			int longflag = 0;
750			int size_tflag = 0;
751			zeropad = (*f == '0');
752			/* parse the width.precision part */
753			width = 0;
754			while (isdigit(Py_CHARMASK(*f)))
755				width = (width*10) + *f++ - '0';
756			precision = 0;
757			if (*f == '.') {
758				f++;
759				while (isdigit(Py_CHARMASK(*f)))
760					precision = (precision*10) + *f++ - '0';
761			}
762			/* handle the long flag, but only for %ld and %lu.
763			   others can be added when necessary. */
764			if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
765				longflag = 1;
766				++f;
767			}
768			/* handle the size_t flag. */
769			if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
770				size_tflag = 1;
771				++f;
772			}
773
774			switch (*f) {
775			case 'c':
776				*s++ = va_arg(vargs, int);
777				break;
778			case 'd':
779				makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
780				if (longflag)
781					sprintf(realbuffer, fmt, va_arg(vargs, long));
782				else if (size_tflag)
783					sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
784				else
785					sprintf(realbuffer, fmt, va_arg(vargs, int));
786				appendstring(realbuffer);
787				break;
788			case 'u':
789				makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
790				if (longflag)
791					sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
792				else if (size_tflag)
793					sprintf(realbuffer, fmt, va_arg(vargs, size_t));
794				else
795					sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
796				appendstring(realbuffer);
797				break;
798			case 'i':
799				makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
800				sprintf(realbuffer, fmt, va_arg(vargs, int));
801				appendstring(realbuffer);
802				break;
803			case 'x':
804				makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
805				sprintf(realbuffer, fmt, va_arg(vargs, int));
806				appendstring(realbuffer);
807				break;
808			case 's':
809			{
810				/* Parameter must be UTF-8 encoded.
811				   In case of encoding errors, use
812				   the replacement character. */
813				PyObject *u;
814				p = va_arg(vargs, char*);
815				u = PyUnicode_DecodeUTF8(p, strlen(p),
816							 "replace");
817				if (!u)
818					goto fail;
819				Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
820						PyUnicode_GET_SIZE(u));
821				s += PyUnicode_GET_SIZE(u);
822				Py_DECREF(u);
823				break;
824			}
825			case 'U':
826			{
827				PyObject *obj = va_arg(vargs, PyObject *);
828				Py_ssize_t size = PyUnicode_GET_SIZE(obj);
829				Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
830				s += size;
831				break;
832			}
833			case 'V':
834			{
835				PyObject *obj = va_arg(vargs, PyObject *);
836				const char *str = va_arg(vargs, const char *);
837				if (obj) {
838					Py_ssize_t size = PyUnicode_GET_SIZE(obj);
839					Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
840					s += size;
841				} else {
842					appendstring(str);
843				}
844				break;
845			}
846			case 'S':
847			case 'R':
848			{
849				Py_UNICODE *ucopy;
850				Py_ssize_t usize;
851				Py_ssize_t upos;
852				/* unused, since we already have the result */
853				(void) va_arg(vargs, PyObject *);
854				ucopy = PyUnicode_AS_UNICODE(*callresult);
855				usize = PyUnicode_GET_SIZE(*callresult);
856				for (upos = 0; upos<usize;)
857					*s++ = ucopy[upos++];
858				/* We're done with the unicode()/repr() => forget it */
859				Py_DECREF(*callresult);
860				/* switch to next unicode()/repr() result */
861				++callresult;
862				break;
863			}
864			case 'p':
865				sprintf(buffer, "%p", va_arg(vargs, void*));
866				/* %p is ill-defined:  ensure leading 0x. */
867				if (buffer[1] == 'X')
868					buffer[1] = 'x';
869				else if (buffer[1] != 'x') {
870					memmove(buffer+2, buffer, strlen(buffer)+1);
871					buffer[0] = '0';
872					buffer[1] = 'x';
873				}
874				appendstring(buffer);
875				break;
876			case '%':
877				*s++ = '%';
878				break;
879			default:
880				appendstring(p);
881				goto end;
882			}
883		} else
884			*s++ = *f;
885	}
886
887 end:
888	if (callresults)
889		PyMem_Free(callresults);
890	if (abuffer)
891		PyMem_Free(abuffer);
892	_PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
893	return string;
894 fail:
895	if (callresults) {
896		PyObject **callresult2 = callresults;
897		while (callresult2 < callresult) {
898			Py_DECREF(*callresult2);
899			++callresult2;
900		}
901		PyMem_Free(callresults);
902	}
903	if (abuffer)
904		PyMem_Free(abuffer);
905	return NULL;
906}
907
908#undef appendstring
909
910PyObject *
911PyUnicode_FromFormat(const char *format, ...)
912{
913	PyObject* ret;
914	va_list vargs;
915
916#ifdef HAVE_STDARG_PROTOTYPES
917	va_start(vargs, format);
918#else
919	va_start(vargs);
920#endif
921	ret = PyUnicode_FromFormatV(format, vargs);
922	va_end(vargs);
923	return ret;
924}
925
926Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
927				wchar_t *w,
928				Py_ssize_t size)
929{
930    if (unicode == NULL) {
931	PyErr_BadInternalCall();
932	return -1;
933    }
934
935    /* If possible, try to copy the 0-termination as well */
936    if (size > PyUnicode_GET_SIZE(unicode))
937	size = PyUnicode_GET_SIZE(unicode) + 1;
938
939#ifdef HAVE_USABLE_WCHAR_T
940    memcpy(w, unicode->str, size * sizeof(wchar_t));
941#else
942    {
943	register Py_UNICODE *u;
944	register Py_ssize_t i;
945	u = PyUnicode_AS_UNICODE(unicode);
946	for (i = size; i > 0; i--)
947	    *w++ = *u++;
948    }
949#endif
950
951    if (size > PyUnicode_GET_SIZE(unicode))
952        return PyUnicode_GET_SIZE(unicode);
953    else
954    return size;
955}
956
957#endif
958
959PyObject *PyUnicode_FromOrdinal(int ordinal)
960{
961    Py_UNICODE s[2];
962
963    if (ordinal < 0 || ordinal > 0x10ffff) {
964	PyErr_SetString(PyExc_ValueError,
965			"chr() arg not in range(0x110000)");
966	return NULL;
967    }
968
969#ifndef Py_UNICODE_WIDE
970    if (ordinal > 0xffff) {
971        ordinal -= 0x10000;
972        s[0] = 0xD800 | (ordinal >> 10);
973        s[1] = 0xDC00 | (ordinal & 0x3FF);
974        return PyUnicode_FromUnicode(s, 2);
975    }
976#endif
977
978    s[0] = (Py_UNICODE)ordinal;
979    return PyUnicode_FromUnicode(s, 1);
980}
981
982PyObject *PyUnicode_FromObject(register PyObject *obj)
983{
984    /* XXX Perhaps we should make this API an alias of
985           PyObject_Unicode() instead ?! */
986    if (PyUnicode_CheckExact(obj)) {
987	Py_INCREF(obj);
988	return obj;
989    }
990    if (PyUnicode_Check(obj)) {
991	/* For a Unicode subtype that's not a Unicode object,
992	   return a true Unicode object with the same data. */
993	return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
994				     PyUnicode_GET_SIZE(obj));
995    }
996    return PyUnicode_FromEncodedObject(obj, NULL, "strict");
997}
998
999PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1000				      const char *encoding,
1001				      const char *errors)
1002{
1003    const char *s = NULL;
1004    Py_ssize_t len;
1005    PyObject *v;
1006
1007    if (obj == NULL) {
1008	PyErr_BadInternalCall();
1009	return NULL;
1010    }
1011
1012    if (PyUnicode_Check(obj)) {
1013	PyErr_SetString(PyExc_TypeError,
1014			"decoding Unicode is not supported");
1015	return NULL;
1016	}
1017
1018    /* Coerce object */
1019    if (PyString_Check(obj)) {
1020	    s = PyString_AS_STRING(obj);
1021	    len = PyString_GET_SIZE(obj);
1022	    }
1023    else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1024	/* Overwrite the error message with something more useful in
1025	   case of a TypeError. */
1026	if (PyErr_ExceptionMatches(PyExc_TypeError))
1027	PyErr_Format(PyExc_TypeError,
1028			 "coercing to Unicode: need string or buffer, "
1029			 "%.80s found",
1030		     Py_Type(obj)->tp_name);
1031	goto onError;
1032    }
1033
1034    /* Convert to Unicode */
1035    if (len == 0) {
1036	Py_INCREF(unicode_empty);
1037	v = (PyObject *)unicode_empty;
1038    }
1039    else
1040	v = PyUnicode_Decode(s, len, encoding, errors);
1041
1042    return v;
1043
1044 onError:
1045    return NULL;
1046}
1047
1048PyObject *PyUnicode_Decode(const char *s,
1049			   Py_ssize_t size,
1050			   const char *encoding,
1051			   const char *errors)
1052{
1053    PyObject *buffer = NULL, *unicode;
1054
1055    if (encoding == NULL)
1056	encoding = PyUnicode_GetDefaultEncoding();
1057
1058    /* Shortcuts for common default encodings */
1059    if (strcmp(encoding, "utf-8") == 0)
1060        return PyUnicode_DecodeUTF8(s, size, errors);
1061    else if (strcmp(encoding, "latin-1") == 0)
1062        return PyUnicode_DecodeLatin1(s, size, errors);
1063#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1064    else if (strcmp(encoding, "mbcs") == 0)
1065        return PyUnicode_DecodeMBCS(s, size, errors);
1066#endif
1067    else if (strcmp(encoding, "ascii") == 0)
1068        return PyUnicode_DecodeASCII(s, size, errors);
1069
1070    /* Decode via the codec registry */
1071    buffer = PyBuffer_FromMemory((void *)s, size);
1072    if (buffer == NULL)
1073        goto onError;
1074    unicode = PyCodec_Decode(buffer, encoding, errors);
1075    if (unicode == NULL)
1076        goto onError;
1077    if (!PyUnicode_Check(unicode)) {
1078        PyErr_Format(PyExc_TypeError,
1079                     "decoder did not return an unicode object (type=%.400s)",
1080                     Py_Type(unicode)->tp_name);
1081        Py_DECREF(unicode);
1082        goto onError;
1083    }
1084    Py_DECREF(buffer);
1085    return unicode;
1086
1087 onError:
1088    Py_XDECREF(buffer);
1089    return NULL;
1090}
1091
1092PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1093                                    const char *encoding,
1094                                    const char *errors)
1095{
1096    PyObject *v;
1097
1098    if (!PyUnicode_Check(unicode)) {
1099        PyErr_BadArgument();
1100        goto onError;
1101    }
1102
1103    if (encoding == NULL)
1104	encoding = PyUnicode_GetDefaultEncoding();
1105
1106    /* Decode via the codec registry */
1107    v = PyCodec_Decode(unicode, encoding, errors);
1108    if (v == NULL)
1109        goto onError;
1110    return v;
1111
1112 onError:
1113    return NULL;
1114}
1115
1116PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1117			   Py_ssize_t size,
1118			   const char *encoding,
1119			   const char *errors)
1120{
1121    PyObject *v, *unicode;
1122
1123    unicode = PyUnicode_FromUnicode(s, size);
1124    if (unicode == NULL)
1125	return NULL;
1126    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1127    Py_DECREF(unicode);
1128    return v;
1129}
1130
1131PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1132                                    const char *encoding,
1133                                    const char *errors)
1134{
1135    PyObject *v;
1136
1137    if (!PyUnicode_Check(unicode)) {
1138        PyErr_BadArgument();
1139        goto onError;
1140    }
1141
1142    if (encoding == NULL)
1143	encoding = PyUnicode_GetDefaultEncoding();
1144
1145    /* Encode via the codec registry */
1146    v = PyCodec_Encode(unicode, encoding, errors);
1147    if (v == NULL)
1148        goto onError;
1149    return v;
1150
1151 onError:
1152    return NULL;
1153}
1154
1155PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1156                                    const char *encoding,
1157                                    const char *errors)
1158{
1159    PyObject *v;
1160
1161    if (!PyUnicode_Check(unicode)) {
1162        PyErr_BadArgument();
1163        goto onError;
1164    }
1165
1166    if (encoding == NULL)
1167	encoding = PyUnicode_GetDefaultEncoding();
1168
1169    /* Shortcuts for common default encodings */
1170    if (errors == NULL) {
1171	if (strcmp(encoding, "utf-8") == 0)
1172	    return PyUnicode_AsUTF8String(unicode);
1173	else if (strcmp(encoding, "latin-1") == 0)
1174	    return PyUnicode_AsLatin1String(unicode);
1175#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1176	else if (strcmp(encoding, "mbcs") == 0)
1177	    return PyUnicode_AsMBCSString(unicode);
1178#endif
1179	else if (strcmp(encoding, "ascii") == 0)
1180	    return PyUnicode_AsASCIIString(unicode);
1181    }
1182
1183    /* Encode via the codec registry */
1184    v = PyCodec_Encode(unicode, encoding, errors);
1185    if (v == NULL)
1186        goto onError;
1187    if (!PyBytes_Check(v)) {
1188        if (PyString_Check(v)) {
1189            /* Old codec, turn it into bytes */
1190            PyObject *b = PyBytes_FromObject(v);
1191            Py_DECREF(v);
1192            return b;
1193        }
1194        PyErr_Format(PyExc_TypeError,
1195                     "encoder did not return a bytes object "
1196                     "(type=%.400s, encoding=%.20s, errors=%.20s)",
1197                     v->ob_type->tp_name,
1198                     encoding ? encoding : "NULL",
1199                     errors ? errors : "NULL");
1200        Py_DECREF(v);
1201        goto onError;
1202    }
1203    return v;
1204
1205 onError:
1206    return NULL;
1207}
1208
1209PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1210					    const char *errors)
1211{
1212    PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1213    PyObject *b;
1214    if (v)
1215        return v;
1216    if (errors != NULL)
1217        Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
1218    b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1219                             PyUnicode_GET_SIZE(unicode),
1220                             NULL);
1221    if (!b)
1222        return NULL;
1223    v = PyString_FromStringAndSize(PyBytes_AsString(b),
1224                                   PyBytes_Size(b));
1225    Py_DECREF(b);
1226    ((PyUnicodeObject *)unicode)->defenc = v;
1227    return v;
1228}
1229
1230char*
1231PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
1232{
1233    PyObject *str8;
1234    if (!PyUnicode_Check(unicode)) {
1235        PyErr_BadArgument();
1236        return NULL;
1237    }
1238    str8 = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1239    if (str8 == NULL)
1240        return NULL;
1241    if (psize != NULL)
1242        *psize = PyString_GET_SIZE(str8);
1243    return PyString_AS_STRING(str8);
1244}
1245
1246char*
1247PyUnicode_AsString(PyObject *unicode)
1248{
1249    return PyUnicode_AsStringAndSize(unicode, NULL);
1250}
1251
1252Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1253{
1254    if (!PyUnicode_Check(unicode)) {
1255        PyErr_BadArgument();
1256        goto onError;
1257    }
1258    return PyUnicode_AS_UNICODE(unicode);
1259
1260 onError:
1261    return NULL;
1262}
1263
1264Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1265{
1266    if (!PyUnicode_Check(unicode)) {
1267        PyErr_BadArgument();
1268        goto onError;
1269    }
1270    return PyUnicode_GET_SIZE(unicode);
1271
1272 onError:
1273    return -1;
1274}
1275
1276const char *PyUnicode_GetDefaultEncoding(void)
1277{
1278    return unicode_default_encoding;
1279}
1280
1281int PyUnicode_SetDefaultEncoding(const char *encoding)
1282{
1283    if (strcmp(encoding, unicode_default_encoding) != 0) {
1284        PyErr_Format(PyExc_ValueError,
1285                     "Can only set default encoding to %s",
1286                     unicode_default_encoding);
1287        return -1;
1288    }
1289    return 0;
1290}
1291
1292/* error handling callback helper:
1293   build arguments, call the callback and check the arguments,
1294   if no exception occurred, copy the replacement to the output
1295   and adjust various state variables.
1296   return 0 on success, -1 on error
1297*/
1298
1299static
1300int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1301                 const char *encoding, const char *reason,
1302                 const char **input, const char **inend, Py_ssize_t *startinpos,
1303                 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1304                 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
1305{
1306    static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
1307
1308    PyObject *restuple = NULL;
1309    PyObject *repunicode = NULL;
1310    Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1311    Py_ssize_t insize;
1312    Py_ssize_t requiredsize;
1313    Py_ssize_t newpos;
1314    Py_UNICODE *repptr;
1315    PyObject *inputobj = NULL;
1316    Py_ssize_t repsize;
1317    int res = -1;
1318
1319    if (*errorHandler == NULL) {
1320	*errorHandler = PyCodec_LookupError(errors);
1321	if (*errorHandler == NULL)
1322	   goto onError;
1323    }
1324
1325    if (*exceptionObject == NULL) {
1326    	*exceptionObject = PyUnicodeDecodeError_Create(
1327	    encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
1328	if (*exceptionObject == NULL)
1329	   goto onError;
1330    }
1331    else {
1332	if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1333	    goto onError;
1334	if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1335	    goto onError;
1336	if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1337	    goto onError;
1338    }
1339
1340    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1341    if (restuple == NULL)
1342	goto onError;
1343    if (!PyTuple_Check(restuple)) {
1344	PyErr_Format(PyExc_TypeError, &argparse[4]);
1345	goto onError;
1346    }
1347    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1348	goto onError;
1349
1350    /* Copy back the bytes variables, which might have been modified by the
1351       callback */
1352    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1353    if (!inputobj)
1354        goto onError;
1355    if (!PyBytes_Check(inputobj)) {
1356	PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1357    }
1358    *input = PyBytes_AS_STRING(inputobj);
1359    insize = PyBytes_GET_SIZE(inputobj);
1360    *inend = *input + insize;
1361    /* we can DECREF safely, as the exception has another reference,
1362       so the object won't go away. */
1363    Py_DECREF(inputobj);
1364
1365    if (newpos<0)
1366	newpos = insize+newpos;
1367    if (newpos<0 || newpos>insize) {
1368	PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1369	goto onError;
1370    }
1371
1372    /* need more space? (at least enough for what we
1373       have+the replacement+the rest of the string (starting
1374       at the new input position), so we won't have to check space
1375       when there are no errors in the rest of the string) */
1376    repptr = PyUnicode_AS_UNICODE(repunicode);
1377    repsize = PyUnicode_GET_SIZE(repunicode);
1378    requiredsize = *outpos + repsize + insize-newpos;
1379    if (requiredsize > outsize) {
1380	if (requiredsize<2*outsize)
1381	    requiredsize = 2*outsize;
1382	if (PyUnicode_Resize(output, requiredsize) < 0)
1383	    goto onError;
1384	*outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1385    }
1386    *endinpos = newpos;
1387    *inptr = *input + newpos;
1388    Py_UNICODE_COPY(*outptr, repptr, repsize);
1389    *outptr += repsize;
1390    *outpos += repsize;
1391
1392    /* we made it! */
1393    res = 0;
1394
1395    onError:
1396    Py_XDECREF(restuple);
1397    return res;
1398}
1399
1400/* --- UTF-7 Codec -------------------------------------------------------- */
1401
1402/* see RFC2152 for details */
1403
/* Classification of the 128 ASCII characters for the UTF-7 codec.  The
   table is only ever read, so it is declared const. */
static
const char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
1422
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true  */

/* True if character c must be base64-encoded.  encodeO / encodeWS select
   whether the optional "Set O" and whitespace classes count as special. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* Map the low 6 bits of n to the corresponding base64 digit. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is one of the 64 base64 digits.  Explicit ASCII ranges are used
   instead of isalnum(): c may be a Py_UNICODE value outside the range
   isalnum() is defined for, and isalnum() depends on the current locale. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* Inverse of B64(): value (0..63) of base64 digit c.  Only meaningful when
   B64CHAR(c) is true. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 digits from the bit accumulator ch while at least 6 of its
   `bits` pending bits remain; advances out and decrements bits. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* Emit 16-bit code units from the bit accumulator ch while at least 16 of
   its `bits` pending bits remain.  Relies on the variable `errmsg` and the
   label `utf7Error` being available at the point of expansion. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
1465
1466PyObject *PyUnicode_DecodeUTF7(const char *s,
1467			       Py_ssize_t size,
1468			       const char *errors)
1469{
1470    const char *starts = s;
1471    Py_ssize_t startinpos;
1472    Py_ssize_t endinpos;
1473    Py_ssize_t outpos;
1474    const char *e;
1475    PyUnicodeObject *unicode;
1476    Py_UNICODE *p;
1477    const char *errmsg = "";
1478    int inShift = 0;
1479    unsigned int bitsleft = 0;
1480    unsigned long charsleft = 0;
1481    int surrogate = 0;
1482    PyObject *errorHandler = NULL;
1483    PyObject *exc = NULL;
1484
1485    unicode = _PyUnicode_New(size);
1486    if (!unicode)
1487        return NULL;
1488    if (size == 0)
1489        return (PyObject *)unicode;
1490
1491    p = unicode->str;
1492    e = s + size;
1493
1494    while (s < e) {
1495        Py_UNICODE ch;
1496        restart:
1497        ch = *s;
1498
1499        if (inShift) {
1500            if ((ch == '-') || !B64CHAR(ch)) {
1501                inShift = 0;
1502                s++;
1503
1504                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1505                if (bitsleft >= 6) {
1506                    /* The shift sequence has a partial character in it. If
1507                       bitsleft < 6 then we could just classify it as padding
1508                       but that is not the case here */
1509
1510                    errmsg = "partial character in shift sequence";
1511                    goto utf7Error;
1512                }
1513                /* According to RFC2152 the remaining bits should be zero. We
1514                   choose to signal an error/insert a replacement character
1515                   here so indicate the potential of a misencoded character. */
1516
1517                /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1518                if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1519                    errmsg = "non-zero padding bits in shift sequence";
1520                    goto utf7Error;
1521                }
1522
1523                if (ch == '-') {
1524                    if ((s < e) && (*(s) == '-')) {
1525                        *p++ = '-';
1526                        inShift = 1;
1527                    }
1528                } else if (SPECIAL(ch,0,0)) {
1529                    errmsg = "unexpected special character";
1530	                goto utf7Error;
1531                } else  {
1532                    *p++ = ch;
1533                }
1534            } else {
1535                charsleft = (charsleft << 6) | UB64(ch);
1536                bitsleft += 6;
1537                s++;
1538                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1539            }
1540        }
1541        else if ( ch == '+' ) {
1542            startinpos = s-starts;
1543            s++;
1544            if (s < e && *s == '-') {
1545                s++;
1546                *p++ = '+';
1547            } else
1548            {
1549                inShift = 1;
1550                bitsleft = 0;
1551            }
1552        }
1553        else if (SPECIAL(ch,0,0)) {
1554            startinpos = s-starts;
1555            errmsg = "unexpected special character";
1556            s++;
1557            goto utf7Error;
1558        }
1559        else {
1560            *p++ = ch;
1561            s++;
1562        }
1563        continue;
1564    utf7Error:
1565        outpos = p-PyUnicode_AS_UNICODE(unicode);
1566        endinpos = s-starts;
1567        if (unicode_decode_call_errorhandler(
1568             errors, &errorHandler,
1569             "utf7", errmsg,
1570             &starts, &e, &startinpos, &endinpos, &exc, &s,
1571             (PyObject **)&unicode, &outpos, &p))
1572        goto onError;
1573    }
1574
1575    if (inShift) {
1576        outpos = p-PyUnicode_AS_UNICODE(unicode);
1577        endinpos = size;
1578        if (unicode_decode_call_errorhandler(
1579             errors, &errorHandler,
1580             "utf7", "unterminated shift sequence",
1581             &starts, &e, &startinpos, &endinpos, &exc, &s,
1582             (PyObject **)&unicode, &outpos, &p))
1583            goto onError;
1584        if (s < e)
1585           goto restart;
1586    }
1587
1588    if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
1589        goto onError;
1590
1591    Py_XDECREF(errorHandler);
1592    Py_XDECREF(exc);
1593    return (PyObject *)unicode;
1594
1595onError:
1596    Py_XDECREF(errorHandler);
1597    Py_XDECREF(exc);
1598    Py_DECREF(unicode);
1599    return NULL;
1600}
1601
1602
1603PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1604                   Py_ssize_t size,
1605                   int encodeSetO,
1606                   int encodeWhiteSpace,
1607                   const char *errors)
1608{
1609    PyObject *v;
1610    /* It might be possible to tighten this worst case */
1611    Py_ssize_t cbAllocated = 5 * size;
1612    int inShift = 0;
1613    Py_ssize_t i = 0;
1614    unsigned int bitsleft = 0;
1615    unsigned long charsleft = 0;
1616    char * out;
1617    char * start;
1618
1619    if (size == 0)
1620	return PyBytes_FromStringAndSize(NULL, 0);
1621
1622    v = PyBytes_FromStringAndSize(NULL, cbAllocated);
1623    if (v == NULL)
1624        return NULL;
1625
1626    start = out = PyBytes_AS_STRING(v);
1627    for (;i < size; ++i) {
1628        Py_UNICODE ch = s[i];
1629
1630        if (!inShift) {
1631            if (ch == '+') {
1632                *out++ = '+';
1633                *out++ = '-';
1634            } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1635                charsleft = ch;
1636                bitsleft = 16;
1637                *out++ = '+';
1638                /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1639                inShift = bitsleft > 0;
1640            } else {
1641                *out++ = (char) ch;
1642            }
1643        } else {
1644            if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1645                *out++ = B64(charsleft << (6-bitsleft));
1646                charsleft = 0;
1647                bitsleft = 0;
1648                /* Characters not in the BASE64 set implicitly unshift the sequence
1649                   so no '-' is required, except if the character is itself a '-' */
1650                if (B64CHAR(ch) || ch == '-') {
1651                    *out++ = '-';
1652                }
1653                inShift = 0;
1654                *out++ = (char) ch;
1655            } else {
1656                bitsleft += 16;
1657                charsleft = (charsleft << 16) | ch;
1658                /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1659
1660                /* If the next character is special then we dont' need to terminate
1661                   the shift sequence. If the next character is not a BASE64 character
1662                   or '-' then the shift sequence will be terminated implicitly and we
1663                   don't have to insert a '-'. */
1664
1665                if (bitsleft == 0) {
1666                    if (i + 1 < size) {
1667                        Py_UNICODE ch2 = s[i+1];
1668
1669                        if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
1670
1671                        } else if (B64CHAR(ch2) || ch2 == '-') {
1672                            *out++ = '-';
1673                            inShift = 0;
1674                        } else {
1675                            inShift = 0;
1676                        }
1677
1678                    }
1679                    else {
1680                        *out++ = '-';
1681                        inShift = 0;
1682                    }
1683                }
1684            }
1685        }
1686    }
1687    if (bitsleft) {
1688        *out++= B64(charsleft << (6-bitsleft) );
1689        *out++ = '-';
1690    }
1691
1692    if (PyBytes_Resize(v, out - start)) {
1693        Py_DECREF(v);
1694        return NULL;
1695    }
1696    return v;
1697}
1698
1699#undef SPECIAL
1700#undef B64
1701#undef B64CHAR
1702#undef UB64
1703#undef ENCODE
1704#undef DECODE
1705
1706/* --- UTF-8 Codec -------------------------------------------------------- */
1707
/* Sequence length for every possible first byte of a UTF-8 sequence.  The
   table is only ever read, so it is declared const. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1729
1730PyObject *PyUnicode_DecodeUTF8(const char *s,
1731			       Py_ssize_t size,
1732			       const char *errors)
1733{
1734    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1735}
1736
1737PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1738			                Py_ssize_t size,
1739			                const char *errors,
1740			                Py_ssize_t *consumed)
1741{
1742    const char *starts = s;
1743    int n;
1744    Py_ssize_t startinpos;
1745    Py_ssize_t endinpos;
1746    Py_ssize_t outpos;
1747    const char *e;
1748    PyUnicodeObject *unicode;
1749    Py_UNICODE *p;
1750    const char *errmsg = "";
1751    PyObject *errorHandler = NULL;
1752    PyObject *exc = NULL;
1753
1754    /* Note: size will always be longer than the resulting Unicode
1755       character count */
1756    unicode = _PyUnicode_New(size);
1757    if (!unicode)
1758        return NULL;
1759    if (size == 0) {
1760        if (consumed)
1761            *consumed = 0;
1762        return (PyObject *)unicode;
1763    }
1764
1765    /* Unpack UTF-8 encoded data */
1766    p = unicode->str;
1767    e = s + size;
1768
1769    while (s < e) {
1770        Py_UCS4 ch = (unsigned char)*s;
1771
1772        if (ch < 0x80) {
1773            *p++ = (Py_UNICODE)ch;
1774            s++;
1775            continue;
1776        }
1777
1778        n = utf8_code_length[ch];
1779
1780        if (s + n > e) {
1781	    if (consumed)
1782		break;
1783	    else {
1784		errmsg = "unexpected end of data";
1785		startinpos = s-starts;
1786		endinpos = size;
1787		goto utf8Error;
1788	    }
1789	}
1790
1791        switch (n) {
1792
1793        case 0:
1794            errmsg = "unexpected code byte";
1795	    startinpos = s-starts;
1796	    endinpos = startinpos+1;
1797	    goto utf8Error;
1798
1799        case 1:
1800            errmsg = "internal error";
1801	    startinpos = s-starts;
1802	    endinpos = startinpos+1;
1803	    goto utf8Error;
1804
1805        case 2:
1806            if ((s[1] & 0xc0) != 0x80) {
1807                errmsg = "invalid data";
1808		startinpos = s-starts;
1809		endinpos = startinpos+2;
1810		goto utf8Error;
1811	    }
1812            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1813            if (ch < 0x80) {
1814		startinpos = s-starts;
1815		endinpos = startinpos+2;
1816                errmsg = "illegal encoding";
1817		goto utf8Error;
1818	    }
1819	    else
1820		*p++ = (Py_UNICODE)ch;
1821            break;
1822
1823        case 3:
1824            if ((s[1] & 0xc0) != 0x80 ||
1825                (s[2] & 0xc0) != 0x80) {
1826                errmsg = "invalid data";
1827		startinpos = s-starts;
1828		endinpos = startinpos+3;
1829		goto utf8Error;
1830	    }
1831            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
1832            if (ch < 0x0800) {
1833		/* Note: UTF-8 encodings of surrogates are considered
1834		   legal UTF-8 sequences;
1835
1836		   XXX For wide builds (UCS-4) we should probably try
1837		       to recombine the surrogates into a single code
1838		       unit.
1839		*/
1840                errmsg = "illegal encoding";
1841		startinpos = s-starts;
1842		endinpos = startinpos+3;
1843		goto utf8Error;
1844	    }
1845	    else
1846		*p++ = (Py_UNICODE)ch;
1847            break;
1848
1849        case 4:
1850            if ((s[1] & 0xc0) != 0x80 ||
1851                (s[2] & 0xc0) != 0x80 ||
1852                (s[3] & 0xc0) != 0x80) {
1853                errmsg = "invalid data";
1854		startinpos = s-starts;
1855		endinpos = startinpos+4;
1856		goto utf8Error;
1857	    }
1858            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1859                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1860            /* validate and convert to UTF-16 */
1861            if ((ch < 0x10000)        /* minimum value allowed for 4
1862					 byte encoding */
1863                || (ch > 0x10ffff))   /* maximum value allowed for
1864					 UTF-16 */
1865	    {
1866                errmsg = "illegal encoding";
1867		startinpos = s-starts;
1868		endinpos = startinpos+4;
1869		goto utf8Error;
1870	    }
1871#ifdef Py_UNICODE_WIDE
1872	    *p++ = (Py_UNICODE)ch;
1873#else
1874            /*  compute and append the two surrogates: */
1875
1876            /*  translate from 10000..10FFFF to 0..FFFF */
1877            ch -= 0x10000;
1878
1879            /*  high surrogate = top 10 bits added to D800 */
1880            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1881
1882            /*  low surrogate = bottom 10 bits added to DC00 */
1883            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
1884#endif
1885            break;
1886
1887        default:
1888            /* Other sizes are only needed for UCS-4 */
1889            errmsg = "unsupported Unicode code range";
1890	    startinpos = s-starts;
1891	    endinpos = startinpos+n;
1892	    goto utf8Error;
1893        }
1894        s += n;
1895	continue;
1896
1897    utf8Error:
1898    outpos = p-PyUnicode_AS_UNICODE(unicode);
1899    if (unicode_decode_call_errorhandler(
1900	     errors, &errorHandler,
1901	     "utf8", errmsg,
1902	     &starts, &e, &startinpos, &endinpos, &exc, &s,
1903	     (PyObject **)&unicode, &outpos, &p))
1904	goto onError;
1905    }
1906    if (consumed)
1907	*consumed = s-starts;
1908
1909    /* Adjust length */
1910    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
1911        goto onError;
1912
1913    Py_XDECREF(errorHandler);
1914    Py_XDECREF(exc);
1915    return (PyObject *)unicode;
1916
1917onError:
1918    Py_XDECREF(errorHandler);
1919    Py_XDECREF(exc);
1920    Py_DECREF(unicode);
1921    return NULL;
1922}
1923
1924/* Allocation strategy:  if the string is short, convert into a stack buffer
1925   and allocate exactly as much space needed at the end.  Else allocate the
1926   maximum possible needed (4 result bytes per Unicode character), and return
1927   the excess memory at the end.
1928*/
1929PyObject *
1930PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1931		     Py_ssize_t size,
1932		     const char *errors)
1933{
1934#define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
1935
1936    Py_ssize_t i;           /* index into s of next input byte */
1937    PyObject *v;        /* result string object */
1938    char *p;            /* next free byte in output buffer */
1939    Py_ssize_t nallocated;  /* number of result bytes allocated */
1940    Py_ssize_t nneeded;        /* number of result bytes needed */
1941    char stackbuf[MAX_SHORT_UNICHARS * 4];
1942
1943    assert(s != NULL);
1944    assert(size >= 0);
1945
1946    if (size <= MAX_SHORT_UNICHARS) {
1947        /* Write into the stack buffer; nallocated can't overflow.
1948         * At the end, we'll allocate exactly as much heap space as it
1949         * turns out we need.
1950         */
1951        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1952        v = NULL;   /* will allocate after we're done */
1953        p = stackbuf;
1954    }
1955    else {
1956        /* Overallocate on the heap, and give the excess back at the end. */
1957        nallocated = size * 4;
1958        if (nallocated / 4 != size)  /* overflow! */
1959            return PyErr_NoMemory();
1960        v = PyBytes_FromStringAndSize(NULL, nallocated);
1961        if (v == NULL)
1962            return NULL;
1963        p = PyBytes_AS_STRING(v);
1964    }
1965
1966    for (i = 0; i < size;) {
1967        Py_UCS4 ch = s[i++];
1968
1969        if (ch < 0x80)
1970            /* Encode ASCII */
1971            *p++ = (char) ch;
1972
1973        else if (ch < 0x0800) {
1974            /* Encode Latin-1 */
1975            *p++ = (char)(0xc0 | (ch >> 6));
1976            *p++ = (char)(0x80 | (ch & 0x3f));
1977        }
1978        else {
1979            /* Encode UCS2 Unicode ordinals */
1980            if (ch < 0x10000) {
1981                /* Special case: check for high surrogate */
1982                if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1983                    Py_UCS4 ch2 = s[i];
1984                    /* Check for low surrogate and combine the two to
1985                       form a UCS4 value */
1986                    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1987                        ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
1988                        i++;
1989                        goto encodeUCS4;
1990                    }
1991                    /* Fall through: handles isolated high surrogates */
1992                }
1993                *p++ = (char)(0xe0 | (ch >> 12));
1994                *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1995                *p++ = (char)(0x80 | (ch & 0x3f));
1996                continue;
1997    	    }
1998encodeUCS4:
1999            /* Encode UCS4 Unicode ordinals */
2000            *p++ = (char)(0xf0 | (ch >> 18));
2001            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2002            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2003            *p++ = (char)(0x80 | (ch & 0x3f));
2004        }
2005    }
2006
2007    if (v == NULL) {
2008        /* This was stack allocated. */
2009        nneeded = p - stackbuf;
2010        assert(nneeded <= nallocated);
2011        v = PyBytes_FromStringAndSize(stackbuf, nneeded);
2012    }
2013    else {
2014    	/* Cut back to size actually needed. */
2015        nneeded = p - PyBytes_AS_STRING(v);
2016        assert(nneeded <= nallocated);
2017        PyBytes_Resize(v, nneeded);
2018    }
2019    return v;
2020
2021#undef MAX_SHORT_UNICHARS
2022}
2023
2024PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2025{
2026    if (!PyUnicode_Check(unicode)) {
2027        PyErr_BadArgument();
2028        return NULL;
2029    }
2030    return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2031				PyUnicode_GET_SIZE(unicode),
2032				NULL);
2033}
2034
2035/* --- UTF-32 Codec ------------------------------------------------------- */
2036
2037PyObject *
2038PyUnicode_DecodeUTF32(const char *s,
2039		      Py_ssize_t size,
2040		      const char *errors,
2041		      int *byteorder)
2042{
2043    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2044}
2045
2046PyObject *
2047PyUnicode_DecodeUTF32Stateful(const char *s,
2048			      Py_ssize_t size,
2049			      const char *errors,
2050			      int *byteorder,
2051			      Py_ssize_t *consumed)
2052{
2053    const char *starts = s;
2054    Py_ssize_t startinpos;
2055    Py_ssize_t endinpos;
2056    Py_ssize_t outpos;
2057    PyUnicodeObject *unicode;
2058    Py_UNICODE *p;
2059#ifndef Py_UNICODE_WIDE
2060    int i, pairs;
2061#else
2062    const int pairs = 0;
2063#endif
2064    const unsigned char *q, *e;
2065    int bo = 0;       /* assume native ordering by default */
2066    const char *errmsg = "";
2067    /* Offsets from q for retrieving bytes in the right order. */
2068#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2069    int iorder[] = {0, 1, 2, 3};
2070#else
2071    int iorder[] = {3, 2, 1, 0};
2072#endif
2073    PyObject *errorHandler = NULL;
2074    PyObject *exc = NULL;
2075    /* On narrow builds we split characters outside the BMP into two
2076       codepoints => count how much extra space we need. */
2077#ifndef Py_UNICODE_WIDE
2078    for (i = pairs = 0; i < size/4; i++)
2079	if (((Py_UCS4 *)s)[i] >= 0x10000)
2080	    pairs++;
2081#endif
2082
2083    /* This might be one to much, because of a BOM */
2084    unicode = _PyUnicode_New((size+3)/4+pairs);
2085    if (!unicode)
2086        return NULL;
2087    if (size == 0)
2088        return (PyObject *)unicode;
2089
2090    /* Unpack UTF-32 encoded data */
2091    p = unicode->str;
2092    q = (unsigned char *)s;
2093    e = q + size;
2094
2095    if (byteorder)
2096        bo = *byteorder;
2097
2098    /* Check for BOM marks (U+FEFF) in the input and adjust current
2099       byte order setting accordingly. In native mode, the leading BOM
2100       mark is skipped, in all other modes, it is copied to the output
2101       stream as-is (giving a ZWNBSP character). */
2102    if (bo == 0) {
2103        if (size >= 4) {
2104            const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2105                                (q[iorder[1]] << 8) | q[iorder[0]];
2106#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2107	    if (bom == 0x0000FEFF) {
2108		q += 4;
2109		bo = -1;
2110	    }
2111	    else if (bom == 0xFFFE0000) {
2112		q += 4;
2113		bo = 1;
2114	    }
2115#else
2116	    if (bom == 0x0000FEFF) {
2117		q += 4;
2118		bo = 1;
2119	    }
2120	    else if (bom == 0xFFFE0000) {
2121		q += 4;
2122		bo = -1;
2123	    }
2124#endif
2125	}
2126    }
2127
2128    if (bo == -1) {
2129        /* force LE */
2130        iorder[0] = 0;
2131        iorder[1] = 1;
2132        iorder[2] = 2;
2133        iorder[3] = 3;
2134    }
2135    else if (bo == 1) {
2136        /* force BE */
2137        iorder[0] = 3;
2138        iorder[1] = 2;
2139        iorder[2] = 1;
2140        iorder[3] = 0;
2141    }
2142
2143    while (q < e) {
2144	Py_UCS4 ch;
2145	/* remaining bytes at the end? (size should be divisible by 4) */
2146	if (e-q<4) {
2147	    if (consumed)
2148		break;
2149	    errmsg = "truncated data";
2150	    startinpos = ((const char *)q)-starts;
2151	    endinpos = ((const char *)e)-starts;
2152	    goto utf32Error;
2153	    /* The remaining input chars are ignored if the callback
2154	       chooses to skip the input */
2155	}
2156	ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2157	     (q[iorder[1]] << 8) | q[iorder[0]];
2158
2159	if (ch >= 0x110000)
2160	{
2161	    errmsg = "codepoint not in range(0x110000)";
2162	    startinpos = ((const char *)q)-starts;
2163	    endinpos = startinpos+4;
2164	    goto utf32Error;
2165	}
2166#ifndef Py_UNICODE_WIDE
2167	if (ch >= 0x10000)
2168	{
2169	    *p++ = 0xD800 | ((ch-0x10000) >> 10);
2170	    *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2171	}
2172	else
2173#endif
2174	    *p++ = ch;
2175	q += 4;
2176	continue;
2177    utf32Error:
2178	outpos = p-PyUnicode_AS_UNICODE(unicode);
2179	if (unicode_decode_call_errorhandler(
2180	         errors, &errorHandler,
2181	         "utf32", errmsg,
2182	         &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2183	         (PyObject **)&unicode, &outpos, &p))
2184	    goto onError;
2185    }
2186
2187    if (byteorder)
2188        *byteorder = bo;
2189
2190    if (consumed)
2191	*consumed = (const char *)q-starts;
2192
2193    /* Adjust length */
2194    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2195        goto onError;
2196
2197    Py_XDECREF(errorHandler);
2198    Py_XDECREF(exc);
2199    return (PyObject *)unicode;
2200
2201onError:
2202    Py_DECREF(unicode);
2203    Py_XDECREF(errorHandler);
2204    Py_XDECREF(exc);
2205    return NULL;
2206}
2207
2208PyObject *
2209PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2210		      Py_ssize_t size,
2211		      const char *errors,
2212		      int byteorder)
2213{
2214    PyObject *v;
2215    unsigned char *p;
2216#ifndef Py_UNICODE_WIDE
2217    int i, pairs;
2218#else
2219    const int pairs = 0;
2220#endif
2221    /* Offsets from p for storing byte pairs in the right order. */
2222#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2223    int iorder[] = {0, 1, 2, 3};
2224#else
2225    int iorder[] = {3, 2, 1, 0};
2226#endif
2227
2228#define STORECHAR(CH)                       \
2229    do {                                    \
2230        p[iorder[3]] = ((CH) >> 24) & 0xff; \
2231        p[iorder[2]] = ((CH) >> 16) & 0xff; \
2232        p[iorder[1]] = ((CH) >> 8) & 0xff;  \
2233        p[iorder[0]] = (CH) & 0xff;         \
2234        p += 4;                             \
2235    } while(0)
2236
2237    /* In narrow builds we can output surrogate pairs as one codepoint,
2238       so we need less space. */
2239#ifndef Py_UNICODE_WIDE
2240    for (i = pairs = 0; i < size-1; i++)
2241	if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2242	    0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2243	    pairs++;
2244#endif
2245    v = PyBytes_FromStringAndSize(NULL,
2246		  4 * (size - pairs + (byteorder == 0)));
2247    if (v == NULL)
2248        return NULL;
2249
2250    p = (unsigned char *)PyBytes_AS_STRING(v);
2251    if (byteorder == 0)
2252	STORECHAR(0xFEFF);
2253    if (size == 0)
2254        return v;
2255
2256    if (byteorder == -1) {
2257        /* force LE */
2258        iorder[0] = 0;
2259        iorder[1] = 1;
2260        iorder[2] = 2;
2261        iorder[3] = 3;
2262    }
2263    else if (byteorder == 1) {
2264        /* force BE */
2265        iorder[0] = 3;
2266        iorder[1] = 2;
2267        iorder[2] = 1;
2268        iorder[3] = 0;
2269    }
2270
2271    while (size-- > 0) {
2272	Py_UCS4 ch = *s++;
2273#ifndef Py_UNICODE_WIDE
2274	if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2275	    Py_UCS4 ch2 = *s;
2276	    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2277		ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2278		s++;
2279		size--;
2280	    }
2281	}
2282#endif
2283        STORECHAR(ch);
2284    }
2285    return v;
2286#undef STORECHAR
2287}
2288
2289PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2290{
2291    if (!PyUnicode_Check(unicode)) {
2292        PyErr_BadArgument();
2293        return NULL;
2294    }
2295    return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2296				 PyUnicode_GET_SIZE(unicode),
2297				 NULL,
2298				 0);
2299}
2300
2301/* --- UTF-16 Codec ------------------------------------------------------- */
2302
2303PyObject *
2304PyUnicode_DecodeUTF16(const char *s,
2305		      Py_ssize_t size,
2306		      const char *errors,
2307		      int *byteorder)
2308{
2309    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2310}
2311
2312PyObject *
2313PyUnicode_DecodeUTF16Stateful(const char *s,
2314			      Py_ssize_t size,
2315			      const char *errors,
2316			      int *byteorder,
2317			      Py_ssize_t *consumed)
2318{
2319    const char *starts = s;
2320    Py_ssize_t startinpos;
2321    Py_ssize_t endinpos;
2322    Py_ssize_t outpos;
2323    PyUnicodeObject *unicode;
2324    Py_UNICODE *p;
2325    const unsigned char *q, *e;
2326    int bo = 0;       /* assume native ordering by default */
2327    const char *errmsg = "";
2328    /* Offsets from q for retrieving byte pairs in the right order. */
2329#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2330    int ihi = 1, ilo = 0;
2331#else
2332    int ihi = 0, ilo = 1;
2333#endif
2334    PyObject *errorHandler = NULL;
2335    PyObject *exc = NULL;
2336
2337    /* Note: size will always be longer than the resulting Unicode
2338       character count */
2339    unicode = _PyUnicode_New(size);
2340    if (!unicode)
2341        return NULL;
2342    if (size == 0)
2343        return (PyObject *)unicode;
2344
2345    /* Unpack UTF-16 encoded data */
2346    p = unicode->str;
2347    q = (unsigned char *)s;
2348    e = q + size;
2349
2350    if (byteorder)
2351        bo = *byteorder;
2352
2353    /* Check for BOM marks (U+FEFF) in the input and adjust current
2354       byte order setting accordingly. In native mode, the leading BOM
2355       mark is skipped, in all other modes, it is copied to the output
2356       stream as-is (giving a ZWNBSP character). */
2357    if (bo == 0) {
2358        if (size >= 2) {
2359            const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
2360#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2361	    if (bom == 0xFEFF) {
2362		q += 2;
2363		bo = -1;
2364	    }
2365	    else if (bom == 0xFFFE) {
2366		q += 2;
2367		bo = 1;
2368	    }
2369#else
2370	    if (bom == 0xFEFF) {
2371		q += 2;
2372		bo = 1;
2373	    }
2374	    else if (bom == 0xFFFE) {
2375		q += 2;
2376		bo = -1;
2377	    }
2378#endif
2379	}
2380    }
2381
2382    if (bo == -1) {
2383        /* force LE */
2384        ihi = 1;
2385        ilo = 0;
2386    }
2387    else if (bo == 1) {
2388        /* force BE */
2389        ihi = 0;
2390        ilo = 1;
2391    }
2392
2393    while (q < e) {
2394	Py_UNICODE ch;
2395	/* remaining bytes at the end? (size should be even) */
2396	if (e-q<2) {
2397	    if (consumed)
2398		break;
2399	    errmsg = "truncated data";
2400	    startinpos = ((const char *)q)-starts;
2401	    endinpos = ((const char *)e)-starts;
2402	    goto utf16Error;
2403	    /* The remaining input chars are ignored if the callback
2404	       chooses to skip the input */
2405	}
2406	ch = (q[ihi] << 8) | q[ilo];
2407
2408	q += 2;
2409
2410	if (ch < 0xD800 || ch > 0xDFFF) {
2411	    *p++ = ch;
2412	    continue;
2413	}
2414
2415	/* UTF-16 code pair: */
2416	if (q >= e) {
2417	    errmsg = "unexpected end of data";
2418	    startinpos = (((const char *)q)-2)-starts;
2419	    endinpos = ((const char *)e)-starts;
2420	    goto utf16Error;
2421	}
2422	if (0xD800 <= ch && ch <= 0xDBFF) {
2423	    Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2424	    q += 2;
2425	    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2426#ifndef Py_UNICODE_WIDE
2427		*p++ = ch;
2428		*p++ = ch2;
2429#else
2430		*p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2431#endif
2432		continue;
2433	    }
2434	    else {
2435                errmsg = "illegal UTF-16 surrogate";
2436		startinpos = (((const char *)q)-4)-starts;
2437		endinpos = startinpos+2;
2438		goto utf16Error;
2439	    }
2440
2441	}
2442	errmsg = "illegal encoding";
2443	startinpos = (((const char *)q)-2)-starts;
2444	endinpos = startinpos+2;
2445	/* Fall through to report the error */
2446
2447    utf16Error:
2448	outpos = p-PyUnicode_AS_UNICODE(unicode);
2449	if (unicode_decode_call_errorhandler(
2450	         errors, &errorHandler,
2451	         "utf16", errmsg,
2452	         &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2453	         (PyObject **)&unicode, &outpos, &p))
2454	    goto onError;
2455    }
2456
2457    if (byteorder)
2458        *byteorder = bo;
2459
2460    if (consumed)
2461	*consumed = (const char *)q-starts;
2462
2463    /* Adjust length */
2464    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2465        goto onError;
2466
2467    Py_XDECREF(errorHandler);
2468    Py_XDECREF(exc);
2469    return (PyObject *)unicode;
2470
2471onError:
2472    Py_DECREF(unicode);
2473    Py_XDECREF(errorHandler);
2474    Py_XDECREF(exc);
2475    return NULL;
2476}
2477
2478PyObject *
2479PyUnicode_EncodeUTF16(const Py_UNICODE *s,
2480		      Py_ssize_t size,
2481		      const char *errors,
2482		      int byteorder)
2483{
2484    PyObject *v;
2485    unsigned char *p;
2486#ifdef Py_UNICODE_WIDE
2487    int i, pairs;
2488#else
2489    const int pairs = 0;
2490#endif
2491    /* Offsets from p for storing byte pairs in the right order. */
2492#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2493    int ihi = 1, ilo = 0;
2494#else
2495    int ihi = 0, ilo = 1;
2496#endif
2497
2498#define STORECHAR(CH)                   \
2499    do {                                \
2500        p[ihi] = ((CH) >> 8) & 0xff;    \
2501        p[ilo] = (CH) & 0xff;           \
2502        p += 2;                         \
2503    } while(0)
2504
2505#ifdef Py_UNICODE_WIDE
2506    for (i = pairs = 0; i < size; i++)
2507	if (s[i] >= 0x10000)
2508	    pairs++;
2509#endif
2510    v = PyBytes_FromStringAndSize(NULL,
2511		  2 * (size + pairs + (byteorder == 0)));
2512    if (v == NULL)
2513        return NULL;
2514
2515    p = (unsigned char *)PyBytes_AS_STRING(v);
2516    if (byteorder == 0)
2517	STORECHAR(0xFEFF);
2518    if (size == 0)
2519        return v;
2520
2521    if (byteorder == -1) {
2522        /* force LE */
2523        ihi = 1;
2524        ilo = 0;
2525    }
2526    else if (byteorder == 1) {
2527        /* force BE */
2528        ihi = 0;
2529        ilo = 1;
2530    }
2531
2532    while (size-- > 0) {
2533	Py_UNICODE ch = *s++;
2534	Py_UNICODE ch2 = 0;
2535#ifdef Py_UNICODE_WIDE
2536	if (ch >= 0x10000) {
2537	    ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2538	    ch  = 0xD800 | ((ch-0x10000) >> 10);
2539	}
2540#endif
2541        STORECHAR(ch);
2542        if (ch2)
2543            STORECHAR(ch2);
2544    }
2545    return v;
2546#undef STORECHAR
2547}
2548
2549PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2550{
2551    if (!PyUnicode_Check(unicode)) {
2552        PyErr_BadArgument();
2553        return NULL;
2554    }
2555    return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2556				 PyUnicode_GET_SIZE(unicode),
2557				 NULL,
2558				 0);
2559}
2560
2561/* --- Unicode Escape Codec ----------------------------------------------- */
2562
2563static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
2564
2565PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
2566					Py_ssize_t size,
2567					const char *errors)
2568{
2569    const char *starts = s;
2570    Py_ssize_t startinpos;
2571    Py_ssize_t endinpos;
2572    Py_ssize_t outpos;
2573    int i;
2574    PyUnicodeObject *v;
2575    Py_UNICODE *p;
2576    const char *end;
2577    char* message;
2578    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
2579    PyObject *errorHandler = NULL;
2580    PyObject *exc = NULL;
2581
2582    /* Escaped strings will always be longer than the resulting
2583       Unicode string, so we start with size here and then reduce the
2584       length after conversion to the true value.
2585       (but if the error callback returns a long replacement string
2586       we'll have to allocate more space) */
2587    v = _PyUnicode_New(size);
2588    if (v == NULL)
2589        goto onError;
2590    if (size == 0)
2591        return (PyObject *)v;
2592
2593    p = PyUnicode_AS_UNICODE(v);
2594    end = s + size;
2595
2596    while (s < end) {
2597        unsigned char c;
2598        Py_UNICODE x;
2599        int digits;
2600
2601        /* Non-escape characters are interpreted as Unicode ordinals */
2602        if (*s != '\\') {
2603            *p++ = (unsigned char) *s++;
2604            continue;
2605        }
2606
2607        startinpos = s-starts;
2608        /* \ - Escapes */
2609        s++;
2610        switch (*s++) {
2611
2612        /* \x escapes */
2613        case '\n': break;
2614        case '\\': *p++ = '\\'; break;
2615        case '\'': *p++ = '\''; break;
2616        case '\"': *p++ = '\"'; break;
2617        case 'b': *p++ = '\b'; break;
2618        case 'f': *p++ = '\014'; break; /* FF */
2619        case 't': *p++ = '\t'; break;
2620        case 'n': *p++ = '\n'; break;
2621        case 'r': *p++ = '\r'; break;
2622        case 'v': *p++ = '\013'; break; /* VT */
2623        case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2624
2625        /* \OOO (octal) escapes */
2626        case '0': case '1': case '2': case '3':
2627        case '4': case '5': case '6': case '7':
2628            x = s[-1] - '0';
2629            if ('0' <= *s && *s <= '7') {
2630                x = (x<<3) + *s++ - '0';
2631                if ('0' <= *s && *s <= '7')
2632                    x = (x<<3) + *s++ - '0';
2633            }
2634            *p++ = x;
2635            break;
2636
2637        /* hex escapes */
2638        /* \xXX */
2639        case 'x':
2640            digits = 2;
2641            message = "truncated \\xXX escape";
2642            goto hexescape;
2643
2644        /* \uXXXX */
2645        case 'u':
2646            digits = 4;
2647            message = "truncated \\uXXXX escape";
2648            goto hexescape;
2649
2650        /* \UXXXXXXXX */
2651        case 'U':
2652            digits = 8;
2653            message = "truncated \\UXXXXXXXX escape";
2654        hexescape:
2655            chr = 0;
2656            outpos = p-PyUnicode_AS_UNICODE(v);
2657            if (s+digits>end) {
2658                endinpos = size;
2659                if (unicode_decode_call_errorhandler(
2660                    errors, &errorHandler,
2661                    "unicodeescape", "end of string in escape sequence",
2662                    &starts, &end, &startinpos, &endinpos, &exc, &s,
2663                    (PyObject **)&v, &outpos, &p))
2664                    goto onError;
2665                goto nextByte;
2666            }
2667            for (i = 0; i < digits; ++i) {
2668                c = (unsigned char) s[i];
2669                if (!isxdigit(c)) {
2670                    endinpos = (s+i+1)-starts;
2671                    if (unicode_decode_call_errorhandler(
2672                        errors, &errorHandler,
2673                        "unicodeescape", message,
2674                        &starts, &end, &startinpos, &endinpos, &exc, &s,
2675                        (PyObject **)&v, &outpos, &p))
2676                        goto onError;
2677                    goto nextByte;
2678                }
2679                chr = (chr<<4) & ~0xF;
2680                if (c >= '0' && c <= '9')
2681                    chr += c - '0';
2682                else if (c >= 'a' && c <= 'f')
2683                    chr += 10 + c - 'a';
2684                else
2685                    chr += 10 + c - 'A';
2686            }
2687            s += i;
2688            if (chr == 0xffffffff && PyErr_Occurred())
2689                /* _decoding_error will have already written into the
2690                   target buffer. */
2691                break;
2692        store:
2693            /* when we get here, chr is a 32-bit unicode character */
2694            if (chr <= 0xffff)
2695                /* UCS-2 character */
2696                *p++ = (Py_UNICODE) chr;
2697            else if (chr <= 0x10ffff) {
2698                /* UCS-4 character. Either store directly, or as
2699                   surrogate pair. */
2700#ifdef Py_UNICODE_WIDE
2701                *p++ = chr;
2702#else
2703                chr -= 0x10000L;
2704                *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
2705                *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
2706#endif
2707            } else {
2708                endinpos = s-starts;
2709                outpos = p-PyUnicode_AS_UNICODE(v);
2710                if (unicode_decode_call_errorhandler(
2711                    errors, &errorHandler,
2712                    "unicodeescape", "illegal Unicode character",
2713                    &starts, &end, &startinpos, &endinpos, &exc, &s,
2714                    (PyObject **)&v, &outpos, &p))
2715                    goto onError;
2716            }
2717            break;
2718
2719        /* \N{name} */
2720        case 'N':
2721            message = "malformed \\N character escape";
2722            if (ucnhash_CAPI == NULL) {
2723                /* load the unicode data module */
2724                PyObject *m, *api;
2725                m = PyImport_ImportModule("unicodedata");
2726                if (m == NULL)
2727                    goto ucnhashError;
2728                api = PyObject_GetAttrString(m, "ucnhash_CAPI");
2729                Py_DECREF(m);
2730                if (api == NULL)
2731                    goto ucnhashError;
2732                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
2733                Py_DECREF(api);
2734                if (ucnhash_CAPI == NULL)
2735                    goto ucnhashError;
2736            }
2737            if (*s == '{') {
2738                const char *start = s+1;
2739                /* look for the closing brace */
2740                while (*s != '}' && s < end)
2741                    s++;
2742                if (s > start && s < end && *s == '}') {
2743                    /* found a name.  look it up in the unicode database */
2744                    message = "unknown Unicode character name";
2745                    s++;
2746                    if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
2747                        goto store;
2748                }
2749            }
2750            endinpos = s-starts;
2751            outpos = p-PyUnicode_AS_UNICODE(v);
2752            if (unicode_decode_call_errorhandler(
2753                errors, &errorHandler,
2754                "unicodeescape", message,
2755                &starts, &end, &startinpos, &endinpos, &exc, &s,
2756                (PyObject **)&v, &outpos, &p))
2757                goto onError;
2758            break;
2759
2760        default:
2761            if (s > end) {
2762                message = "\\ at end of string";
2763                s--;
2764                endinpos = s-starts;
2765                outpos = p-PyUnicode_AS_UNICODE(v);
2766                if (unicode_decode_call_errorhandler(
2767                    errors, &errorHandler,
2768                    "unicodeescape", message,
2769                    &starts, &end, &startinpos, &endinpos, &exc, &s,
2770                    (PyObject **)&v, &outpos, &p))
2771                    goto onError;
2772            }
2773            else {
2774                *p++ = '\\';
2775                *p++ = (unsigned char)s[-1];
2776            }
2777            break;
2778        }
2779        nextByte:
2780        ;
2781    }
2782    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2783        goto onError;
2784    Py_XDECREF(errorHandler);
2785    Py_XDECREF(exc);
2786    return (PyObject *)v;
2787
2788ucnhashError:
2789    PyErr_SetString(
2790        PyExc_UnicodeError,
2791        "\\N escapes not supported (can't load unicodedata module)"
2792        );
2793    Py_XDECREF(v);
2794    Py_XDECREF(errorHandler);
2795    Py_XDECREF(exc);
2796    return NULL;
2797
2798onError:
2799    Py_XDECREF(v);
2800    Py_XDECREF(errorHandler);
2801    Py_XDECREF(exc);
2802    return NULL;
2803}
2804
2805/* Return a Unicode-Escape string version of the Unicode object.
2806
2807   If quotes is true, the string is enclosed in u"" or u'' quotes as
2808   appropriate.
2809
2810*/
2811
2812Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2813                                      Py_ssize_t size,
2814                                      Py_UNICODE ch)
2815{
2816    /* like wcschr, but doesn't stop at NULL characters */
2817
2818    while (size-- > 0) {
2819        if (*s == ch)
2820            return s;
2821        s++;
2822    }
2823
2824    return NULL;
2825}
2826
2827static const char *hexdigits = "0123456789abcdef";
2828
2829PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2830					Py_ssize_t size)
2831{
2832    PyObject *repr;
2833    char *p;
2834
2835    /* XXX(nnorwitz): rather than over-allocating, it would be
2836       better to choose a different scheme.  Perhaps scan the
2837       first N-chars of the string and allocate based on that size.
2838    */
2839    /* Initial allocation is based on the longest-possible unichr
2840       escape.
2841
2842       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2843       unichr, so in this case it's the longest unichr escape. In
2844       narrow (UTF-16) builds this is five chars per source unichr
2845       since there are two unichrs in the surrogate pair, so in narrow
2846       (UTF-16) builds it's not the longest unichr escape.
2847
2848       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2849       so in the narrow (UTF-16) build case it's the longest unichr
2850       escape.
2851    */
2852
2853    repr = PyBytes_FromStringAndSize(NULL,
2854#ifdef Py_UNICODE_WIDE
2855        + 10*size
2856#else
2857        + 6*size
2858#endif
2859        + 1);
2860    if (repr == NULL)
2861        return NULL;
2862
2863    p = PyBytes_AS_STRING(repr);
2864
2865    while (size-- > 0) {
2866        Py_UNICODE ch = *s++;
2867
2868        /* Escape backslashes */
2869        if (ch == '\\') {
2870            *p++ = '\\';
2871            *p++ = (char) ch;
2872            continue;
2873        }
2874
2875#ifdef Py_UNICODE_WIDE
2876        /* Map 21-bit characters to '\U00xxxxxx' */
2877        else if (ch >= 0x10000) {
2878            *p++ = '\\';
2879            *p++ = 'U';
2880            *p++ = hexdigits[(ch >> 28) & 0x0000000F];
2881            *p++ = hexdigits[(ch >> 24) & 0x0000000F];
2882            *p++ = hexdigits[(ch >> 20) & 0x0000000F];
2883            *p++ = hexdigits[(ch >> 16) & 0x0000000F];
2884            *p++ = hexdigits[(ch >> 12) & 0x0000000F];
2885            *p++ = hexdigits[(ch >> 8) & 0x0000000F];
2886            *p++ = hexdigits[(ch >> 4) & 0x0000000F];
2887            *p++ = hexdigits[ch & 0x0000000F];
2888	    continue;
2889        }
2890#else
2891	/* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
2892	else if (ch >= 0xD800 && ch < 0xDC00) {
2893	    Py_UNICODE ch2;
2894	    Py_UCS4 ucs;
2895
2896	    ch2 = *s++;
2897	    size--;
2898	    if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2899		ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2900		*p++ = '\\';
2901		*p++ = 'U';
2902		*p++ = hexdigits[(ucs >> 28) & 0x0000000F];
2903		*p++ = hexdigits[(ucs >> 24) & 0x0000000F];
2904		*p++ = hexdigits[(ucs >> 20) & 0x0000000F];
2905		*p++ = hexdigits[(ucs >> 16) & 0x0000000F];
2906		*p++ = hexdigits[(ucs >> 12) & 0x0000000F];
2907		*p++ = hexdigits[(ucs >> 8) & 0x0000000F];
2908		*p++ = hexdigits[(ucs >> 4) & 0x0000000F];
2909		*p++ = hexdigits[ucs & 0x0000000F];
2910		continue;
2911	    }
2912	    /* Fall through: isolated surrogates are copied as-is */
2913	    s--;
2914	    size++;
2915	}
2916#endif
2917
2918        /* Map 16-bit characters to '\uxxxx' */
2919        if (ch >= 256) {
2920            *p++ = '\\';
2921            *p++ = 'u';
2922            *p++ = hexdigits[(ch >> 12) & 0x000F];
2923            *p++ = hexdigits[(ch >> 8) & 0x000F];
2924            *p++ = hexdigits[(ch >> 4) & 0x000F];
2925            *p++ = hexdigits[ch & 0x000F];
2926        }
2927
2928        /* Map special whitespace to '\t', \n', '\r' */
2929        else if (ch == '\t') {
2930            *p++ = '\\';
2931            *p++ = 't';
2932        }
2933        else if (ch == '\n') {
2934            *p++ = '\\';
2935            *p++ = 'n';
2936        }
2937        else if (ch == '\r') {
2938            *p++ = '\\';
2939            *p++ = 'r';
2940        }
2941
2942        /* Map non-printable US ASCII to '\xhh' */
2943        else if (ch < ' ' || ch >= 0x7F) {
2944            *p++ = '\\';
2945            *p++ = 'x';
2946            *p++ = hexdigits[(ch >> 4) & 0x000F];
2947            *p++ = hexdigits[ch & 0x000F];
2948        }
2949
2950        /* Copy everything else as-is */
2951        else
2952            *p++ = (char) ch;
2953    }
2954
2955    *p = '\0';
2956    if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
2957        Py_DECREF(repr);
2958        return NULL;
2959    }
2960    return repr;
2961}
2962
2963PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2964{
2965    PyObject *s, *result;
2966    if (!PyUnicode_Check(unicode)) {
2967        PyErr_BadArgument();
2968        return NULL;
2969    }
2970    s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2971                                      PyUnicode_GET_SIZE(unicode));
2972
2973    if (!s)
2974        return NULL;
2975    result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2976                                        PyBytes_GET_SIZE(s));
2977    Py_DECREF(s);
2978    return result;
2979}
2980
2981/* --- Raw Unicode Escape Codec ------------------------------------------- */
2982
2983PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2984					   Py_ssize_t size,
2985					   const char *errors)
2986{
2987    const char *starts = s;
2988    Py_ssize_t startinpos;
2989    Py_ssize_t endinpos;
2990    Py_ssize_t outpos;
2991    PyUnicodeObject *v;
2992    Py_UNICODE *p;
2993    const char *end;
2994    const char *bs;
2995    PyObject *errorHandler = NULL;
2996    PyObject *exc = NULL;
2997
2998    /* Escaped strings will always be longer than the resulting
2999       Unicode string, so we start with size here and then reduce the
3000       length after conversion to the true value. (But decoding error
3001       handler might have to resize the string) */
3002    v = _PyUnicode_New(size);
3003    if (v == NULL)
3004	goto onError;
3005    if (size == 0)
3006	return (PyObject *)v;
3007    p = PyUnicode_AS_UNICODE(v);
3008    end = s + size;
3009    while (s < end) {
3010	unsigned char c;
3011	Py_UCS4 x;
3012	int i;
3013        int count;
3014
3015	/* Non-escape characters are interpreted as Unicode ordinals */
3016	if (*s != '\\') {
3017	    *p++ = (unsigned char)*s++;
3018	    continue;
3019	}
3020	startinpos = s-starts;
3021
3022	/* \u-escapes are only interpreted iff the number of leading
3023	   backslashes if odd */
3024	bs = s;
3025	for (;s < end;) {
3026	    if (*s != '\\')
3027		break;
3028	    *p++ = (unsigned char)*s++;
3029	}
3030	if (((s - bs) & 1) == 0 ||
3031	    s >= end ||
3032	    (*s != 'u' && *s != 'U')) {
3033	    continue;
3034	}
3035	p--;
3036        count = *s=='u' ? 4 : 8;
3037	s++;
3038
3039	/* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3040	outpos = p-PyUnicode_AS_UNICODE(v);
3041	for (x = 0, i = 0; i < count; ++i, ++s) {
3042	    c = (unsigned char)*s;
3043	    if (!isxdigit(c)) {
3044		endinpos = s-starts;
3045		if (unicode_decode_call_errorhandler(
3046		    errors, &errorHandler,
3047		    "rawunicodeescape", "truncated \\uXXXX",
3048		    &starts, &end, &startinpos, &endinpos, &exc, &s,
3049		    (PyObject **)&v, &outpos, &p))
3050		    goto onError;
3051		goto nextByte;
3052	    }
3053	    x = (x<<4) & ~0xF;
3054	    if (c >= '0' && c <= '9')
3055		x += c - '0';
3056	    else if (c >= 'a' && c <= 'f')
3057		x += 10 + c - 'a';
3058	    else
3059		x += 10 + c - 'A';
3060	}
3061#ifndef Py_UNICODE_WIDE
3062        if (x > 0x10000) {
3063            if (unicode_decode_call_errorhandler(
3064                    errors, &errorHandler,
3065                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
3066		    &starts, &end, &startinpos, &endinpos, &exc, &s,
3067		    (PyObject **)&v, &outpos, &p))
3068		    goto onError;
3069        }
3070#endif
3071	*p++ = x;
3072	nextByte:
3073	;
3074    }
3075    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3076	goto onError;
3077    Py_XDECREF(errorHandler);
3078    Py_XDECREF(exc);
3079    return (PyObject *)v;
3080
3081 onError:
3082    Py_XDECREF(v);
3083    Py_XDECREF(errorHandler);
3084    Py_XDECREF(exc);
3085    return NULL;
3086}
3087
3088PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
3089					   Py_ssize_t size)
3090{
3091    PyObject *repr;
3092    char *p;
3093    char *q;
3094
3095#ifdef Py_UNICODE_WIDE
3096    repr = PyBytes_FromStringAndSize(NULL, 10 * size);
3097#else
3098    repr = PyBytes_FromStringAndSize(NULL, 6 * size);
3099#endif
3100    if (repr == NULL)
3101        return NULL;
3102    if (size == 0)
3103	return repr;
3104
3105    p = q = PyBytes_AS_STRING(repr);
3106    while (size-- > 0) {
3107        Py_UNICODE ch = *s++;
3108#ifdef Py_UNICODE_WIDE
3109	/* Map 32-bit characters to '\Uxxxxxxxx' */
3110	if (ch >= 0x10000) {
3111            *p++ = '\\';
3112            *p++ = 'U';
3113            *p++ = hexdigits[(ch >> 28) & 0xf];
3114            *p++ = hexdigits[(ch >> 24) & 0xf];
3115            *p++ = hexdigits[(ch >> 20) & 0xf];
3116            *p++ = hexdigits[(ch >> 16) & 0xf];
3117            *p++ = hexdigits[(ch >> 12) & 0xf];
3118            *p++ = hexdigits[(ch >> 8) & 0xf];
3119            *p++ = hexdigits[(ch >> 4) & 0xf];
3120            *p++ = hexdigits[ch & 15];
3121        }
3122        else
3123#endif
3124	/* Map 16-bit characters to '\uxxxx' */
3125	if (ch >= 256) {
3126            *p++ = '\\';
3127            *p++ = 'u';
3128            *p++ = hexdigits[(ch >> 12) & 0xf];
3129            *p++ = hexdigits[(ch >> 8) & 0xf];
3130            *p++ = hexdigits[(ch >> 4) & 0xf];
3131            *p++ = hexdigits[ch & 15];
3132        }
3133	/* Copy everything else as-is */
3134	else
3135            *p++ = (char) ch;
3136    }
3137    *p = '\0';
3138    if (PyBytes_Resize(repr, p - q)) {
3139        Py_DECREF(repr);
3140        return NULL;
3141    }
3142    return repr;
3143}
3144
3145PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3146{
3147    PyObject *s, *result;
3148    if (!PyUnicode_Check(unicode)) {
3149        PyErr_BadArgument();
3150        return NULL;
3151    }
3152    s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3153                                         PyUnicode_GET_SIZE(unicode));
3154
3155    if (!s)
3156        return NULL;
3157    result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
3158                                        PyBytes_GET_SIZE(s));
3159    Py_DECREF(s);
3160    return result;
3161}
3162
3163/* --- Unicode Internal Codec ------------------------------------------- */
3164
3165PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
3166					   Py_ssize_t size,
3167					   const char *errors)
3168{
3169    const char *starts = s;
3170    Py_ssize_t startinpos;
3171    Py_ssize_t endinpos;
3172    Py_ssize_t outpos;
3173    PyUnicodeObject *v;
3174    Py_UNICODE *p;
3175    const char *end;
3176    const char *reason;
3177    PyObject *errorHandler = NULL;
3178    PyObject *exc = NULL;
3179
3180#ifdef Py_UNICODE_WIDE
3181    Py_UNICODE unimax = PyUnicode_GetMax();
3182#endif
3183
3184    /* XXX overflow detection missing */
3185    v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3186    if (v == NULL)
3187	goto onError;
3188    if (PyUnicode_GetSize((PyObject *)v) == 0)
3189	return (PyObject *)v;
3190    p = PyUnicode_AS_UNICODE(v);
3191    end = s + size;
3192
3193    while (s < end) {
3194        memcpy(p, s, sizeof(Py_UNICODE));
3195        /* We have to sanity check the raw data, otherwise doom looms for
3196           some malformed UCS-4 data. */
3197        if (
3198            #ifdef Py_UNICODE_WIDE
3199            *p > unimax || *p < 0 ||
3200            #endif
3201            end-s < Py_UNICODE_SIZE
3202            )
3203            {
3204            startinpos = s - starts;
3205            if (end-s < Py_UNICODE_SIZE) {
3206                endinpos = end-starts;
3207                reason = "truncated input";
3208            }
3209            else {
3210                endinpos = s - starts + Py_UNICODE_SIZE;
3211                reason = "illegal code point (> 0x10FFFF)";
3212            }
3213            outpos = p - PyUnicode_AS_UNICODE(v);
3214            if (unicode_decode_call_errorhandler(
3215                    errors, &errorHandler,
3216                    "unicode_internal", reason,
3217                    &starts, &end, &startinpos, &endinpos, &exc, &s,
3218                    (PyObject **)&v, &outpos, &p)) {
3219                goto onError;
3220            }
3221        }
3222        else {
3223            p++;
3224            s += Py_UNICODE_SIZE;
3225        }
3226    }
3227
3228    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3229        goto onError;
3230    Py_XDECREF(errorHandler);
3231    Py_XDECREF(exc);
3232    return (PyObject *)v;
3233
3234 onError:
3235    Py_XDECREF(v);
3236    Py_XDECREF(errorHandler);
3237    Py_XDECREF(exc);
3238    return NULL;
3239}
3240
3241/* --- Latin-1 Codec ------------------------------------------------------ */
3242
3243PyObject *PyUnicode_DecodeLatin1(const char *s,
3244				 Py_ssize_t size,
3245				 const char *errors)
3246{
3247    PyUnicodeObject *v;
3248    Py_UNICODE *p;
3249
3250    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3251    if (size == 1) {
3252	Py_UNICODE r = *(unsigned char*)s;
3253	return PyUnicode_FromUnicode(&r, 1);
3254    }
3255
3256    v = _PyUnicode_New(size);
3257    if (v == NULL)
3258	goto onError;
3259    if (size == 0)
3260	return (PyObject *)v;
3261    p = PyUnicode_AS_UNICODE(v);
3262    while (size-- > 0)
3263	*p++ = (unsigned char)*s++;
3264    return (PyObject *)v;
3265
3266 onError:
3267    Py_XDECREF(v);
3268    return NULL;
3269}
3270
3271/* create or adjust a UnicodeEncodeError */
3272static void make_encode_exception(PyObject **exceptionObject,
3273    const char *encoding,
3274    const Py_UNICODE *unicode, Py_ssize_t size,
3275    Py_ssize_t startpos, Py_ssize_t endpos,
3276    const char *reason)
3277{
3278    if (*exceptionObject == NULL) {
3279	*exceptionObject = PyUnicodeEncodeError_Create(
3280	    encoding, unicode, size, startpos, endpos, reason);
3281    }
3282    else {
3283	if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3284	    goto onError;
3285	if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3286	    goto onError;
3287	if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3288	    goto onError;
3289	return;
3290	onError:
3291	Py_DECREF(*exceptionObject);
3292	*exceptionObject = NULL;
3293    }
3294}
3295
3296/* raises a UnicodeEncodeError */
3297static void raise_encode_exception(PyObject **exceptionObject,
3298    const char *encoding,
3299    const Py_UNICODE *unicode, Py_ssize_t size,
3300    Py_ssize_t startpos, Py_ssize_t endpos,
3301    const char *reason)
3302{
3303    make_encode_exception(exceptionObject,
3304	encoding, unicode, size, startpos, endpos, reason);
3305    if (*exceptionObject != NULL)
3306	PyCodec_StrictErrors(*exceptionObject);
3307}
3308
3309/* error handling callback helper:
3310   build arguments, call the callback and check the arguments,
3311   put the result into newpos and return the replacement string, which
3312   has to be freed by the caller */
3313static PyObject *unicode_encode_call_errorhandler(const char *errors,
3314    PyObject **errorHandler,
3315    const char *encoding, const char *reason,
3316    const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3317    Py_ssize_t startpos, Py_ssize_t endpos,
3318    Py_ssize_t *newpos)
3319{
3320    static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
3321
3322    PyObject *restuple;
3323    PyObject *resunicode;
3324
3325    if (*errorHandler == NULL) {
3326	*errorHandler = PyCodec_LookupError(errors);
3327        if (*errorHandler == NULL)
3328	    return NULL;
3329    }
3330
3331    make_encode_exception(exceptionObject,
3332	encoding, unicode, size, startpos, endpos, reason);
3333    if (*exceptionObject == NULL)
3334	return NULL;
3335
3336    restuple = PyObject_CallFunctionObjArgs(
3337	*errorHandler, *exceptionObject, NULL);
3338    if (restuple == NULL)
3339	return NULL;
3340    if (!PyTuple_Check(restuple)) {
3341	PyErr_Format(PyExc_TypeError, &argparse[4]);
3342	Py_DECREF(restuple);
3343	return NULL;
3344    }
3345    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3346	&resunicode, newpos)) {
3347	Py_DECREF(restuple);
3348	return NULL;
3349    }
3350    if (*newpos<0)
3351	*newpos = size+*newpos;
3352    if (*newpos<0 || *newpos>size) {
3353	PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3354	Py_DECREF(restuple);
3355	return NULL;
3356    }
3357    Py_INCREF(resunicode);
3358    Py_DECREF(restuple);
3359    return resunicode;
3360}
3361
3362static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
3363				 Py_ssize_t size,
3364				 const char *errors,
3365				 int limit)
3366{
3367    /* output object */
3368    PyObject *res;
3369    /* pointers to the beginning and end+1 of input */
3370    const Py_UNICODE *startp = p;
3371    const Py_UNICODE *endp = p + size;
3372    /* pointer to the beginning of the unencodable characters */
3373    /* const Py_UNICODE *badp = NULL; */
3374    /* pointer into the output */
3375    char *str;
3376    /* current output position */
3377    Py_ssize_t respos = 0;
3378    Py_ssize_t ressize;
3379    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3380    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3381    PyObject *errorHandler = NULL;
3382    PyObject *exc = NULL;
3383    /* the following variable is used for caching string comparisons
3384     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3385    int known_errorHandler = -1;
3386
3387    /* allocate enough for a simple encoding without
3388       replacements, if we need more, we'll resize */
3389    res = PyBytes_FromStringAndSize(NULL, size);
3390    if (res == NULL)
3391        goto onError;
3392    if (size == 0)
3393	return res;
3394    str = PyBytes_AS_STRING(res);
3395    ressize = size;
3396
3397    while (p<endp) {
3398	Py_UNICODE c = *p;
3399
3400	/* can we encode this? */
3401	if (c<limit) {
3402	    /* no overflow check, because we know that the space is enough */
3403	    *str++ = (char)c;
3404	    ++p;
3405	}
3406	else {
3407	    Py_ssize_t unicodepos = p-startp;
3408	    Py_ssize_t requiredsize;
3409	    PyObject *repunicode;
3410	    Py_ssize_t repsize;
3411	    Py_ssize_t newpos;
3412	    Py_ssize_t respos;
3413	    Py_UNICODE *uni2;
3414	    /* startpos for collecting unencodable chars */
3415	    const Py_UNICODE *collstart = p;
3416	    const Py_UNICODE *collend = p;
3417	    /* find all unecodable characters */
3418	    while ((collend < endp) && ((*collend)>=limit))
3419		++collend;
3420	    /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3421	    if (known_errorHandler==-1) {
3422		if ((errors==NULL) || (!strcmp(errors, "strict")))
3423		    known_errorHandler = 1;
3424		else if (!strcmp(errors, "replace"))
3425		    known_errorHandler = 2;
3426		else if (!strcmp(errors, "ignore"))
3427		    known_errorHandler = 3;
3428		else if (!strcmp(errors, "xmlcharrefreplace"))
3429		    known_errorHandler = 4;
3430		else
3431		    known_errorHandler = 0;
3432	    }
3433	    switch (known_errorHandler) {
3434		case 1: /* strict */
3435		    raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3436		    goto onError;
3437		case 2: /* replace */
3438		    while (collstart++<collend)
3439			*str++ = '?'; /* fall through */
3440		case 3: /* ignore */
3441		    p = collend;
3442		    break;
3443		case 4: /* xmlcharrefreplace */
3444		    respos = str - PyBytes_AS_STRING(res);
3445		    /* determine replacement size (temporarily (mis)uses p) */
3446		    for (p = collstart, repsize = 0; p < collend; ++p) {
3447			if (*p<10)
3448			    repsize += 2+1+1;
3449			else if (*p<100)
3450			    repsize += 2+2+1;
3451			else if (*p<1000)
3452			    repsize += 2+3+1;
3453			else if (*p<10000)
3454			    repsize += 2+4+1;
3455#ifndef Py_UNICODE_WIDE
3456			else
3457			    repsize += 2+5+1;
3458#else
3459			else if (*p<100000)
3460			    repsize += 2+5+1;
3461			else if (*p<1000000)
3462			    repsize += 2+6+1;
3463			else
3464			    repsize += 2+7+1;
3465#endif
3466		    }
3467		    requiredsize = respos+repsize+(endp-collend);
3468		    if (requiredsize > ressize) {
3469			if (requiredsize<2*ressize)
3470			    requiredsize = 2*ressize;
3471			if (PyBytes_Resize(res, requiredsize))
3472			    goto onError;
3473			str = PyBytes_AS_STRING(res) + respos;
3474			ressize = requiredsize;
3475		    }
3476		    /* generate replacement (temporarily (mis)uses p) */
3477		    for (p = collstart; p < collend; ++p) {
3478			str += sprintf(str, "&#%d;", (int)*p);
3479		    }
3480		    p = collend;
3481		    break;
3482		default:
3483		    repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3484			encoding, reason, startp, size, &exc,
3485			collstart-startp, collend-startp, &newpos);
3486		    if (repunicode == NULL)
3487			goto onError;
3488		    /* need more space? (at least enough for what we
3489		       have+the replacement+the rest of the string, so
3490		       we won't have to check space for encodable characters) */
3491		    respos = str - PyBytes_AS_STRING(res);
3492		    repsize = PyUnicode_GET_SIZE(repunicode);
3493		    requiredsize = respos+repsize+(endp-collend);
3494		    if (requiredsize > ressize) {
3495			if (requiredsize<2*ressize)
3496			    requiredsize = 2*ressize;
3497			if (PyBytes_Resize(res, requiredsize)) {
3498			    Py_DECREF(repunicode);
3499			    goto onError;
3500			}
3501			str = PyBytes_AS_STRING(res) + respos;
3502			ressize = requiredsize;
3503		    }
3504		    /* check if there is anything unencodable in the replacement
3505		       and copy it to the output */
3506		    for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3507			c = *uni2;
3508			if (c >= limit) {
3509			    raise_encode_exception(&exc, encoding, startp, size,
3510				unicodepos, unicodepos+1, reason);
3511			    Py_DECREF(repunicode);
3512			    goto onError;
3513			}
3514			*str = (char)c;
3515		    }
3516		    p = startp + newpos;
3517		    Py_DECREF(repunicode);
3518	    }
3519	}
3520    }
3521    /* Resize if we allocated to much */
3522    respos = str - PyBytes_AS_STRING(res);
3523    if (respos<ressize)
3524       /* If this falls res will be NULL */
3525	PyBytes_Resize(res, respos);
3526    Py_XDECREF(errorHandler);
3527    Py_XDECREF(exc);
3528    return res;
3529
3530    onError:
3531    Py_XDECREF(res);
3532    Py_XDECREF(errorHandler);
3533    Py_XDECREF(exc);
3534    return NULL;
3535}
3536
3537PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
3538				 Py_ssize_t size,
3539				 const char *errors)
3540{
3541    return unicode_encode_ucs1(p, size, errors, 256);
3542}
3543
3544PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3545{
3546    if (!PyUnicode_Check(unicode)) {
3547	PyErr_BadArgument();
3548	return NULL;
3549    }
3550    return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3551				  PyUnicode_GET_SIZE(unicode),
3552				  NULL);
3553}
3554
3555/* --- 7-bit ASCII Codec -------------------------------------------------- */
3556
3557PyObject *PyUnicode_DecodeASCII(const char *s,
3558				Py_ssize_t size,
3559				const char *errors)
3560{
3561    const char *starts = s;
3562    PyUnicodeObject *v;
3563    Py_UNICODE *p;
3564    Py_ssize_t startinpos;
3565    Py_ssize_t endinpos;
3566    Py_ssize_t outpos;
3567    const char *e;
3568    PyObject *errorHandler = NULL;
3569    PyObject *exc = NULL;
3570
3571    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3572    if (size == 1 && *(unsigned char*)s < 128) {
3573	Py_UNICODE r = *(unsigned char*)s;
3574	return PyUnicode_FromUnicode(&r, 1);
3575    }
3576
3577    v = _PyUnicode_New(size);
3578    if (v == NULL)
3579	goto onError;
3580    if (size == 0)
3581	return (PyObject *)v;
3582    p = PyUnicode_AS_UNICODE(v);
3583    e = s + size;
3584    while (s < e) {
3585	register unsigned char c = (unsigned char)*s;
3586	if (c < 128) {
3587	    *p++ = c;
3588	    ++s;
3589	}
3590	else {
3591	    startinpos = s-starts;
3592	    endinpos = startinpos + 1;
3593	    outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3594	    if (unicode_decode_call_errorhandler(
3595		 errors, &errorHandler,
3596		 "ascii", "ordinal not in range(128)",
3597		 &starts, &e, &startinpos, &endinpos, &exc, &s,
3598		 (PyObject **)&v, &outpos, &p))
3599		goto onError;
3600	}
3601    }
3602    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
3603	if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3604	    goto onError;
3605    Py_XDECREF(errorHandler);
3606    Py_XDECREF(exc);
3607    return (PyObject *)v;
3608
3609 onError:
3610    Py_XDECREF(v);
3611    Py_XDECREF(errorHandler);
3612    Py_XDECREF(exc);
3613    return NULL;
3614}
3615
3616PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
3617				Py_ssize_t size,
3618				const char *errors)
3619{
3620    return unicode_encode_ucs1(p, size, errors, 128);
3621}
3622
3623PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3624{
3625    if (!PyUnicode_Check(unicode)) {
3626	PyErr_BadArgument();
3627	return NULL;
3628    }
3629    return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3630				 PyUnicode_GET_SIZE(unicode),
3631				 NULL);
3632}
3633
3634#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3635
3636/* --- MBCS codecs for Windows -------------------------------------------- */
3637
3638#if SIZEOF_INT < SIZEOF_SSIZE_T
3639#define NEED_RETRY
3640#endif
3641
3642/* XXX This code is limited to "true" double-byte encodings, as
3643   a) it assumes an incomplete character consists of a single byte, and
3644   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3645      encodings, see IsDBCSLeadByteEx documentation. */
3646
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead byte,
   0 otherwise.

   A byte that IsDBCSLeadByte() accepts is only counted when the character
   preceding it (as found by CharPrev()) does not swallow it: either there
   is no preceding character, the preceding character does not start with a
   lead byte, or the preceding character is a complete two-byte sequence. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3657
3658/*
3659 * Decode MBCS string into unicode object. If 'final' is set, converts
3660 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3661 */
3662static int decode_mbcs(PyUnicodeObject **v,
3663			const char *s, /* MBCS string */
3664			int size, /* sizeof MBCS string */
3665			int final)
3666{
3667    Py_UNICODE *p;
3668    Py_ssize_t n = 0;
3669    int usize = 0;
3670
3671    assert(size >= 0);
3672
3673    /* Skip trailing lead-byte unless 'final' is set */
3674    if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3675	--size;
3676
3677    /* First get the size of the result */
3678    if (size > 0) {
3679	usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3680	if (usize == 0) {
3681	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
3682	    return -1;
3683	}
3684    }
3685
3686    if (*v == NULL) {
3687	/* Create unicode object */
3688	*v = _PyUnicode_New(usize);
3689	if (*v == NULL)
3690	    return -1;
3691    }
3692    else {
3693	/* Extend unicode object */
3694	n = PyUnicode_GET_SIZE(*v);
3695	if (_PyUnicode_Resize(v, n + usize) < 0)
3696	    return -1;
3697    }
3698
3699    /* Do the conversion */
3700    if (size > 0) {
3701	p = PyUnicode_AS_UNICODE(*v) + n;
3702	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3703	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
3704	    return -1;
3705	}
3706    }
3707
3708    return size;
3709}
3710
3711PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3712					Py_ssize_t size,
3713					const char *errors,
3714					Py_ssize_t *consumed)
3715{
3716    PyUnicodeObject *v = NULL;
3717    int done;
3718
3719    if (consumed)
3720	*consumed = 0;
3721
3722#ifdef NEED_RETRY
3723  retry:
3724    if (size > INT_MAX)
3725	done = decode_mbcs(&v, s, INT_MAX, 0);
3726    else
3727#endif
3728	done = decode_mbcs(&v, s, (int)size, !consumed);
3729
3730    if (done < 0) {
3731        Py_XDECREF(v);
3732	return NULL;
3733    }
3734
3735    if (consumed)
3736	*consumed += done;
3737
3738#ifdef NEED_RETRY
3739    if (size > INT_MAX) {
3740	s += done;
3741	size -= done;
3742	goto retry;
3743    }
3744#endif
3745
3746    return (PyObject *)v;
3747}
3748
3749PyObject *PyUnicode_DecodeMBCS(const char *s,
3750				Py_ssize_t size,
3751				const char *errors)
3752{
3753    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3754}
3755
3756/*
3757 * Convert unicode into string object (MBCS).
3758 * Returns 0 if succeed, -1 otherwise.
3759 */
3760static int encode_mbcs(PyObject **repr,
3761			const Py_UNICODE *p, /* unicode */
3762			int size) /* size of unicode */
3763{
3764    int mbcssize = 0;
3765    Py_ssize_t n = 0;
3766
3767    assert(size >= 0);
3768
3769    /* First get the size of the result */
3770    if (size > 0) {
3771	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3772	if (mbcssize == 0) {
3773	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
3774	    return -1;
3775	}
3776    }
3777
3778    if (*repr == NULL) {
3779	/* Create string object */
3780	*repr = PyBytes_FromStringAndSize(NULL, mbcssize);
3781	if (*repr == NULL)
3782	    return -1;
3783    }
3784    else {
3785	/* Extend string object */
3786	n = PyBytes_Size(*repr);
3787	if (PyBytes_Resize(*repr, n + mbcssize) < 0)
3788	    return -1;
3789    }
3790
3791    /* Do the conversion */
3792    if (size > 0) {
3793	char *s = PyBytes_AS_STRING(*repr) + n;
3794	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3795	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
3796	    return -1;
3797	}
3798    }
3799
3800    return 0;
3801}
3802
3803PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
3804				Py_ssize_t size,
3805				const char *errors)
3806{
3807    PyObject *repr = NULL;
3808    int ret;
3809
3810#ifdef NEED_RETRY
3811 retry:
3812    if (size > INT_MAX)
3813	ret = encode_mbcs(&repr, p, INT_MAX);
3814    else
3815#endif
3816	ret = encode_mbcs(&repr, p, (int)size);
3817
3818    if (ret < 0) {
3819	Py_XDECREF(repr);
3820	return NULL;
3821    }
3822
3823#ifdef NEED_RETRY
3824    if (size > INT_MAX) {
3825	p += INT_MAX;
3826	size -= INT_MAX;
3827	goto retry;
3828    }
3829#endif
3830
3831    return repr;
3832}
3833
3834PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3835{
3836    if (!PyUnicode_Check(unicode)) {
3837        PyErr_BadArgument();
3838        return NULL;
3839    }
3840    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3841				PyUnicode_GET_SIZE(unicode),
3842				NULL);
3843}
3844
3845#undef NEED_RETRY
3846
3847#endif /* MS_WINDOWS */
3848
3849/* --- Character Mapping Codec -------------------------------------------- */
3850
/* Decode `size` bytes at `s` through a character mapping.

   mapping == NULL   -> the input is decoded as Latin-1.
   exact unicode str -> each byte indexes into the string; an index beyond
                        the string's length or the character 0xfffe means
                        "undefined".
   any other object  -> mapping[byte] is looked up with PyObject_GetItem();
                        a LookupError or None means "undefined", an integer
                        in range(65536) yields that character, and a unicode
                        object yields zero, one or several characters.
                        Anything else raises TypeError.

   Undefined bytes are passed to the error handler named by `errors`.

   Returns a new unicode object, or NULL with an exception set. */
PyObject *PyUnicode_DecodeCharmap(const char *s,
				  Py_ssize_t size,
				  PyObject *mapping,
				  const char *errors)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *v;
    Py_UNICODE *p;
    /* spare output slots obtained by resizing for 1-n mappings */
    Py_ssize_t extrachars = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    Py_UNICODE *mapstring = NULL;
    Py_ssize_t maplen = 0;

    /* Default to Latin-1 */
    if (mapping == NULL)
	return PyUnicode_DecodeLatin1(s, size, errors);

    /* start with one output character per input byte */
    v = _PyUnicode_New(size);
    if (v == NULL)
	goto onError;
    if (size == 0)
	return (PyObject *)v;
    p = PyUnicode_AS_UNICODE(v);
    e = s + size;
    if (PyUnicode_CheckExact(mapping)) {
	/* fast path: the mapping is a plain unicode string used as a table */
	mapstring = PyUnicode_AS_UNICODE(mapping);
	maplen = PyUnicode_GET_SIZE(mapping);
	while (s < e) {
	    unsigned char ch = *s;
	    Py_UNICODE x = 0xfffe; /* illegal value */

	    /* bytes beyond the end of the table keep the illegal value */
	    if (ch < maplen)
		x = mapstring[ch];

	    if (x == 0xfffe) {
		/* undefined mapping */
		outpos = p-PyUnicode_AS_UNICODE(v);
		startinpos = s-starts;
		endinpos = startinpos+1;
		if (unicode_decode_call_errorhandler(
		     errors, &errorHandler,
		     "charmap", "character maps to <undefined>",
		     &starts, &e, &startinpos, &endinpos, &exc, &s,
		     (PyObject **)&v, &outpos, &p)) {
		    goto onError;
		}
		/* s was passed by address to the handler, so no ++s here */
		continue;
	    }
	    *p++ = x;
	    ++s;
	}
    }
    else {
	/* general path: look every byte up in an arbitrary mapping object */
	while (s < e) {
	    unsigned char ch = *s;
	    PyObject *w, *x;

	    /* Get mapping (char ordinal -> integer, Unicode char or None) */
	    w = PyInt_FromLong((long)ch);
	    if (w == NULL)
		goto onError;
	    x = PyObject_GetItem(mapping, w);
	    Py_DECREF(w);
	    if (x == NULL) {
		if (PyErr_ExceptionMatches(PyExc_LookupError)) {
		    /* No mapping found means: mapping is undefined. */
		    PyErr_Clear();
		    x = Py_None;
		    Py_INCREF(x);
		} else
		    goto onError;
	    }

	    /* Apply mapping */
	    if (PyInt_Check(x)) {
		long value = PyInt_AS_LONG(x);
		if (value < 0 || value > 65535) {
		    PyErr_SetString(PyExc_TypeError,
				    "character mapping must be in range(65536)");
		    Py_DECREF(x);
		    goto onError;
		}
		*p++ = (Py_UNICODE)value;
	    }
	    else if (x == Py_None) {
		/* undefined mapping */
		outpos = p-PyUnicode_AS_UNICODE(v);
		startinpos = s-starts;
		endinpos = startinpos+1;
		if (unicode_decode_call_errorhandler(
		     errors, &errorHandler,
		     "charmap", "character maps to <undefined>",
		     &starts, &e, &startinpos, &endinpos, &exc, &s,
		     (PyObject **)&v, &outpos, &p)) {
		    Py_DECREF(x);
		    goto onError;
		}
		Py_DECREF(x);
		/* s was passed by address to the handler, so no ++s here */
		continue;
	    }
	    else if (PyUnicode_Check(x)) {
		Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);

		if (targetsize == 1)
		    /* 1-1 mapping */
		    *p++ = *PyUnicode_AS_UNICODE(x);

		else if (targetsize > 1) {
		    /* 1-n mapping */
		    if (targetsize > extrachars) {
			/* resize first */
			Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
			/* ask for the shortfall plus 4 * targetsize more */
			Py_ssize_t needed = (targetsize - extrachars) + \
				     (targetsize << 2);
			extrachars += needed;
			/* XXX overflow detection missing */
			if (_PyUnicode_Resize(&v,
					     PyUnicode_GET_SIZE(v) + needed) < 0) {
			    Py_DECREF(x);
			    goto onError;
			}
			/* re-derive p from v after the resize */
			p = PyUnicode_AS_UNICODE(v) + oldpos;
		    }
		    Py_UNICODE_COPY(p,
				    PyUnicode_AS_UNICODE(x),
				    targetsize);
		    p += targetsize;
		    extrachars -= targetsize;
		}
		/* 1-0 mapping: skip the character */
	    }
	    else {
		/* wrong return value */
		PyErr_SetString(PyExc_TypeError,
		      "character mapping must return integer, None or unicode");
		Py_DECREF(x);
		goto onError;
	    }
	    Py_DECREF(x);
	    ++s;
	}
    }
    /* trim the result if fewer characters were written than allocated */
    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
	if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
	    goto onError;
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)v;

 onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
}
4011
4012/* Charmap encoding: the lookup table */
4013
/* Three-level lookup table (trie) built by PyUnicode_BuildEncodingMap().

   The object is allocated with extra room behind the struct:
   16*count2 bytes of level-2 entries followed by 128*count3 bytes of
   level-3 entries, both reached through level23. */
struct encoding_map{
  PyObject_HEAD
  /* indexed by (character >> 11); 0xFF marks an unused slot, any other
     value selects a 16-entry level-2 block */
  unsigned char level1[32];
  /* number of level-2 blocks and of level-3 blocks stored in level23 */
  int count2, count3;
  /* first byte of the variable-length level-2/level-3 area; declared with
     one element, which the size computations subtract again */
  unsigned char level23[1];
};
4020
4021static PyObject*
4022encoding_map_size(PyObject *obj, PyObject* args)
4023{
4024    struct encoding_map *map = (struct encoding_map*)obj;
4025    return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4026                          128*map->count3);
4027}
4028
4029static PyMethodDef encoding_map_methods[] = {
4030	{"size", encoding_map_size, METH_NOARGS,
4031         PyDoc_STR("Return the size (in bytes) of this object") },
4032        { 0 }
4033};
4034
4035static void
4036encoding_map_dealloc(PyObject* o)
4037{
4038	PyObject_FREE(o);
4039}
4040
/* Type object for the lookup tables created by PyUnicode_BuildEncodingMap().
   Only a deallocator and a method table are provided; every other slot is
   left at 0.  The initializer is positional, so the order of the entries
   below must follow the field order of PyTypeObject. */
static PyTypeObject EncodingMapType = {
	PyVarObject_HEAD_INIT(NULL, 0)
        "EncodingMap",          /*tp_name*/
        sizeof(struct encoding_map),   /*tp_basicsize*/
        0,                      /*tp_itemsize*/
        /* methods */
        encoding_map_dealloc,   /*tp_dealloc*/
        0,                      /*tp_print*/
        0,                      /*tp_getattr*/
        0,                      /*tp_setattr*/
        0,                      /*tp_compare*/
        0,                      /*tp_repr*/
        0,                      /*tp_as_number*/
        0,                      /*tp_as_sequence*/
        0,                      /*tp_as_mapping*/
        0,                      /*tp_hash*/
        0,                      /*tp_call*/
        0,                      /*tp_str*/
        0,                      /*tp_getattro*/
        0,                      /*tp_setattro*/
        0,                      /*tp_as_buffer*/
        Py_TPFLAGS_DEFAULT,     /*tp_flags*/
        0,                      /*tp_doc*/
        0,                      /*tp_traverse*/
        0,                      /*tp_clear*/
        0,                      /*tp_richcompare*/
        0,                      /*tp_weaklistoffset*/
        0,                      /*tp_iter*/
        0,                      /*tp_iternext*/
        encoding_map_methods,   /*tp_methods*/
        0,                      /*tp_members*/
        0,                      /*tp_getset*/
        0,                      /*tp_base*/
        0,                      /*tp_dict*/
        0,                      /*tp_descr_get*/
        0,                      /*tp_descr_set*/
        0,                      /*tp_dictoffset*/
        0,                      /*tp_init*/
        0,                      /*tp_alloc*/
        0,                      /*tp_new*/
        0,                      /*tp_free*/
        0,                      /*tp_is_gc*/
};
4084
4085PyObject*
4086PyUnicode_BuildEncodingMap(PyObject* string)
4087{
4088    Py_UNICODE *decode;
4089    PyObject *result;
4090    struct encoding_map *mresult;
4091    int i;
4092    int need_dict = 0;
4093    unsigned char level1[32];
4094    unsigned char level2[512];
4095    unsigned char *mlevel1, *mlevel2, *mlevel3;
4096    int count2 = 0, count3 = 0;
4097
4098    if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4099        PyErr_BadArgument();
4100        return NULL;
4101    }
4102    decode = PyUnicode_AS_UNICODE(string);
4103    memset(level1, 0xFF, sizeof level1);
4104    memset(level2, 0xFF, sizeof level2);
4105
4106    /* If there isn't a one-to-one mapping of NULL to \0,
4107       or if there are non-BMP characters, we need to use
4108       a mapping dictionary. */
4109    if (decode[0] != 0)
4110        need_dict = 1;
4111    for (i = 1; i < 256; i++) {
4112        int l1, l2;
4113        if (decode[i] == 0
4114            #ifdef Py_UNICODE_WIDE
4115            || decode[i] > 0xFFFF
4116            #endif
4117        ) {
4118            need_dict = 1;
4119            break;
4120        }
4121        if (decode[i] == 0xFFFE)
4122            /* unmapped character */
4123            continue;
4124        l1 = decode[i] >> 11;
4125        l2 = decode[i] >> 7;
4126        if (level1[l1] == 0xFF)
4127            level1[l1] = count2++;
4128        if (level2[l2] == 0xFF)
4129            level2[l2] = count3++;
4130    }
4131
4132    if (count2 >= 0xFF || count3 >= 0xFF)
4133        need_dict = 1;
4134
4135    if (need_dict) {
4136        PyObject *result = PyDict_New();
4137        PyObject *key, *value;
4138        if (!result)
4139            return NULL;
4140        for (i = 0; i < 256; i++) {
4141            key = value = NULL;
4142            key = PyInt_FromLong(decode[i]);
4143            value = PyInt_FromLong(i);
4144            if (!key || !value)
4145                goto failed1;
4146            if (PyDict_SetItem(result, key, value) == -1)
4147                goto failed1;
4148            Py_DECREF(key);
4149            Py_DECREF(value);
4150        }
4151        return result;
4152      failed1:
4153        Py_XDECREF(key);
4154        Py_XDECREF(value);
4155        Py_DECREF(result);
4156        return NULL;
4157    }
4158
4159    /* Create a three-level trie */
4160    result = PyObject_MALLOC(sizeof(struct encoding_map) +
4161                             16*count2 + 128*count3 - 1);
4162    if (!result)
4163        return PyErr_NoMemory();
4164    PyObject_Init(result, &EncodingMapType);
4165    mresult = (struct encoding_map*)result;
4166    mresult->count2 = count2;
4167    mresult->count3 = count3;
4168    mlevel1 = mresult->level1;
4169    mlevel2 = mresult->level23;
4170    mlevel3 = mresult->level23 + 16*count2;
4171    memcpy(mlevel1, level1, 32);
4172    memset(mlevel2, 0xFF, 16*count2);
4173    memset(mlevel3, 0, 128*count3);
4174    count3 = 0;
4175    for (i = 1; i < 256; i++) {
4176        int o1, o2, o3, i2, i3;
4177        if (decode[i] == 0xFFFE)
4178            /* unmapped character */
4179            continue;
4180        o1 = decode[i]>>11;
4181        o2 = (decode[i]>>7) & 0xF;
4182        i2 = 16*mlevel1[o1] + o2;
4183        if (mlevel2[i2] == 0xFF)
4184            mlevel2[i2] = count3++;
4185        o3 = decode[i] & 0x7F;
4186        i3 = 128*mlevel2[i2] + o3;
4187        mlevel3[i3] = i;
4188    }
4189    return result;
4190}
4191
4192static int
4193encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4194{
4195    struct encoding_map *map = (struct encoding_map*)mapping;
4196    int l1 = c>>11;
4197    int l2 = (c>>7) & 0xF;
4198    int l3 = c & 0x7F;
4199    int i;
4200
4201#ifdef Py_UNICODE_WIDE
4202    if (c > 0xFFFF) {
4203	return -1;
4204    }
4205#endif
4206    if (c == 0)
4207        return 0;
4208    /* level 1*/
4209    i = map->level1[l1];
4210    if (i == 0xFF) {
4211        return -1;
4212    }
4213    /* level 2*/
4214    i = map->level23[16*i+l2];
4215    if (i == 0xFF) {
4216        return -1;
4217    }
4218    /* level 3 */
4219    i = map->level23[16*map->count2 + 128*i + l3];
4220    if (i == 0) {
4221        return -1;
4222    }
4223    return i;
4224}
4225
4226/* Lookup the character ch in the mapping. If the character
4227   can't be found, Py_None is returned (or NULL, if another
4228   error occurred). */
4229static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
4230{
4231    PyObject *w = PyInt_FromLong((long)c);
4232    PyObject *x;
4233
4234    if (w == NULL)
4235	 return NULL;
4236    x = PyObject_GetItem(mapping, w);
4237    Py_DECREF(w);
4238    if (x == NULL) {
4239	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4240	    /* No mapping found means: mapping is undefined. */
4241	    PyErr_Clear();
4242	    x = Py_None;
4243	    Py_INCREF(x);
4244	    return x;
4245	} else
4246	    return NULL;
4247    }
4248    else if (x == Py_None)
4249	return x;
4250    else if (PyInt_Check(x)) {
4251	long value = PyInt_AS_LONG(x);
4252	if (value < 0 || value > 255) {
4253	    PyErr_SetString(PyExc_TypeError,
4254			     "character mapping must be in range(256)");
4255	    Py_DECREF(x);
4256	    return NULL;
4257	}
4258	return x;
4259    }
4260    else if (PyString_Check(x))
4261	return x;
4262    else {
4263	/* wrong return value */
4264	PyErr_Format(PyExc_TypeError,
4265                "character mapping must return integer, None or str8, not %.400s",
4266                x->ob_type->tp_name);
4267	Py_DECREF(x);
4268	return NULL;
4269    }
4270}
4271
4272static int
4273charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4274{
4275	Py_ssize_t outsize = PyBytes_GET_SIZE(  outobj);
4276	/* exponentially overallocate to minimize reallocations */
4277	if (requiredsize < 2*outsize)
4278	    requiredsize = 2*outsize;
4279	if (PyBytes_Resize(outobj, requiredsize)) {
4280	    Py_DECREF(outobj);
4281	    return -1;
4282	}
4283	return 0;
4284}
4285
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred while encoding */
} charmapencode_result;
4289/* lookup the character, put the result in the output string and adjust
4290   various state variables. Resize the output bytes object if not enough
4291   space is available. Return a new reference to the object that
4292   was put in the output buffer, or Py_None, if the mapping was undefined
4293   (in which case no character was written) or NULL, if a
4294   reallocation error occurred. The caller must decref the result */
4295static
4296charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
4297    PyObject *outobj, Py_ssize_t *outpos)
4298{
4299    PyObject *rep;
4300    char *outstart;
4301    Py_ssize_t outsize = PyBytes_GET_SIZE(outobj);
4302
4303    if (Py_Type(mapping) == &EncodingMapType) {
4304        int res = encoding_map_lookup(c, mapping);
4305	Py_ssize_t requiredsize = *outpos+1;
4306        if (res == -1)
4307            return enc_FAILED;
4308	if (outsize<requiredsize)
4309	    if (charmapencode_resize(outobj, outpos, requiredsize))
4310		return enc_EXCEPTION;
4311        outstart = PyBytes_AS_STRING(outobj);
4312	outstart[(*outpos)++] = (char)res;
4313	return enc_SUCCESS;
4314    }
4315
4316    rep = charmapencode_lookup(c, mapping);
4317    if (rep==NULL)
4318	return enc_EXCEPTION;
4319    else if (rep==Py_None) {
4320	Py_DECREF(rep);
4321	return enc_FAILED;
4322    } else {
4323	if (PyInt_Check(rep)) {
4324	    Py_ssize_t requiredsize = *outpos+1;
4325	    if (outsize<requiredsize)
4326		if (charmapencode_resize(outobj, outpos, requiredsize)) {
4327		    Py_DECREF(rep);
4328		    return enc_EXCEPTION;
4329		}
4330            outstart = PyBytes_AS_STRING(outobj);
4331	    outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4332	}
4333	else {
4334	    const char *repchars = PyString_AS_STRING(rep);
4335	    Py_ssize_t repsize = PyString_GET_SIZE(rep);
4336	    Py_ssize_t requiredsize = *outpos+repsize;
4337	    if (outsize<requiredsize)
4338		if (charmapencode_resize(outobj, outpos, requiredsize)) {
4339		    Py_DECREF(rep);
4340		    return enc_EXCEPTION;
4341		}
4342            outstart = PyBytes_AS_STRING(outobj);
4343	    memcpy(outstart + *outpos, repchars, repsize);
4344	    *outpos += repsize;
4345	}
4346    }
4347    Py_DECREF(rep);
4348    return enc_SUCCESS;
4349}
4350
4351/* handle an error in PyUnicode_EncodeCharmap
4352   Return 0 on success, -1 on error */
4353static
4354int charmap_encoding_error(
4355    const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
4356    PyObject **exceptionObject,
4357    int *known_errorHandler, PyObject **errorHandler, const char *errors,
4358    PyObject *res, Py_ssize_t *respos)
4359{
4360    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4361    Py_ssize_t repsize;
4362    Py_ssize_t newpos;
4363    Py_UNICODE *uni2;
4364    /* startpos for collecting unencodable chars */
4365    Py_ssize_t collstartpos = *inpos;
4366    Py_ssize_t collendpos = *inpos+1;
4367    Py_ssize_t collpos;
4368    char *encoding = "charmap";
4369    char *reason = "character maps to <undefined>";
4370    charmapencode_result x;
4371
4372    /* find all unencodable characters */
4373    while (collendpos < size) {
4374        PyObject *rep;
4375        if (Py_Type(mapping) == &EncodingMapType) {
4376	    int res = encoding_map_lookup(p[collendpos], mapping);
4377	    if (res != -1)
4378		break;
4379	    ++collendpos;
4380	    continue;
4381	}
4382
4383	rep = charmapencode_lookup(p[collendpos], mapping);
4384	if (rep==NULL)
4385	    return -1;
4386	else if (rep!=Py_None) {
4387	    Py_DECREF(rep);
4388	    break;
4389	}
4390	Py_DECREF(rep);
4391	++collendpos;
4392    }
4393    /* cache callback name lookup
4394     * (if not done yet, i.e. it's the first error) */
4395    if (*known_errorHandler==-1) {
4396	if ((errors==NULL) || (!strcmp(errors, "strict")))
4397	    *known_errorHandler = 1;
4398	else if (!strcmp(errors, "replace"))
4399	    *known_errorHandler = 2;
4400	else if (!strcmp(errors, "ignore"))
4401	    *known_errorHandler = 3;
4402	else if (!strcmp(errors, "xmlcharrefreplace"))
4403	    *known_errorHandler = 4;
4404	else
4405	    *known_errorHandler = 0;
4406    }
4407    switch (*known_errorHandler) {
4408	case 1: /* strict */
4409	    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4410	    return -1;
4411	case 2: /* replace */
4412	    for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4413		x = charmapencode_output('?', mapping, res, respos);
4414		if (x==enc_EXCEPTION) {
4415		    return -1;
4416		}
4417		else if (x==enc_FAILED) {
4418		    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4419		    return -1;
4420		}
4421	    }
4422	    /* fall through */
4423	case 3: /* ignore */
4424	    *inpos = collendpos;
4425	    break;
4426	case 4: /* xmlcharrefreplace */
4427	    /* generate replacement (temporarily (mis)uses p) */
4428	    for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4429		char buffer[2+29+1+1];
4430		char *cp;
4431		sprintf(buffer, "&#%d;", (int)p[collpos]);
4432		for (cp = buffer; *cp; ++cp) {
4433		    x = charmapencode_output(*cp, mapping, res, respos);
4434		    if (x==enc_EXCEPTION)
4435			return -1;
4436		    else if (x==enc_FAILED) {
4437			raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4438			return -1;
4439		    }
4440		}
4441	    }
4442	    *inpos = collendpos;
4443	    break;
4444	default:
4445	    repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
4446		encoding, reason, p, size, exceptionObject,
4447		collstartpos, collendpos, &newpos);
4448	    if (repunicode == NULL)
4449		return -1;
4450	    /* generate replacement  */
4451	    repsize = PyUnicode_GET_SIZE(repunicode);
4452	    for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4453		x = charmapencode_output(*uni2, mapping, res, respos);
4454		if (x==enc_EXCEPTION) {
4455		    return -1;
4456		}
4457		else if (x==enc_FAILED) {
4458		    Py_DECREF(repunicode);
4459		    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4460		    return -1;
4461		}
4462	    }
4463	    *inpos = newpos;
4464	    Py_DECREF(repunicode);
4465    }
4466    return 0;
4467}
4468
4469PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
4470				  Py_ssize_t size,
4471				  PyObject *mapping,
4472				  const char *errors)
4473{
4474    /* output object */
4475    PyObject *res = NULL;
4476    /* current input position */
4477    Py_ssize_t inpos = 0;
4478    /* current output position */
4479    Py_ssize_t respos = 0;
4480    PyObject *errorHandler = NULL;
4481    PyObject *exc = NULL;
4482    /* the following variable is used for caching string comparisons
4483     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4484     * 3=ignore, 4=xmlcharrefreplace */
4485    int known_errorHandler = -1;
4486
4487    /* Default to Latin-1 */
4488    if (mapping == NULL)
4489	return PyUnicode_EncodeLatin1(p, size, errors);
4490
4491    /* allocate enough for a simple encoding without
4492       replacements, if we need more, we'll resize */
4493    res = PyBytes_FromStringAndSize(NULL, size);
4494    if (res == NULL)
4495        goto onError;
4496    if (size == 0)
4497	return res;
4498
4499    while (inpos<size) {
4500	/* try to encode it */
4501	charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos);
4502	if (x==enc_EXCEPTION) /* error */
4503	    goto onError;
4504	if (x==enc_FAILED) { /* unencodable character */
4505	    if (charmap_encoding_error(p, size, &inpos, mapping,
4506		&exc,
4507		&known_errorHandler, &errorHandler, errors,
4508		res, &respos)) {
4509		goto onError;
4510	    }
4511	}
4512	else
4513	    /* done with this character => adjust input position */
4514	    ++inpos;
4515    }
4516
4517    /* Resize if we allocated to much */
4518    if (respos<PyBytes_GET_SIZE(res)) {
4519	if (PyBytes_Resize(res, respos))
4520	    goto onError;
4521    }
4522    Py_XDECREF(exc);
4523    Py_XDECREF(errorHandler);
4524    return res;
4525
4526    onError:
4527    Py_XDECREF(res);
4528    Py_XDECREF(exc);
4529    Py_XDECREF(errorHandler);
4530    return NULL;
4531}
4532
4533PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4534				    PyObject *mapping)
4535{
4536    if (!PyUnicode_Check(unicode) || mapping == NULL) {
4537	PyErr_BadArgument();
4538	return NULL;
4539    }
4540    return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4541				   PyUnicode_GET_SIZE(unicode),
4542				   mapping,
4543				   NULL);
4544}
4545
4546/* create or adjust a UnicodeTranslateError */
4547static void make_translate_exception(PyObject **exceptionObject,
4548    const Py_UNICODE *unicode, Py_ssize_t size,
4549    Py_ssize_t startpos, Py_ssize_t endpos,
4550    const char *reason)
4551{
4552    if (*exceptionObject == NULL) {
4553    	*exceptionObject = PyUnicodeTranslateError_Create(
4554	    unicode, size, startpos, endpos, reason);
4555    }
4556    else {
4557	if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4558	    goto onError;
4559	if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4560	    goto onError;
4561	if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4562	    goto onError;
4563	return;
4564	onError:
4565	Py_DECREF(*exceptionObject);
4566	*exceptionObject = NULL;
4567    }
4568}
4569
4570/* raises a UnicodeTranslateError */
4571static void raise_translate_exception(PyObject **exceptionObject,
4572    const Py_UNICODE *unicode, Py_ssize_t size,
4573    Py_ssize_t startpos, Py_ssize_t endpos,
4574    const char *reason)
4575{
4576    make_translate_exception(exceptionObject,
4577	unicode, size, startpos, endpos, reason);
4578    if (*exceptionObject != NULL)
4579	PyCodec_StrictErrors(*exceptionObject);
4580}
4581
4582/* error handling callback helper:
4583   build arguments, call the callback and check the arguments,
4584   put the result into newpos and return the replacement string, which
4585   has to be freed by the caller */
4586static PyObject *unicode_translate_call_errorhandler(const char *errors,
4587    PyObject **errorHandler,
4588    const char *reason,
4589    const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4590    Py_ssize_t startpos, Py_ssize_t endpos,
4591    Py_ssize_t *newpos)
4592{
4593    static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
4594
4595    Py_ssize_t i_newpos;
4596    PyObject *restuple;
4597    PyObject *resunicode;
4598
4599    if (*errorHandler == NULL) {
4600	*errorHandler = PyCodec_LookupError(errors);
4601        if (*errorHandler == NULL)
4602	    return NULL;
4603    }
4604
4605    make_translate_exception(exceptionObject,
4606	unicode, size, startpos, endpos, reason);
4607    if (*exceptionObject == NULL)
4608	return NULL;
4609
4610    restuple = PyObject_CallFunctionObjArgs(
4611	*errorHandler, *exceptionObject, NULL);
4612    if (restuple == NULL)
4613	return NULL;
4614    if (!PyTuple_Check(restuple)) {
4615	PyErr_Format(PyExc_TypeError, &argparse[4]);
4616	Py_DECREF(restuple);
4617	return NULL;
4618    }
4619    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
4620	&resunicode, &i_newpos)) {
4621	Py_DECREF(restuple);
4622	return NULL;
4623    }
4624    if (i_newpos<0)
4625	*newpos = size+i_newpos;
4626    else
4627        *newpos = i_newpos;
4628    if (*newpos<0 || *newpos>size) {
4629	PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4630	Py_DECREF(restuple);
4631	return NULL;
4632    }
4633    Py_INCREF(resunicode);
4634    Py_DECREF(restuple);
4635    return resunicode;
4636}
4637
4638/* Lookup the character ch in the mapping and put the result in result,
4639   which must be decrefed by the caller.
4640   Return 0 on success, -1 on error */
4641static
4642int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4643{
4644    PyObject *w = PyInt_FromLong((long)c);
4645    PyObject *x;
4646
4647    if (w == NULL)
4648	 return -1;
4649    x = PyObject_GetItem(mapping, w);
4650    Py_DECREF(w);
4651    if (x == NULL) {
4652	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4653	    /* No mapping found means: use 1:1 mapping. */
4654	    PyErr_Clear();
4655	    *result = NULL;
4656	    return 0;
4657	} else
4658	    return -1;
4659    }
4660    else if (x == Py_None) {
4661	*result = x;
4662	return 0;
4663    }
4664    else if (PyInt_Check(x)) {
4665	long value = PyInt_AS_LONG(x);
4666	long max = PyUnicode_GetMax();
4667	if (value < 0 || value > max) {
4668	    PyErr_Format(PyExc_TypeError,
4669			     "character mapping must be in range(0x%lx)", max+1);
4670	    Py_DECREF(x);
4671	    return -1;
4672	}
4673	*result = x;
4674	return 0;
4675    }
4676    else if (PyUnicode_Check(x)) {
4677	*result = x;
4678	return 0;
4679    }
4680    else {
4681	/* wrong return value */
4682	PyErr_SetString(PyExc_TypeError,
4683	      "character mapping must return integer, None or unicode");
4684	Py_DECREF(x);
4685	return -1;
4686    }
4687}
4688/* ensure that *outobj is at least requiredsize characters long,
4689if not reallocate and adjust various state variables.
4690Return 0 on success, -1 on error */
4691static
4692int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
4693    Py_ssize_t requiredsize)
4694{
4695    Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
4696    if (requiredsize > oldsize) {
4697	/* remember old output position */
4698	Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4699	/* exponentially overallocate to minimize reallocations */
4700	if (requiredsize < 2 * oldsize)
4701	    requiredsize = 2 * oldsize;
4702	if (_PyUnicode_Resize(outobj, requiredsize) < 0)
4703	    return -1;
4704	*outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
4705    }
4706    return 0;
4707}
4708/* lookup the character, put the result in the output string and adjust
4709   various state variables. Return a new reference to the object that
4710   was put in the output buffer in *result, or Py_None, if the mapping was
4711   undefined (in which case no character was written).
4712   The called must decref result.
4713   Return 0 on success, -1 on error. */
4714static
4715int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
4716    Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4717    PyObject **res)
4718{
4719    if (charmaptranslate_lookup(*curinp, mapping, res))
4720	return -1;
4721    if (*res==NULL) {
4722	/* not found => default to 1:1 mapping */
4723	*(*outp)++ = *curinp;
4724    }
4725    else if (*res==Py_None)
4726	;
4727    else if (PyInt_Check(*res)) {
4728	/* no overflow check, because we know that the space is enough */
4729	*(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4730    }
4731    else if (PyUnicode_Check(*res)) {
4732	Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4733	if (repsize==1) {
4734	    /* no overflow check, because we know that the space is enough */
4735	    *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4736	}
4737	else if (repsize!=0) {
4738	    /* more than one character */
4739	    Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4740		(insize - (curinp-startinp)) +
4741		repsize - 1;
4742	    if (charmaptranslate_makespace(outobj, outp, requiredsize))
4743		return -1;
4744	    memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4745	    *outp += repsize;
4746	}
4747    }
4748    else
4749	return -1;
4750    return 0;
4751}
4752
4753PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
4754				     Py_ssize_t size,
4755				     PyObject *mapping,
4756				     const char *errors)
4757{
4758    /* output object */
4759    PyObject *res = NULL;
4760    /* pointers to the beginning and end+1 of input */
4761    const Py_UNICODE *startp = p;
4762    const Py_UNICODE *endp = p + size;
4763    /* pointer into the output */
4764    Py_UNICODE *str;
4765    /* current output position */
4766    Py_ssize_t respos = 0;
4767    char *reason = "character maps to <undefined>";
4768    PyObject *errorHandler = NULL;
4769    PyObject *exc = NULL;
4770    /* the following variable is used for caching string comparisons
4771     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4772     * 3=ignore, 4=xmlcharrefreplace */
4773    int known_errorHandler = -1;
4774
4775    if (mapping == NULL) {
4776	PyErr_BadArgument();
4777	return NULL;
4778    }
4779
4780    /* allocate enough for a simple 1:1 translation without
4781       replacements, if we need more, we'll resize */
4782    res = PyUnicode_FromUnicode(NULL, size);
4783    if (res == NULL)
4784	goto onError;
4785    if (size == 0)
4786	return res;
4787    str = PyUnicode_AS_UNICODE(res);
4788
4789    while (p<endp) {
4790	/* try to encode it */
4791	PyObject *x = NULL;
4792	if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
4793	    Py_XDECREF(x);
4794	    goto onError;
4795	}
4796	Py_XDECREF(x);
4797	if (x!=Py_None) /* it worked => adjust input pointer */
4798	    ++p;
4799	else { /* untranslatable character */
4800	    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4801	    Py_ssize_t repsize;
4802	    Py_ssize_t newpos;
4803	    Py_UNICODE *uni2;
4804	    /* startpos for collecting untranslatable chars */
4805	    const Py_UNICODE *collstart = p;
4806	    const Py_UNICODE *collend = p+1;
4807	    const Py_UNICODE *coll;
4808
4809	    /* find all untranslatable characters */
4810	    while (collend < endp) {
4811		if (charmaptranslate_lookup(*collend, mapping, &x))
4812		    goto onError;
4813		Py_XDECREF(x);
4814		if (x!=Py_None)
4815		    break;
4816		++collend;
4817	    }
4818	    /* cache callback name lookup
4819	     * (if not done yet, i.e. it's the first error) */
4820	    if (known_errorHandler==-1) {
4821		if ((errors==NULL) || (!strcmp(errors, "strict")))
4822		    known_errorHandler = 1;
4823		else if (!strcmp(errors, "replace"))
4824		    known_errorHandler = 2;
4825		else if (!strcmp(errors, "ignore"))
4826		    known_errorHandler = 3;
4827		else if (!strcmp(errors, "xmlcharrefreplace"))
4828		    known_errorHandler = 4;
4829		else
4830		    known_errorHandler = 0;
4831	    }
4832	    switch (known_errorHandler) {
4833		case 1: /* strict */
4834		    raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4835		    goto onError;
4836		case 2: /* replace */
4837		    /* No need to check for space, this is a 1:1 replacement */
4838		    for (coll = collstart; coll<collend; ++coll)
4839			*str++ = '?';
4840		    /* fall through */
4841		case 3: /* ignore */
4842		    p = collend;
4843		    break;
4844		case 4: /* xmlcharrefreplace */
4845		    /* generate replacement (temporarily (mis)uses p) */
4846		    for (p = collstart; p < collend; ++p) {
4847			char buffer[2+29+1+1];
4848			char *cp;
4849			sprintf(buffer, "&#%d;", (int)*p);
4850			if (charmaptranslate_makespace(&res, &str,
4851			    (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4852			    goto onError;
4853			for (cp = buffer; *cp; ++cp)
4854			    *str++ = *cp;
4855		    }
4856		    p = collend;
4857		    break;
4858		default:
4859		    repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4860			reason, startp, size, &exc,
4861			collstart-startp, collend-startp, &newpos);
4862		    if (repunicode == NULL)
4863			goto onError;
4864		    /* generate replacement  */
4865		    repsize = PyUnicode_GET_SIZE(repunicode);
4866		    if (charmaptranslate_makespace(&res, &str,
4867			(str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4868			Py_DECREF(repunicode);
4869			goto onError;
4870		    }
4871		    for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4872			*str++ = *uni2;
4873		    p = startp + newpos;
4874		    Py_DECREF(repunicode);
4875	    }
4876	}
4877    }
4878    /* Resize if we allocated to much */
4879    respos = str-PyUnicode_AS_UNICODE(res);
4880    if (respos<PyUnicode_GET_SIZE(res)) {
4881	if (_PyUnicode_Resize(&res, respos) < 0)
4882	    goto onError;
4883    }
4884    Py_XDECREF(exc);
4885    Py_XDECREF(errorHandler);
4886    return res;
4887
4888    onError:
4889    Py_XDECREF(res);
4890    Py_XDECREF(exc);
4891    Py_XDECREF(errorHandler);
4892    return NULL;
4893}
4894
4895PyObject *PyUnicode_Translate(PyObject *str,
4896			      PyObject *mapping,
4897			      const char *errors)
4898{
4899    PyObject *result;
4900
4901    str = PyUnicode_FromObject(str);
4902    if (str == NULL)
4903	goto onError;
4904    result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4905					PyUnicode_GET_SIZE(str),
4906					mapping,
4907					errors);
4908    Py_DECREF(str);
4909    return result;
4910
4911 onError:
4912    Py_XDECREF(str);
4913    return NULL;
4914}
4915
4916/* --- Decimal Encoder ---------------------------------------------------- */
4917
4918int PyUnicode_EncodeDecimal(Py_UNICODE *s,
4919			    Py_ssize_t length,
4920			    char *output,
4921			    const char *errors)
4922{
4923    Py_UNICODE *p, *end;
4924    PyObject *errorHandler = NULL;
4925    PyObject *exc = NULL;
4926    const char *encoding = "decimal";
4927    const char *reason = "invalid decimal Unicode string";
4928    /* the following variable is used for caching string comparisons
4929     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4930    int known_errorHandler = -1;
4931
4932    if (output == NULL) {
4933	PyErr_BadArgument();
4934	return -1;
4935    }
4936
4937    p = s;
4938    end = s + length;
4939    while (p < end) {
4940	register Py_UNICODE ch = *p;
4941	int decimal;
4942	PyObject *repunicode;
4943	Py_ssize_t repsize;
4944	Py_ssize_t newpos;
4945	Py_UNICODE *uni2;
4946	Py_UNICODE *collstart;
4947	Py_UNICODE *collend;
4948
4949	if (Py_UNICODE_ISSPACE(ch)) {
4950	    *output++ = ' ';
4951	    ++p;
4952	    continue;
4953	}
4954	decimal = Py_UNICODE_TODECIMAL(ch);
4955	if (decimal >= 0) {
4956	    *output++ = '0' + decimal;
4957	    ++p;
4958	    continue;
4959	}
4960	if (0 < ch && ch < 256) {
4961	    *output++ = (char)ch;
4962	    ++p;
4963	    continue;
4964	}
4965	/* All other characters are considered unencodable */
4966	collstart = p;
4967	collend = p+1;
4968	while (collend < end) {
4969	    if ((0 < *collend && *collend < 256) ||
4970	        !Py_UNICODE_ISSPACE(*collend) ||
4971	        Py_UNICODE_TODECIMAL(*collend))
4972		break;
4973	}
4974	/* cache callback name lookup
4975	 * (if not done yet, i.e. it's the first error) */
4976	if (known_errorHandler==-1) {
4977	    if ((errors==NULL) || (!strcmp(errors, "strict")))
4978		known_errorHandler = 1;
4979	    else if (!strcmp(errors, "replace"))
4980		known_errorHandler = 2;
4981	    else if (!strcmp(errors, "ignore"))
4982		known_errorHandler = 3;
4983	    else if (!strcmp(errors, "xmlcharrefreplace"))
4984		known_errorHandler = 4;
4985	    else
4986		known_errorHandler = 0;
4987	}
4988	switch (known_errorHandler) {
4989	    case 1: /* strict */
4990		raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4991		goto onError;
4992	    case 2: /* replace */
4993		for (p = collstart; p < collend; ++p)
4994		    *output++ = '?';
4995		/* fall through */
4996	    case 3: /* ignore */
4997		p = collend;
4998		break;
4999	    case 4: /* xmlcharrefreplace */
5000		/* generate replacement (temporarily (mis)uses p) */
5001		for (p = collstart; p < collend; ++p)
5002		    output += sprintf(output, "&#%d;", (int)*p);
5003		p = collend;
5004		break;
5005	    default:
5006		repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5007		    encoding, reason, s, length, &exc,
5008		    collstart-s, collend-s, &newpos);
5009		if (repunicode == NULL)
5010		    goto onError;
5011		/* generate replacement  */
5012		repsize = PyUnicode_GET_SIZE(repunicode);
5013		for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5014		    Py_UNICODE ch = *uni2;
5015		    if (Py_UNICODE_ISSPACE(ch))
5016			*output++ = ' ';
5017		    else {
5018			decimal = Py_UNICODE_TODECIMAL(ch);
5019			if (decimal >= 0)
5020			    *output++ = '0' + decimal;
5021			else if (0 < ch && ch < 256)
5022			    *output++ = (char)ch;
5023			else {
5024			    Py_DECREF(repunicode);
5025			    raise_encode_exception(&exc, encoding,
5026				s, length, collstart-s, collend-s, reason);
5027			    goto onError;
5028			}
5029		    }
5030		}
5031		p = s + newpos;
5032		Py_DECREF(repunicode);
5033	}
5034    }
5035    /* 0-terminate the output string */
5036    *output++ = '\0';
5037    Py_XDECREF(exc);
5038    Py_XDECREF(errorHandler);
5039    return 0;
5040
5041 onError:
5042    Py_XDECREF(exc);
5043    Py_XDECREF(errorHandler);
5044    return -1;
5045}
5046
5047/* --- Helpers ------------------------------------------------------------ */
5048
5049#include "stringlib/unicodedefs.h"
5050
5051#include "stringlib/fastsearch.h"
5052
5053#include "stringlib/count.h"
5054#include "stringlib/find.h"
5055#include "stringlib/partition.h"
5056
/* Helper macro to fix up start/end slice values.

   Operates on two variables named `start` and `end` that must exist in
   the scope where the macro is used; obj must be a pointer to
   something with a `length` member.

   - a negative start has length added to it and is then clamped to 0;
   - an end beyond length is clamped to length;
   - a negative end has length added to it and is then clamped to 0.

   start is not clamped against length, and no ordering between start
   and end is enforced.

   The expansion is a plain sequence of if-statements (not wrapped in
   do { } while (0)), so it must be used as a statement of its own and
   never as the unbraced body of an if/else or loop. */
#define FIX_START_END(obj)                      \
    if (start < 0)                              \
        start += (obj)->length;                 \
    if (start < 0)                              \
        start = 0;                              \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0)                                \
        end += (obj)->length;                   \
    if (end < 0)                                \
        end = 0;
5069
5070Py_ssize_t PyUnicode_Count(PyObject *str,
5071                           PyObject *substr,
5072                           Py_ssize_t start,
5073                           Py_ssize_t end)
5074{
5075    Py_ssize_t result;
5076    PyUnicodeObject* str_obj;
5077    PyUnicodeObject* sub_obj;
5078
5079    str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5080    if (!str_obj)
5081	return -1;
5082    sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5083    if (!sub_obj) {
5084	Py_DECREF(str_obj);
5085	return -1;
5086    }
5087
5088    FIX_START_END(str_obj);
5089
5090    result = stringlib_count(
5091        str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5092        );
5093
5094    Py_DECREF(sub_obj);
5095    Py_DECREF(str_obj);
5096
5097    return result;
5098}
5099
5100Py_ssize_t PyUnicode_Find(PyObject *str,
5101                          PyObject *sub,
5102                          Py_ssize_t start,
5103                          Py_ssize_t end,
5104                          int direction)
5105{
5106    Py_ssize_t result;
5107
5108    str = PyUnicode_FromObject(str);
5109    if (!str)
5110	return -2;
5111    sub = PyUnicode_FromObject(sub);
5112    if (!sub) {
5113	Py_DECREF(str);
5114	return -2;
5115    }
5116
5117    if (direction > 0)
5118        result = stringlib_find_slice(
5119            PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5120            PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5121            start, end
5122            );
5123    else
5124        result = stringlib_rfind_slice(
5125            PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5126            PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5127            start, end
5128            );
5129
5130    Py_DECREF(str);
5131    Py_DECREF(sub);
5132
5133    return result;
5134}
5135
5136static
5137int tailmatch(PyUnicodeObject *self,
5138	      PyUnicodeObject *substring,
5139	      Py_ssize_t start,
5140	      Py_ssize_t end,
5141	      int direction)
5142{
5143    if (substring->length == 0)
5144        return 1;
5145
5146    FIX_START_END(self);
5147
5148    end -= substring->length;
5149    if (end < start)
5150	return 0;
5151
5152    if (direction > 0) {
5153	if (Py_UNICODE_MATCH(self, end, substring))
5154	    return 1;
5155    } else {
5156        if (Py_UNICODE_MATCH(self, start, substring))
5157	    return 1;
5158    }
5159
5160    return 0;
5161}
5162
5163Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
5164			PyObject *substr,
5165			Py_ssize_t start,
5166			Py_ssize_t end,
5167			int direction)
5168{
5169    Py_ssize_t result;
5170
5171    str = PyUnicode_FromObject(str);
5172    if (str == NULL)
5173	return -1;
5174    substr = PyUnicode_FromObject(substr);
5175    if (substr == NULL) {
5176	Py_DECREF(str);
5177	return -1;
5178    }
5179
5180    result = tailmatch((PyUnicodeObject *)str,
5181		       (PyUnicodeObject *)substr,
5182		       start, end, direction);
5183    Py_DECREF(str);
5184    Py_DECREF(substr);
5185    return result;
5186}
5187
5188/* Apply fixfct filter to the Unicode object self and return a
5189   reference to the modified object */
5190
5191static
5192PyObject *fixup(PyUnicodeObject *self,
5193		int (*fixfct)(PyUnicodeObject *s))
5194{
5195
5196    PyUnicodeObject *u;
5197
5198    u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5199    if (u == NULL)
5200	return NULL;
5201
5202    Py_UNICODE_COPY(u->str, self->str, self->length);
5203
5204    if (!fixfct(u) && PyUnicode_CheckExact(self)) {
5205	/* fixfct should return TRUE if it modified the buffer. If
5206	   FALSE, return a reference to the original buffer instead
5207	   (to save space, not time) */
5208	Py_INCREF(self);
5209	Py_DECREF(u);
5210	return (PyObject*) self;
5211    }
5212    return (PyObject*) u;
5213}
5214
5215static
5216int fixupper(PyUnicodeObject *self)
5217{
5218    Py_ssize_t len = self->length;
5219    Py_UNICODE *s = self->str;
5220    int status = 0;
5221
5222    while (len-- > 0) {
5223	register Py_UNICODE ch;
5224
5225	ch = Py_UNICODE_TOUPPER(*s);
5226	if (ch != *s) {
5227            status = 1;
5228	    *s = ch;
5229	}
5230        s++;
5231    }
5232
5233    return status;
5234}
5235
5236static
5237int fixlower(PyUnicodeObject *self)
5238{
5239    Py_ssize_t len = self->length;
5240    Py_UNICODE *s = self->str;
5241    int status = 0;
5242
5243    while (len-- > 0) {
5244	register Py_UNICODE ch;
5245
5246	ch = Py_UNICODE_TOLOWER(*s);
5247	if (ch != *s) {
5248            status = 1;
5249	    *s = ch;
5250	}
5251        s++;
5252    }
5253
5254    return status;
5255}
5256
5257static
5258int fixswapcase(PyUnicodeObject *self)
5259{
5260    Py_ssize_t len = self->length;
5261    Py_UNICODE *s = self->str;
5262    int status = 0;
5263
5264    while (len-- > 0) {
5265        if (Py_UNICODE_ISUPPER(*s)) {
5266            *s = Py_UNICODE_TOLOWER(*s);
5267            status = 1;
5268        } else if (Py_UNICODE_ISLOWER(*s)) {
5269            *s = Py_UNICODE_TOUPPER(*s);
5270            status = 1;
5271        }
5272        s++;
5273    }
5274
5275    return status;
5276}
5277
5278static
5279int fixcapitalize(PyUnicodeObject *self)
5280{
5281    Py_ssize_t len = self->length;
5282    Py_UNICODE *s = self->str;
5283    int status = 0;
5284
5285    if (len == 0)
5286	return 0;
5287    if (Py_UNICODE_ISLOWER(*s)) {
5288	*s = Py_UNICODE_TOUPPER(*s);
5289	status = 1;
5290    }
5291    s++;
5292    while (--len > 0) {
5293        if (Py_UNICODE_ISUPPER(*s)) {
5294            *s = Py_UNICODE_TOLOWER(*s);
5295            status = 1;
5296        }
5297        s++;
5298    }
5299    return status;
5300}
5301
5302static
5303int fixtitle(PyUnicodeObject *self)
5304{
5305    register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5306    register Py_UNICODE *e;
5307    int previous_is_cased;
5308
5309    /* Shortcut for single character strings */
5310    if (PyUnicode_GET_SIZE(self) == 1) {
5311	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5312	if (*p != ch) {
5313	    *p = ch;
5314	    return 1;
5315	}
5316	else
5317	    return 0;
5318    }
5319
5320    e = p + PyUnicode_GET_SIZE(self);
5321    previous_is_cased = 0;
5322    for (; p < e; p++) {
5323	register const Py_UNICODE ch = *p;
5324
5325	if (previous_is_cased)
5326	    *p = Py_UNICODE_TOLOWER(ch);
5327	else
5328	    *p = Py_UNICODE_TOTITLE(ch);
5329
5330	if (Py_UNICODE_ISLOWER(ch) ||
5331	    Py_UNICODE_ISUPPER(ch) ||
5332	    Py_UNICODE_ISTITLE(ch))
5333	    previous_is_cased = 1;
5334	else
5335	    previous_is_cased = 0;
5336    }
5337    return 1;
5338}
5339
5340PyObject *
5341PyUnicode_Join(PyObject *separator, PyObject *seq)
5342{
5343    PyObject *internal_separator = NULL;
5344    const Py_UNICODE blank = ' ';
5345    const Py_UNICODE *sep = &blank;
5346    Py_ssize_t seplen = 1;
5347    PyUnicodeObject *res = NULL; /* the result */
5348    Py_ssize_t res_alloc = 100;  /* # allocated bytes for string in res */
5349    Py_ssize_t res_used;         /* # used bytes */
5350    Py_UNICODE *res_p;       /* pointer to free byte in res's string area */
5351    PyObject *fseq;          /* PySequence_Fast(seq) */
5352    Py_ssize_t seqlen;              /* len(fseq) -- number of items in sequence */
5353    PyObject *item;
5354    Py_ssize_t i;
5355
5356    fseq = PySequence_Fast(seq, "");
5357    if (fseq == NULL) {
5358    	return NULL;
5359    }
5360
5361    /* Grrrr.  A codec may be invoked to convert str objects to
5362     * Unicode, and so it's possible to call back into Python code
5363     * during PyUnicode_FromObject(), and so it's possible for a sick
5364     * codec to change the size of fseq (if seq is a list).  Therefore
5365     * we have to keep refetching the size -- can't assume seqlen
5366     * is invariant.
5367     */
5368    seqlen = PySequence_Fast_GET_SIZE(fseq);
5369    /* If empty sequence, return u"". */
5370    if (seqlen == 0) {
5371    	res = _PyUnicode_New(0);  /* empty sequence; return u"" */
5372    	goto Done;
5373    }
5374    /* If singleton sequence with an exact Unicode, return that. */
5375    if (seqlen == 1) {
5376	item = PySequence_Fast_GET_ITEM(fseq, 0);
5377	if (PyUnicode_CheckExact(item)) {
5378	    Py_INCREF(item);
5379	    res = (PyUnicodeObject *)item;
5380	    goto Done;
5381	}
5382    }
5383
5384    /* At least two items to join, or one that isn't exact Unicode. */
5385    if (seqlen > 1) {
5386        /* Set up sep and seplen -- they're needed. */
5387    	if (separator == NULL) {
5388	    sep = &blank;
5389	    seplen = 1;
5390        }
5391    	else {
5392	    internal_separator = PyUnicode_FromObject(separator);
5393	    if (internal_separator == NULL)
5394	        goto onError;
5395	    sep = PyUnicode_AS_UNICODE(internal_separator);
5396	    seplen = PyUnicode_GET_SIZE(internal_separator);
5397	    /* In case PyUnicode_FromObject() mutated seq. */
5398	    seqlen = PySequence_Fast_GET_SIZE(fseq);
5399        }
5400    }
5401
5402    /* Get space. */
5403    res = _PyUnicode_New(res_alloc);
5404    if (res == NULL)
5405        goto onError;
5406    res_p = PyUnicode_AS_UNICODE(res);
5407    res_used = 0;
5408
5409    for (i = 0; i < seqlen; ++i) {
5410	Py_ssize_t itemlen;
5411	Py_ssize_t new_res_used;
5412
5413	item = PySequence_Fast_GET_ITEM(fseq, i);
5414	/* Convert item to Unicode. */
5415	if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5416	    PyErr_Format(PyExc_TypeError,
5417			 "sequence item %zd: expected string or Unicode,"
5418			 " %.80s found",
5419			 i, Py_Type(item)->tp_name);
5420	    goto onError;
5421	}
5422	item = PyUnicode_FromObject(item);
5423	if (item == NULL)
5424	    goto onError;
5425	/* We own a reference to item from here on. */
5426
5427	/* In case PyUnicode_FromObject() mutated seq. */
5428	seqlen = PySequence_Fast_GET_SIZE(fseq);
5429
5430        /* Make sure we have enough space for the separator and the item. */
5431	itemlen = PyUnicode_GET_SIZE(item);
5432	new_res_used = res_used + itemlen;
5433	if (new_res_used < 0)
5434	    goto Overflow;
5435	if (i < seqlen - 1) {
5436	    new_res_used += seplen;
5437	    if (new_res_used < 0)
5438		goto Overflow;
5439	}
5440	if (new_res_used > res_alloc) {
5441	    /* double allocated size until it's big enough */
5442	    do {
5443	        res_alloc += res_alloc;
5444	        if (res_alloc <= 0)
5445	            goto Overflow;
5446	    } while (new_res_used > res_alloc);
5447	    if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5448		Py_DECREF(item);
5449		goto onError;
5450	    }
5451            res_p = PyUnicode_AS_UNICODE(res) + res_used;
5452	}
5453
5454	/* Copy item, and maybe the separator. */
5455	Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5456	res_p += itemlen;
5457	if (i < seqlen - 1) {
5458	    Py_UNICODE_COPY(res_p, sep, seplen);
5459	    res_p += seplen;
5460	}
5461	Py_DECREF(item);
5462	res_used = new_res_used;
5463    }
5464
5465    /* Shrink res to match the used area; this probably can't fail,
5466     * but it's cheap to check.
5467     */
5468    if (_PyUnicode_Resize(&res, res_used) < 0)
5469	goto onError;
5470
5471 Done:
5472    Py_XDECREF(internal_separator);
5473    Py_DECREF(fseq);
5474    return (PyObject *)res;
5475
5476 Overflow:
5477    PyErr_SetString(PyExc_OverflowError,
5478                    "join() result is too long for a Python string");
5479    Py_DECREF(item);
5480    /* fall through */
5481
5482 onError:
5483    Py_XDECREF(internal_separator);
5484    Py_DECREF(fseq);
5485    Py_XDECREF(res);
5486    return NULL;
5487}
5488
5489static
5490PyUnicodeObject *pad(PyUnicodeObject *self,
5491		     Py_ssize_t left,
5492		     Py_ssize_t right,
5493		     Py_UNICODE fill)
5494{
5495    PyUnicodeObject *u;
5496
5497    if (left < 0)
5498        left = 0;
5499    if (right < 0)
5500        right = 0;
5501
5502    if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
5503        Py_INCREF(self);
5504        return self;
5505    }
5506
5507    u = _PyUnicode_New(left + self->length + right);
5508    if (u) {
5509        if (left)
5510            Py_UNICODE_FILL(u->str, fill, left);
5511        Py_UNICODE_COPY(u->str + left, self->str, self->length);
5512        if (right)
5513            Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5514    }
5515
5516    return u;
5517}
5518
/* Append the slice data[left:right] to `list` as a new unicode object.
   The expanding function must provide a `PyObject *str` scratch variable,
   the `list` being built and an `onError` label, which is jumped to when
   either the allocation or the append fails.  Wrapped in do { } while (0)
   so the expansion is exactly one statement. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5529
5530static
5531PyObject *split_whitespace(PyUnicodeObject *self,
5532			   PyObject *list,
5533			   Py_ssize_t maxcount)
5534{
5535    register Py_ssize_t i;
5536    register Py_ssize_t j;
5537    Py_ssize_t len = self->length;
5538    PyObject *str;
5539
5540    for (i = j = 0; i < len; ) {
5541	/* find a token */
5542	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5543	    i++;
5544	j = i;
5545	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5546	    i++;
5547	if (j < i) {
5548	    if (maxcount-- <= 0)
5549		break;
5550	    SPLIT_APPEND(self->str, j, i);
5551	    while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5552		i++;
5553	    j = i;
5554	}
5555    }
5556    if (j < len) {
5557	SPLIT_APPEND(self->str, j, len);
5558    }
5559    return list;
5560
5561 onError:
5562    Py_DECREF(list);
5563    return NULL;
5564}
5565
5566PyObject *PyUnicode_Splitlines(PyObject *string,
5567			       int keepends)
5568{
5569    register Py_ssize_t i;
5570    register Py_ssize_t j;
5571    Py_ssize_t len;
5572    PyObject *list;
5573    PyObject *str;
5574    Py_UNICODE *data;
5575
5576    string = PyUnicode_FromObject(string);
5577    if (string == NULL)
5578	return NULL;
5579    data = PyUnicode_AS_UNICODE(string);
5580    len = PyUnicode_GET_SIZE(string);
5581
5582    list = PyList_New(0);
5583    if (!list)
5584        goto onError;
5585
5586    for (i = j = 0; i < len; ) {
5587	Py_ssize_t eol;
5588
5589	/* Find a line and append it */
5590	while (i < len && !BLOOM_LINEBREAK(data[i]))
5591	    i++;
5592
5593	/* Skip the line break reading CRLF as one line break */
5594	eol = i;
5595	if (i < len) {
5596	    if (data[i] == '\r' && i + 1 < len &&
5597		data[i+1] == '\n')
5598		i += 2;
5599	    else
5600		i++;
5601	    if (keepends)
5602		eol = i;
5603	}
5604	SPLIT_APPEND(data, j, eol);
5605	j = i;
5606    }
5607    if (j < len) {
5608	SPLIT_APPEND(data, j, len);
5609    }
5610
5611    Py_DECREF(string);
5612    return list;
5613
5614 onError:
5615    Py_XDECREF(list);
5616    Py_DECREF(string);
5617    return NULL;
5618}
5619
5620static
5621PyObject *split_char(PyUnicodeObject *self,
5622		     PyObject *list,
5623		     Py_UNICODE ch,
5624		     Py_ssize_t maxcount)
5625{
5626    register Py_ssize_t i;
5627    register Py_ssize_t j;
5628    Py_ssize_t len = self->length;
5629    PyObject *str;
5630
5631    for (i = j = 0; i < len; ) {
5632	if (self->str[i] == ch) {
5633	    if (maxcount-- <= 0)
5634		break;
5635	    SPLIT_APPEND(self->str, j, i);
5636	    i = j = i + 1;
5637	} else
5638	    i++;
5639    }
5640    if (j <= len) {
5641	SPLIT_APPEND(self->str, j, len);
5642    }
5643    return list;
5644
5645 onError:
5646    Py_DECREF(list);
5647    return NULL;
5648}
5649
5650static
5651PyObject *split_substring(PyUnicodeObject *self,
5652			  PyObject *list,
5653			  PyUnicodeObject *substring,
5654			  Py_ssize_t maxcount)
5655{
5656    register Py_ssize_t i;
5657    register Py_ssize_t j;
5658    Py_ssize_t len = self->length;
5659    Py_ssize_t sublen = substring->length;
5660    PyObject *str;
5661
5662    for (i = j = 0; i <= len - sublen; ) {
5663	if (Py_UNICODE_MATCH(self, i, substring)) {
5664	    if (maxcount-- <= 0)
5665		break;
5666	    SPLIT_APPEND(self->str, j, i);
5667	    i = j = i + sublen;
5668	} else
5669	    i++;
5670    }
5671    if (j <= len) {
5672	SPLIT_APPEND(self->str, j, len);
5673    }
5674    return list;
5675
5676 onError:
5677    Py_DECREF(list);
5678    return NULL;
5679}
5680
5681static
5682PyObject *rsplit_whitespace(PyUnicodeObject *self,
5683			    PyObject *list,
5684			    Py_ssize_t maxcount)
5685{
5686    register Py_ssize_t i;
5687    register Py_ssize_t j;
5688    Py_ssize_t len = self->length;
5689    PyObject *str;
5690
5691    for (i = j = len - 1; i >= 0; ) {
5692	/* find a token */
5693	while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5694	    i--;
5695	j = i;
5696	while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5697	    i--;
5698	if (j > i) {
5699	    if (maxcount-- <= 0)
5700		break;
5701	    SPLIT_APPEND(self->str, i + 1, j + 1);
5702	    while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5703		i--;
5704	    j = i;
5705	}
5706    }
5707    if (j >= 0) {
5708	SPLIT_APPEND(self->str, 0, j + 1);
5709    }
5710    if (PyList_Reverse(list) < 0)
5711        goto onError;
5712    return list;
5713
5714 onError:
5715    Py_DECREF(list);
5716    return NULL;
5717}
5718
5719static
5720PyObject *rsplit_char(PyUnicodeObject *self,
5721		      PyObject *list,
5722		      Py_UNICODE ch,
5723		      Py_ssize_t maxcount)
5724{
5725    register Py_ssize_t i;
5726    register Py_ssize_t j;
5727    Py_ssize_t len = self->length;
5728    PyObject *str;
5729
5730    for (i = j = len - 1; i >= 0; ) {
5731	if (self->str[i] == ch) {
5732	    if (maxcount-- <= 0)
5733		break;
5734	    SPLIT_APPEND(self->str, i + 1, j + 1);
5735	    j = i = i - 1;
5736	} else
5737	    i--;
5738    }
5739    if (j >= -1) {
5740	SPLIT_APPEND(self->str, 0, j + 1);
5741    }
5742    if (PyList_Reverse(list) < 0)
5743        goto onError;
5744    return list;
5745
5746 onError:
5747    Py_DECREF(list);
5748    return NULL;
5749}
5750
5751static
5752PyObject *rsplit_substring(PyUnicodeObject *self,
5753			   PyObject *list,
5754			   PyUnicodeObject *substring,
5755			   Py_ssize_t maxcount)
5756{
5757    register Py_ssize_t i;
5758    register Py_ssize_t j;
5759    Py_ssize_t len = self->length;
5760    Py_ssize_t sublen = substring->length;
5761    PyObject *str;
5762
5763    for (i = len - sublen, j = len; i >= 0; ) {
5764	if (Py_UNICODE_MATCH(self, i, substring)) {
5765	    if (maxcount-- <= 0)
5766		break;
5767	    SPLIT_APPEND(self->str, i + sublen, j);
5768	    j = i;
5769	    i -= sublen;
5770	} else
5771	    i--;
5772    }
5773    if (j >= 0) {
5774	SPLIT_APPEND(self->str, 0, j);
5775    }
5776    if (PyList_Reverse(list) < 0)
5777        goto onError;
5778    return list;
5779
5780 onError:
5781    Py_DECREF(list);
5782    return NULL;
5783}
5784
5785#undef SPLIT_APPEND
5786
5787static
5788PyObject *split(PyUnicodeObject *self,
5789		PyUnicodeObject *substring,
5790		Py_ssize_t maxcount)
5791{
5792    PyObject *list;
5793
5794    if (maxcount < 0)
5795        maxcount = PY_SSIZE_T_MAX;
5796
5797    list = PyList_New(0);
5798    if (!list)
5799        return NULL;
5800
5801    if (substring == NULL)
5802	return split_whitespace(self,list,maxcount);
5803
5804    else if (substring->length == 1)
5805	return split_char(self,list,substring->str[0],maxcount);
5806
5807    else if (substring->length == 0) {
5808	Py_DECREF(list);
5809	PyErr_SetString(PyExc_ValueError, "empty separator");
5810	return NULL;
5811    }
5812    else
5813	return split_substring(self,list,substring,maxcount);
5814}
5815
5816static
5817PyObject *rsplit(PyUnicodeObject *self,
5818		 PyUnicodeObject *substring,
5819		 Py_ssize_t maxcount)
5820{
5821    PyObject *list;
5822
5823    if (maxcount < 0)
5824        maxcount = PY_SSIZE_T_MAX;
5825
5826    list = PyList_New(0);
5827    if (!list)
5828        return NULL;
5829
5830    if (substring == NULL)
5831	return rsplit_whitespace(self,list,maxcount);
5832
5833    else if (substring->length == 1)
5834	return rsplit_char(self,list,substring->str[0],maxcount);
5835
5836    else if (substring->length == 0) {
5837	Py_DECREF(list);
5838	PyErr_SetString(PyExc_ValueError, "empty separator");
5839	return NULL;
5840    }
5841    else
5842	return rsplit_substring(self,list,substring,maxcount);
5843}
5844
5845static
5846PyObject *replace(PyUnicodeObject *self,
5847		  PyUnicodeObject *str1,
5848		  PyUnicodeObject *str2,
5849		  Py_ssize_t maxcount)
5850{
5851    PyUnicodeObject *u;
5852
5853    if (maxcount < 0)
5854	maxcount = PY_SSIZE_T_MAX;
5855
5856    if (str1->length == str2->length) {
5857        /* same length */
5858        Py_ssize_t i;
5859        if (str1->length == 1) {
5860            /* replace characters */
5861            Py_UNICODE u1, u2;
5862            if (!findchar(self->str, self->length, str1->str[0]))
5863                goto nothing;
5864            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5865            if (!u)
5866                return NULL;
5867            Py_UNICODE_COPY(u->str, self->str, self->length);
5868            u1 = str1->str[0];
5869            u2 = str2->str[0];
5870            for (i = 0; i < u->length; i++)
5871                if (u->str[i] == u1) {
5872                    if (--maxcount < 0)
5873                        break;
5874                    u->str[i] = u2;
5875                }
5876        } else {
5877            i = fastsearch(
5878                self->str, self->length, str1->str, str1->length, FAST_SEARCH
5879                );
5880            if (i < 0)
5881                goto nothing;
5882            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5883            if (!u)
5884                return NULL;
5885            Py_UNICODE_COPY(u->str, self->str, self->length);
5886            while (i <= self->length - str1->length)
5887                if (Py_UNICODE_MATCH(self, i, str1)) {
5888                    if (--maxcount < 0)
5889                        break;
5890                    Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5891                    i += str1->length;
5892                } else
5893                    i++;
5894        }
5895    } else {
5896
5897        Py_ssize_t n, i, j, e;
5898        Py_ssize_t product, new_size, delta;
5899        Py_UNICODE *p;
5900
5901        /* replace strings */
5902        n = stringlib_count(self->str, self->length, str1->str, str1->length);
5903        if (n > maxcount)
5904            n = maxcount;
5905        if (n == 0)
5906            goto nothing;
5907        /* new_size = self->length + n * (str2->length - str1->length)); */
5908        delta = (str2->length - str1->length);
5909        if (delta == 0) {
5910            new_size = self->length;
5911        } else {
5912            product = n * (str2->length - str1->length);
5913            if ((product / (str2->length - str1->length)) != n) {
5914                PyErr_SetString(PyExc_OverflowError,
5915                                "replace string is too long");
5916                return NULL;
5917            }
5918            new_size = self->length + product;
5919            if (new_size < 0) {
5920                PyErr_SetString(PyExc_OverflowError,
5921                                "replace string is too long");
5922                return NULL;
5923            }
5924        }
5925        u = _PyUnicode_New(new_size);
5926        if (!u)
5927            return NULL;
5928        i = 0;
5929        p = u->str;
5930        e = self->length - str1->length;
5931        if (str1->length > 0) {
5932            while (n-- > 0) {
5933                /* look for next match */
5934                j = i;
5935                while (j <= e) {
5936                    if (Py_UNICODE_MATCH(self, j, str1))
5937                        break;
5938                    j++;
5939                }
5940		if (j > i) {
5941                    if (j > e)
5942                        break;
5943                    /* copy unchanged part [i:j] */
5944                    Py_UNICODE_COPY(p, self->str+i, j-i);
5945                    p += j - i;
5946                }
5947                /* copy substitution string */
5948                if (str2->length > 0) {
5949                    Py_UNICODE_COPY(p, str2->str, str2->length);
5950                    p += str2->length;
5951                }
5952                i = j + str1->length;
5953            }
5954            if (i < self->length)
5955                /* copy tail [i:] */
5956                Py_UNICODE_COPY(p, self->str+i, self->length-i);
5957        } else {
5958            /* interleave */
5959            while (n > 0) {
5960                Py_UNICODE_COPY(p, str2->str, str2->length);
5961                p += str2->length;
5962                if (--n <= 0)
5963                    break;
5964                *p++ = self->str[i++];
5965            }
5966            Py_UNICODE_COPY(p, self->str+i, self->length-i);
5967        }
5968    }
5969    return (PyObject *) u;
5970
5971nothing:
5972    /* nothing to replace; return original string (when possible) */
5973    if (PyUnicode_CheckExact(self)) {
5974        Py_INCREF(self);
5975        return (PyObject *) self;
5976    }
5977    return PyUnicode_FromUnicode(self->str, self->length);
5978}
5979
5980/* --- Unicode Object Methods --------------------------------------------- */
5981
5982PyDoc_STRVAR(title__doc__,
5983"S.title() -> unicode\n\
5984\n\
5985Return a titlecased version of S, i.e. words start with title case\n\
5986characters, all remaining cased characters have lower case.");
5987
5988static PyObject*
5989unicode_title(PyUnicodeObject *self)
5990{
5991    return fixup(self, fixtitle);
5992}
5993
5994PyDoc_STRVAR(capitalize__doc__,
5995"S.capitalize() -> unicode\n\
5996\n\
5997Return a capitalized version of S, i.e. make the first character\n\
5998have upper case.");
5999
6000static PyObject*
6001unicode_capitalize(PyUnicodeObject *self)
6002{
6003    return fixup(self, fixcapitalize);
6004}
6005
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Compiled out.  Splits self on whitespace, capitalizes every word via
   fixup()/fixcapitalize and joins the words again with the default
   separator of PyUnicode_Join().  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list slot */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *word = PyList_GET_ITEM(words, idx);
        PyObject *capitalized = fixup((PyUnicodeObject *)word,
                                      fixcapitalize);

        if (capitalized == NULL)
            goto done;          /* joined is still NULL */
        Py_DECREF(word);
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return joined;
}
#endif
6043
6044/* Argument converter.  Coerces to a single unicode character */
6045
6046static int
6047convert_uc(PyObject *obj, void *addr)
6048{
6049	Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6050	PyObject *uniobj;
6051	Py_UNICODE *unistr;
6052
6053	uniobj = PyUnicode_FromObject(obj);
6054	if (uniobj == NULL) {
6055		PyErr_SetString(PyExc_TypeError,
6056			"The fill character cannot be converted to Unicode");
6057		return 0;
6058	}
6059	if (PyUnicode_GET_SIZE(uniobj) != 1) {
6060		PyErr_SetString(PyExc_TypeError,
6061			"The fill character must be exactly one character long");
6062		Py_DECREF(uniobj);
6063		return 0;
6064	}
6065	unistr = PyUnicode_AS_UNICODE(uniobj);
6066	*fillcharloc = unistr[0];
6067	Py_DECREF(uniobj);
6068	return 1;
6069}
6070
6071PyDoc_STRVAR(center__doc__,
6072"S.center(width[, fillchar]) -> unicode\n\
6073\n\
6074Return S centered in a Unicode string of length width. Padding is\n\
6075done using the specified fill character (default is a space)");
6076
6077static PyObject *
6078unicode_center(PyUnicodeObject *self, PyObject *args)
6079{
6080    Py_ssize_t marg, left;
6081    Py_ssize_t width;
6082    Py_UNICODE fillchar = ' ';
6083
6084    if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
6085        return NULL;
6086
6087    if (self->length >= width && PyUnicode_CheckExact(self)) {
6088        Py_INCREF(self);
6089        return (PyObject*) self;
6090    }
6091
6092    marg = width - self->length;
6093    left = marg / 2 + (marg & width & 1);
6094
6095    return (PyObject*) pad(self, left, marg - left, fillchar);
6096}
6097
6098#if 0
6099
6100/* This code should go into some future Unicode collation support
6101   module. The basic comparison should compare ordinals on a naive
6102   basis (this is what Java does and thus JPython too). */
6103
6104/* speedy UTF-16 code point order comparison */
6105/* gleaned from: */
6106/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6107
6108static short utf16Fixup[32] =
6109{
6110    0, 0, 0, 0, 0, 0, 0, 0,
6111    0, 0, 0, 0, 0, 0, 0, 0,
6112    0, 0, 0, 0, 0, 0, 0, 0,
6113    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
6114};
6115
6116static int
6117unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6118{
6119    Py_ssize_t len1, len2;
6120
6121    Py_UNICODE *s1 = str1->str;
6122    Py_UNICODE *s2 = str2->str;
6123
6124    len1 = str1->length;
6125    len2 = str2->length;
6126
6127    while (len1 > 0 && len2 > 0) {
6128        Py_UNICODE c1, c2;
6129
6130        c1 = *s1++;
6131        c2 = *s2++;
6132
6133	if (c1 > (1<<11) * 26)
6134	    c1 += utf16Fixup[c1>>11];
6135	if (c2 > (1<<11) * 26)
6136            c2 += utf16Fixup[c2>>11];
6137        /* now c1 and c2 are in UTF-32-compatible order */
6138
6139        if (c1 != c2)
6140            return (c1 < c2) ? -1 : 1;
6141
6142        len1--; len2--;
6143    }
6144
6145    return (len1 < len2) ? -1 : (len1 != len2);
6146}
6147
6148#else
6149
6150static int
6151unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6152{
6153    register Py_ssize_t len1, len2;
6154
6155    Py_UNICODE *s1 = str1->str;
6156    Py_UNICODE *s2 = str2->str;
6157
6158    len1 = str1->length;
6159    len2 = str2->length;
6160
6161    while (len1 > 0 && len2 > 0) {
6162        Py_UNICODE c1, c2;
6163
6164        c1 = *s1++;
6165        c2 = *s2++;
6166
6167        if (c1 != c2)
6168            return (c1 < c2) ? -1 : 1;
6169
6170        len1--; len2--;
6171    }
6172
6173    return (len1 < len2) ? -1 : (len1 != len2);
6174}
6175
6176#endif
6177
6178int PyUnicode_Compare(PyObject *left,
6179		      PyObject *right)
6180{
6181    if (PyUnicode_Check(left) && PyUnicode_Check(right))
6182        return unicode_compare((PyUnicodeObject *)left,
6183                               (PyUnicodeObject *)right);
6184    if ((PyString_Check(left) && PyUnicode_Check(right)) ||
6185        (PyUnicode_Check(left) && PyString_Check(right))) {
6186        if (PyUnicode_Check(left))
6187            left = _PyUnicode_AsDefaultEncodedString(left, NULL);
6188        if (PyUnicode_Check(right))
6189            right = _PyUnicode_AsDefaultEncodedString(right, NULL);
6190        assert(PyString_Check(left));
6191        assert(PyString_Check(right));
6192        return PyObject_Compare(left, right);
6193    }
6194    PyErr_Format(PyExc_TypeError,
6195                 "Can't compare %.100s and %.100s",
6196                 left->ob_type->tp_name,
6197                 right->ob_type->tp_name);
6198    return -1;
6199}
6200
6201int
6202PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6203{
6204    int i;
6205    Py_UNICODE *id;
6206    assert(PyUnicode_Check(uni));
6207    id = PyUnicode_AS_UNICODE(uni);
6208    /* Compare Unicode string and source character set string */
6209    for (i = 0; id[i] && str[i]; i++)
6210	if (id[i] != str[i])
6211	    return ((int)id[i] < (int)str[i]) ? -1 : 1;
6212    if (id[i])
6213	return 1; /* uni is longer */
6214    if (str[i])
6215	return -1; /* str is longer */
6216    return 0;
6217}
6218
6219PyObject *PyUnicode_RichCompare(PyObject *left,
6220                                PyObject *right,
6221                                int op)
6222{
6223    int result;
6224
6225    result = PyUnicode_Compare(left, right);
6226    if (result == -1 && PyErr_Occurred())
6227        goto onError;
6228
6229    /* Convert the return value to a Boolean */
6230    switch (op) {
6231    case Py_EQ:
6232        result = (result == 0);
6233        break;
6234    case Py_NE:
6235        result = (result != 0);
6236        break;
6237    case Py_LE:
6238        result = (result <= 0);
6239        break;
6240    case Py_GE:
6241        result = (result >= 0);
6242        break;
6243    case Py_LT:
6244        result = (result == -1);
6245        break;
6246    case Py_GT:
6247        result = (result == 1);
6248        break;
6249    }
6250    return PyBool_FromLong(result);
6251
6252 onError:
6253
6254    /* Standard case
6255
6256       Type errors mean that PyUnicode_FromObject() could not convert
6257       one of the arguments (usually the right hand side) to Unicode,
6258       ie. we can't handle the comparison request. However, it is
6259       possible that the other object knows a comparison method, which
6260       is why we return Py_NotImplemented to give the other object a
6261       chance.
6262
6263    */
6264    if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6265        PyErr_Clear();
6266        Py_INCREF(Py_NotImplemented);
6267        return Py_NotImplemented;
6268    }
6269    if (op != Py_EQ && op != Py_NE)
6270        return NULL;
6271
6272    /* Equality comparison.
6273
6274       This is a special case: we silence any PyExc_UnicodeDecodeError
6275       and instead turn it into a PyErr_UnicodeWarning.
6276
6277    */
6278    if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6279        return NULL;
6280    PyErr_Clear();
6281    if (PyErr_WarnEx(PyExc_UnicodeWarning,
6282                     (op == Py_EQ) ?
6283                     "Unicode equal comparison "
6284                     "failed to convert both arguments to Unicode - "
6285                     "interpreting them as being unequal"
6286                     :
6287                     "Unicode unequal comparison "
6288                     "failed to convert both arguments to Unicode - "
6289                     "interpreting them as being unequal",
6290                     1) < 0)
6291        return NULL;
6292    result = (op == Py_NE);
6293    return PyBool_FromLong(result);
6294}
6295
6296int PyUnicode_Contains(PyObject *container,
6297		       PyObject *element)
6298{
6299    PyObject *str, *sub;
6300    int result;
6301
6302    /* Coerce the two arguments */
6303    sub = PyUnicode_FromObject(element);
6304    if (!sub) {
6305	PyErr_Format(PyExc_TypeError,
6306	    "'in <string>' requires string as left operand, not %s",
6307	    element->ob_type->tp_name);
6308        return -1;
6309    }
6310
6311    str = PyUnicode_FromObject(container);
6312    if (!str) {
6313        Py_DECREF(sub);
6314        return -1;
6315    }
6316
6317    result = stringlib_contains_obj(str, sub);
6318
6319    Py_DECREF(str);
6320    Py_DECREF(sub);
6321
6322    return result;
6323}
6324
6325/* Concat to string or Unicode object giving a new Unicode object. */
6326
6327PyObject *PyUnicode_Concat(PyObject *left,
6328			   PyObject *right)
6329{
6330    PyUnicodeObject *u = NULL, *v = NULL, *w;
6331
6332    if (PyBytes_Check(left) || PyBytes_Check(right))
6333        return PyBytes_Concat(left, right);
6334
6335    /* Coerce the two arguments */
6336    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6337    if (u == NULL)
6338	goto onError;
6339    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6340    if (v == NULL)
6341	goto onError;
6342
6343    /* Shortcuts */
6344    if (v == unicode_empty) {
6345	Py_DECREF(v);
6346	return (PyObject *)u;
6347    }
6348    if (u == unicode_empty) {
6349	Py_DECREF(u);
6350	return (PyObject *)v;
6351    }
6352
6353    /* Concat the two Unicode strings */
6354    w = _PyUnicode_New(u->length + v->length);
6355    if (w == NULL)
6356	goto onError;
6357    Py_UNICODE_COPY(w->str, u->str, u->length);
6358    Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6359
6360    Py_DECREF(u);
6361    Py_DECREF(v);
6362    return (PyObject *)w;
6363
6364onError:
6365    Py_XDECREF(u);
6366    Py_XDECREF(v);
6367    return NULL;
6368}
6369
6370void
6371PyUnicode_Append(PyObject **pleft, PyObject *right)
6372{
6373	PyObject *new;
6374	if (*pleft == NULL)
6375		return;
6376	if (right == NULL || !PyUnicode_Check(*pleft)) {
6377		Py_DECREF(*pleft);
6378		*pleft = NULL;
6379		return;
6380	}
6381	new = PyUnicode_Concat(*pleft, right);
6382	Py_DECREF(*pleft);
6383	*pleft = new;
6384}
6385
6386void
6387PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6388{
6389	PyUnicode_Append(pleft, right);
6390	Py_XDECREF(right);
6391}
6392
6393PyDoc_STRVAR(count__doc__,
6394"S.count(sub[, start[, end]]) -> int\n\
6395\n\
6396Return the number of non-overlapping occurrences of substring sub in\n\
6397Unicode string S[start:end].  Optional arguments start and end are\n\
6398interpreted as in slice notation.");
6399
6400static PyObject *
6401unicode_count(PyUnicodeObject *self, PyObject *args)
6402{
6403    PyUnicodeObject *substring;
6404    Py_ssize_t start = 0;
6405    Py_ssize_t end = PY_SSIZE_T_MAX;
6406    PyObject *result;
6407
6408    if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6409		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6410        return NULL;
6411
6412    substring = (PyUnicodeObject *)PyUnicode_FromObject(
6413        (PyObject *)substring);
6414    if (substring == NULL)
6415	return NULL;
6416
6417    FIX_START_END(self);
6418
6419    result = PyInt_FromSsize_t(
6420        stringlib_count(self->str + start, end - start,
6421                        substring->str, substring->length)
6422        );
6423
6424    Py_DECREF(substring);
6425
6426    return result;
6427}
6428
6429PyDoc_STRVAR(encode__doc__,
6430"S.encode([encoding[,errors]]) -> string or unicode\n\
6431\n\
6432Encodes S using the codec registered for encoding. encoding defaults\n\
6433to the default encoding. errors may be given to set a different error\n\
6434handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6435a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6436'xmlcharrefreplace' as well as any other name registered with\n\
6437codecs.register_error that can handle UnicodeEncodeErrors.");
6438
6439static PyObject *
6440unicode_encode(PyUnicodeObject *self, PyObject *args)
6441{
6442    char *encoding = NULL;
6443    char *errors = NULL;
6444    PyObject *v;
6445
6446    if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6447        return NULL;
6448    v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
6449    if (v == NULL)
6450        goto onError;
6451    if (!PyBytes_Check(v)) {
6452        PyErr_Format(PyExc_TypeError,
6453                     "encoder did not return a bytes object "
6454                     "(type=%.400s)",
6455                     Py_Type(v)->tp_name);
6456        Py_DECREF(v);
6457        return NULL;
6458    }
6459    return v;
6460
6461 onError:
6462    return NULL;
6463}
6464
6465PyDoc_STRVAR(expandtabs__doc__,
6466"S.expandtabs([tabsize]) -> unicode\n\
6467\n\
6468Return a copy of S where all tab characters are expanded using spaces.\n\
6469If tabsize is not given, a tab size of 8 characters is assumed.");
6470
6471static PyObject*
6472unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6473{
6474    Py_UNICODE *e;
6475    Py_UNICODE *p;
6476    Py_UNICODE *q;
6477    Py_ssize_t i, j, old_j;
6478    PyUnicodeObject *u;
6479    int tabsize = 8;
6480
6481    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6482	return NULL;
6483
6484    /* First pass: determine size of output string */
6485    i = j = old_j = 0;
6486    e = self->str + self->length;
6487    for (p = self->str; p < e; p++)
6488        if (*p == '\t') {
6489	    if (tabsize > 0) {
6490		j += tabsize - (j % tabsize);
6491		if (old_j > j) {
6492		    PyErr_SetString(PyExc_OverflowError,
6493				    "new string is too long");
6494		    return NULL;
6495		}
6496		old_j = j;
6497	    }
6498	}
6499        else {
6500            j++;
6501            if (*p == '\n' || *p == '\r') {
6502                i += j;
6503                old_j = j = 0;
6504                if (i < 0) {
6505                    PyErr_SetString(PyExc_OverflowError,
6506                                    "new string is too long");
6507                    return NULL;
6508                }
6509            }
6510        }
6511
6512    if ((i + j) < 0) {
6513        PyErr_SetString(PyExc_OverflowError, "new string is too long");
6514        return NULL;
6515    }
6516
6517    /* Second pass: create output string and fill it */
6518    u = _PyUnicode_New(i + j);
6519    if (!u)
6520        return NULL;
6521
6522    j = 0;
6523    q = u->str;
6524
6525    for (p = self->str; p < e; p++)
6526        if (*p == '\t') {
6527	    if (tabsize > 0) {
6528		i = tabsize - (j % tabsize);
6529		j += i;
6530		while (i--)
6531		    *q++ = ' ';
6532	    }
6533	}
6534	else {
6535            j++;
6536	    *q++ = *p;
6537            if (*p == '\n' || *p == '\r')
6538                j = 0;
6539        }
6540
6541    return (PyObject*) u;
6542}
6543
6544PyDoc_STRVAR(find__doc__,
6545"S.find(sub [,start [,end]]) -> int\n\
6546\n\
6547Return the lowest index in S where substring sub is found,\n\
6548such that sub is contained within s[start:end].  Optional\n\
6549arguments start and end are interpreted as in slice notation.\n\
6550\n\
6551Return -1 on failure.");
6552
6553static PyObject *
6554unicode_find(PyUnicodeObject *self, PyObject *args)
6555{
6556    PyObject *substring;
6557    Py_ssize_t start = 0;
6558    Py_ssize_t end = PY_SSIZE_T_MAX;
6559    Py_ssize_t result;
6560
6561    if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
6562		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6563        return NULL;
6564    substring = PyUnicode_FromObject(substring);
6565    if (!substring)
6566	return NULL;
6567
6568    result = stringlib_find_slice(
6569        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6570        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6571        start, end
6572        );
6573
6574    Py_DECREF(substring);
6575
6576    return PyInt_FromSsize_t(result);
6577}
6578
6579static PyObject *
6580unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
6581{
6582    if (index < 0 || index >= self->length) {
6583        PyErr_SetString(PyExc_IndexError, "string index out of range");
6584        return NULL;
6585    }
6586
6587    return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6588}
6589
6590/* Believe it or not, this produces the same value for ASCII strings
6591   as string_hash(). */
6592static long
6593unicode_hash(PyUnicodeObject *self)
6594{
6595    Py_ssize_t len;
6596    Py_UNICODE *p;
6597    long x;
6598
6599    if (self->hash != -1)
6600        return self->hash;
6601    len = Py_Size(self);
6602    p = self->str;
6603    x = *p << 7;
6604    while (--len >= 0)
6605        x = (1000003*x) ^ *p++;
6606    x ^= Py_Size(self);
6607    if (x == -1)
6608        x = -2;
6609    self->hash = x;
6610    return x;
6611}
6612
6613PyDoc_STRVAR(index__doc__,
6614"S.index(sub [,start [,end]]) -> int\n\
6615\n\
6616Like S.find() but raise ValueError when the substring is not found.");
6617
6618static PyObject *
6619unicode_index(PyUnicodeObject *self, PyObject *args)
6620{
6621    Py_ssize_t result;
6622    PyObject *substring;
6623    Py_ssize_t start = 0;
6624    Py_ssize_t end = PY_SSIZE_T_MAX;
6625
6626    if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
6627		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6628        return NULL;
6629    substring = PyUnicode_FromObject(substring);
6630    if (!substring)
6631	return NULL;
6632
6633    result = stringlib_find_slice(
6634        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6635        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6636        start, end
6637        );
6638
6639    Py_DECREF(substring);
6640
6641    if (result < 0) {
6642        PyErr_SetString(PyExc_ValueError, "substring not found");
6643        return NULL;
6644    }
6645
6646    return PyInt_FromSsize_t(result);
6647}
6648
6649PyDoc_STRVAR(islower__doc__,
6650"S.islower() -> bool\n\
6651\n\
6652Return True if all cased characters in S are lowercase and there is\n\
6653at least one cased character in S, False otherwise.");
6654
6655static PyObject*
6656unicode_islower(PyUnicodeObject *self)
6657{
6658    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6659    register const Py_UNICODE *e;
6660    int cased;
6661
6662    /* Shortcut for single character strings */
6663    if (PyUnicode_GET_SIZE(self) == 1)
6664	return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
6665
6666    /* Special case for empty strings */
6667    if (PyUnicode_GET_SIZE(self) == 0)
6668	return PyBool_FromLong(0);
6669
6670    e = p + PyUnicode_GET_SIZE(self);
6671    cased = 0;
6672    for (; p < e; p++) {
6673	register const Py_UNICODE ch = *p;
6674
6675	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6676	    return PyBool_FromLong(0);
6677	else if (!cased && Py_UNICODE_ISLOWER(ch))
6678	    cased = 1;
6679    }
6680    return PyBool_FromLong(cased);
6681}
6682
6683PyDoc_STRVAR(isupper__doc__,
6684"S.isupper() -> bool\n\
6685\n\
6686Return True if all cased characters in S are uppercase and there is\n\
6687at least one cased character in S, False otherwise.");
6688
6689static PyObject*
6690unicode_isupper(PyUnicodeObject *self)
6691{
6692    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6693    register const Py_UNICODE *e;
6694    int cased;
6695
6696    /* Shortcut for single character strings */
6697    if (PyUnicode_GET_SIZE(self) == 1)
6698	return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
6699
6700    /* Special case for empty strings */
6701    if (PyUnicode_GET_SIZE(self) == 0)
6702	return PyBool_FromLong(0);
6703
6704    e = p + PyUnicode_GET_SIZE(self);
6705    cased = 0;
6706    for (; p < e; p++) {
6707	register const Py_UNICODE ch = *p;
6708
6709	if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6710	    return PyBool_FromLong(0);
6711	else if (!cased && Py_UNICODE_ISUPPER(ch))
6712	    cased = 1;
6713    }
6714    return PyBool_FromLong(cased);
6715}
6716
6717PyDoc_STRVAR(istitle__doc__,
6718"S.istitle() -> bool\n\
6719\n\
6720Return True if S is a titlecased string and there is at least one\n\
6721character in S, i.e. upper- and titlecase characters may only\n\
6722follow uncased characters and lowercase characters only cased ones.\n\
6723Return False otherwise.");
6724
6725static PyObject*
6726unicode_istitle(PyUnicodeObject *self)
6727{
6728    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6729    register const Py_UNICODE *e;
6730    int cased, previous_is_cased;
6731
6732    /* Shortcut for single character strings */
6733    if (PyUnicode_GET_SIZE(self) == 1)
6734	return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6735			       (Py_UNICODE_ISUPPER(*p) != 0));
6736
6737    /* Special case for empty strings */
6738    if (PyUnicode_GET_SIZE(self) == 0)
6739	return PyBool_FromLong(0);
6740
6741    e = p + PyUnicode_GET_SIZE(self);
6742    cased = 0;
6743    previous_is_cased = 0;
6744    for (; p < e; p++) {
6745	register const Py_UNICODE ch = *p;
6746
6747	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6748	    if (previous_is_cased)
6749		return PyBool_FromLong(0);
6750	    previous_is_cased = 1;
6751	    cased = 1;
6752	}
6753	else if (Py_UNICODE_ISLOWER(ch)) {
6754	    if (!previous_is_cased)
6755		return PyBool_FromLong(0);
6756	    previous_is_cased = 1;
6757	    cased = 1;
6758	}
6759	else
6760	    previous_is_cased = 0;
6761    }
6762    return PyBool_FromLong(cased);
6763}
6764
6765PyDoc_STRVAR(isspace__doc__,
6766"S.isspace() -> bool\n\
6767\n\
6768Return True if all characters in S are whitespace\n\
6769and there is at least one character in S, False otherwise.");
6770
6771static PyObject*
6772unicode_isspace(PyUnicodeObject *self)
6773{
6774    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6775    register const Py_UNICODE *e;
6776
6777    /* Shortcut for single character strings */
6778    if (PyUnicode_GET_SIZE(self) == 1 &&
6779	Py_UNICODE_ISSPACE(*p))
6780	return PyBool_FromLong(1);
6781
6782    /* Special case for empty strings */
6783    if (PyUnicode_GET_SIZE(self) == 0)
6784	return PyBool_FromLong(0);
6785
6786    e = p + PyUnicode_GET_SIZE(self);
6787    for (; p < e; p++) {
6788	if (!Py_UNICODE_ISSPACE(*p))
6789	    return PyBool_FromLong(0);
6790    }
6791    return PyBool_FromLong(1);
6792}
6793
6794PyDoc_STRVAR(isalpha__doc__,
6795"S.isalpha() -> bool\n\
6796\n\
6797Return True if all characters in S are alphabetic\n\
6798and there is at least one character in S, False otherwise.");
6799
6800static PyObject*
6801unicode_isalpha(PyUnicodeObject *self)
6802{
6803    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6804    register const Py_UNICODE *e;
6805
6806    /* Shortcut for single character strings */
6807    if (PyUnicode_GET_SIZE(self) == 1 &&
6808	Py_UNICODE_ISALPHA(*p))
6809	return PyBool_FromLong(1);
6810
6811    /* Special case for empty strings */
6812    if (PyUnicode_GET_SIZE(self) == 0)
6813	return PyBool_FromLong(0);
6814
6815    e = p + PyUnicode_GET_SIZE(self);
6816    for (; p < e; p++) {
6817	if (!Py_UNICODE_ISALPHA(*p))
6818	    return PyBool_FromLong(0);
6819    }
6820    return PyBool_FromLong(1);
6821}
6822
6823PyDoc_STRVAR(isalnum__doc__,
6824"S.isalnum() -> bool\n\
6825\n\
6826Return True if all characters in S are alphanumeric\n\
6827and there is at least one character in S, False otherwise.");
6828
6829static PyObject*
6830unicode_isalnum(PyUnicodeObject *self)
6831{
6832    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6833    register const Py_UNICODE *e;
6834
6835    /* Shortcut for single character strings */
6836    if (PyUnicode_GET_SIZE(self) == 1 &&
6837	Py_UNICODE_ISALNUM(*p))
6838	return PyBool_FromLong(1);
6839
6840    /* Special case for empty strings */
6841    if (PyUnicode_GET_SIZE(self) == 0)
6842	return PyBool_FromLong(0);
6843
6844    e = p + PyUnicode_GET_SIZE(self);
6845    for (; p < e; p++) {
6846	if (!Py_UNICODE_ISALNUM(*p))
6847	    return PyBool_FromLong(0);
6848    }
6849    return PyBool_FromLong(1);
6850}
6851
6852PyDoc_STRVAR(isdecimal__doc__,
6853"S.isdecimal() -> bool\n\
6854\n\
6855Return True if there are only decimal characters in S,\n\
6856False otherwise.");
6857
6858static PyObject*
6859unicode_isdecimal(PyUnicodeObject *self)
6860{
6861    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6862    register const Py_UNICODE *e;
6863
6864    /* Shortcut for single character strings */
6865    if (PyUnicode_GET_SIZE(self) == 1 &&
6866	Py_UNICODE_ISDECIMAL(*p))
6867	return PyBool_FromLong(1);
6868
6869    /* Special case for empty strings */
6870    if (PyUnicode_GET_SIZE(self) == 0)
6871	return PyBool_FromLong(0);
6872
6873    e = p + PyUnicode_GET_SIZE(self);
6874    for (; p < e; p++) {
6875	if (!Py_UNICODE_ISDECIMAL(*p))
6876	    return PyBool_FromLong(0);
6877    }
6878    return PyBool_FromLong(1);
6879}
6880
6881PyDoc_STRVAR(isdigit__doc__,
6882"S.isdigit() -> bool\n\
6883\n\
6884Return True if all characters in S are digits\n\
6885and there is at least one character in S, False otherwise.");
6886
6887static PyObject*
6888unicode_isdigit(PyUnicodeObject *self)
6889{
6890    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6891    register const Py_UNICODE *e;
6892
6893    /* Shortcut for single character strings */
6894    if (PyUnicode_GET_SIZE(self) == 1 &&
6895	Py_UNICODE_ISDIGIT(*p))
6896	return PyBool_FromLong(1);
6897
6898    /* Special case for empty strings */
6899    if (PyUnicode_GET_SIZE(self) == 0)
6900	return PyBool_FromLong(0);
6901
6902    e = p + PyUnicode_GET_SIZE(self);
6903    for (; p < e; p++) {
6904	if (!Py_UNICODE_ISDIGIT(*p))
6905	    return PyBool_FromLong(0);
6906    }
6907    return PyBool_FromLong(1);
6908}
6909
6910PyDoc_STRVAR(isnumeric__doc__,
6911"S.isnumeric() -> bool\n\
6912\n\
6913Return True if there are only numeric characters in S,\n\
6914False otherwise.");
6915
6916static PyObject*
6917unicode_isnumeric(PyUnicodeObject *self)
6918{
6919    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6920    register const Py_UNICODE *e;
6921
6922    /* Shortcut for single character strings */
6923    if (PyUnicode_GET_SIZE(self) == 1 &&
6924	Py_UNICODE_ISNUMERIC(*p))
6925	return PyBool_FromLong(1);
6926
6927    /* Special case for empty strings */
6928    if (PyUnicode_GET_SIZE(self) == 0)
6929	return PyBool_FromLong(0);
6930
6931    e = p + PyUnicode_GET_SIZE(self);
6932    for (; p < e; p++) {
6933	if (!Py_UNICODE_ISNUMERIC(*p))
6934	    return PyBool_FromLong(0);
6935    }
6936    return PyBool_FromLong(1);
6937}
6938
6939int
6940PyUnicode_IsIdentifier(PyObject *self)
6941{
6942    register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
6943    register const Py_UNICODE *e;
6944
6945    /* Special case for empty strings */
6946    if (PyUnicode_GET_SIZE(self) == 0)
6947	return 0;
6948
6949    /* PEP 3131 says that the first character must be in
6950       XID_Start and subsequent characters in XID_Continue,
6951       and for the ASCII range, the 2.x rules apply (i.e
6952       start with letters and underscore, continue with
6953       letters, digits, underscore). However, given the current
6954       definition of XID_Start and XID_Continue, it is sufficient
6955       to check just for these, except that _ must be allowed
6956       as starting an identifier.  */
6957    if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
6958        return 0;
6959
6960    e = p + PyUnicode_GET_SIZE(self);
6961    for (p++; p < e; p++) {
6962	if (!_PyUnicode_IsXidContinue(*p))
6963	    return 0;
6964    }
6965    return 1;
6966}
6967
6968PyDoc_STRVAR(isidentifier__doc__,
6969"S.isidentifier() -> bool\n\
6970\n\
6971Return True if S is a valid identifier according\n\
6972to the language definition.");
6973
6974static PyObject*
6975unicode_isidentifier(PyObject *self)
6976{
6977    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
6978}
6979
6980PyDoc_STRVAR(join__doc__,
6981"S.join(sequence) -> unicode\n\
6982\n\
6983Return a string which is the concatenation of the strings in the\n\
6984sequence.  The separator between elements is S.");
6985
6986static PyObject*
6987unicode_join(PyObject *self, PyObject *data)
6988{
6989    return PyUnicode_Join(self, data);
6990}
6991
6992static Py_ssize_t
6993unicode_length(PyUnicodeObject *self)
6994{
6995    return self->length;
6996}
6997
6998PyDoc_STRVAR(ljust__doc__,
6999"S.ljust(width[, fillchar]) -> int\n\
7000\n\
7001Return S left justified in a Unicode string of length width. Padding is\n\
7002done using the specified fill character (default is a space).");
7003
7004static PyObject *
7005unicode_ljust(PyUnicodeObject *self, PyObject *args)
7006{
7007    Py_ssize_t width;
7008    Py_UNICODE fillchar = ' ';
7009
7010    if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
7011        return NULL;
7012
7013    if (self->length >= width && PyUnicode_CheckExact(self)) {
7014        Py_INCREF(self);
7015        return (PyObject*) self;
7016    }
7017
7018    return (PyObject*) pad(self, 0, width - self->length, fillchar);
7019}
7020
7021PyDoc_STRVAR(lower__doc__,
7022"S.lower() -> unicode\n\
7023\n\
7024Return a copy of the string S converted to lowercase.");
7025
7026static PyObject*
7027unicode_lower(PyUnicodeObject *self)
7028{
7029    return fixup(self, fixlower);
7030}
7031
/* Selects which end(s) of the string a strip operation works on. */
#define LEFTSTRIP  0
#define RIGHTSTRIP 1
#define BOTHSTRIP  2

/* Argument-parsing format for each strip method, indexed by the
   selector values above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Bare method name: the format string with its leading "|O:" skipped. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
7040
7041/* externally visible for str.strip(unicode) */
7042PyObject *
7043_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7044{
7045	Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7046	Py_ssize_t len = PyUnicode_GET_SIZE(self);
7047	Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7048	Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7049	Py_ssize_t i, j;
7050
7051        BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7052
7053	i = 0;
7054	if (striptype != RIGHTSTRIP) {
7055            while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7056                i++;
7057            }
7058	}
7059
7060	j = len;
7061	if (striptype != LEFTSTRIP) {
7062            do {
7063                j--;
7064            } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7065            j++;
7066	}
7067
7068	if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7069            Py_INCREF(self);
7070            return (PyObject*)self;
7071	}
7072	else
7073            return PyUnicode_FromUnicode(s+i, j-i);
7074}
7075
7076
7077static PyObject *
7078do_strip(PyUnicodeObject *self, int striptype)
7079{
7080	Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7081	Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
7082
7083	i = 0;
7084	if (striptype != RIGHTSTRIP) {
7085		while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7086			i++;
7087		}
7088	}
7089
7090	j = len;
7091	if (striptype != LEFTSTRIP) {
7092		do {
7093			j--;
7094		} while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7095		j++;
7096	}
7097
7098	if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7099		Py_INCREF(self);
7100		return (PyObject*)self;
7101	}
7102	else
7103		return PyUnicode_FromUnicode(s+i, j-i);
7104}
7105
7106
7107static PyObject *
7108do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7109{
7110	PyObject *sep = NULL;
7111
7112	if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7113		return NULL;
7114
7115	if (sep != NULL && sep != Py_None) {
7116		if (PyUnicode_Check(sep))
7117			return _PyUnicode_XStrip(self, striptype, sep);
7118		else if (PyString_Check(sep)) {
7119			PyObject *res;
7120			sep = PyUnicode_FromObject(sep);
7121			if (sep==NULL)
7122				return NULL;
7123			res = _PyUnicode_XStrip(self, striptype, sep);
7124			Py_DECREF(sep);
7125			return res;
7126		}
7127		else {
7128			PyErr_Format(PyExc_TypeError,
7129				     "%s arg must be None, unicode or str",
7130				     STRIPNAME(striptype));
7131			return NULL;
7132		}
7133	}
7134
7135	return do_strip(self, striptype);
7136}
7137
7138
7139PyDoc_STRVAR(strip__doc__,
7140"S.strip([chars]) -> unicode\n\
7141\n\
7142Return a copy of the string S with leading and trailing\n\
7143whitespace removed.\n\
7144If chars is given and not None, remove characters in chars instead.\n\
7145If chars is a str, it will be converted to unicode before stripping");
7146
7147static PyObject *
7148unicode_strip(PyUnicodeObject *self, PyObject *args)
7149{
7150	if (PyTuple_GET_SIZE(args) == 0)
7151		return do_strip(self, BOTHSTRIP); /* Common case */
7152	else
7153		return do_argstrip(self, BOTHSTRIP, args);
7154}
7155
7156
7157PyDoc_STRVAR(lstrip__doc__,
7158"S.lstrip([chars]) -> unicode\n\
7159\n\
7160Return a copy of the string S with leading whitespace removed.\n\
7161If chars is given and not None, remove characters in chars instead.\n\
7162If chars is a str, it will be converted to unicode before stripping");
7163
7164static PyObject *
7165unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7166{
7167	if (PyTuple_GET_SIZE(args) == 0)
7168		return do_strip(self, LEFTSTRIP); /* Common case */
7169	else
7170		return do_argstrip(self, LEFTSTRIP, args);
7171}
7172
7173
7174PyDoc_STRVAR(rstrip__doc__,
7175"S.rstrip([chars]) -> unicode\n\
7176\n\
7177Return a copy of the string S with trailing whitespace removed.\n\
7178If chars is given and not None, remove characters in chars instead.\n\
7179If chars is a str, it will be converted to unicode before stripping");
7180
7181static PyObject *
7182unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7183{
7184	if (PyTuple_GET_SIZE(args) == 0)
7185		return do_strip(self, RIGHTSTRIP); /* Common case */
7186	else
7187		return do_argstrip(self, RIGHTSTRIP, args);
7188}
7189
7190
7191static PyObject*
7192unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
7193{
7194    PyUnicodeObject *u;
7195    Py_UNICODE *p;
7196    Py_ssize_t nchars;
7197    size_t nbytes;
7198
7199    if (len < 0)
7200        len = 0;
7201
7202    if (len == 1 && PyUnicode_CheckExact(str)) {
7203        /* no repeat, return original string */
7204        Py_INCREF(str);
7205        return (PyObject*) str;
7206    }
7207
7208    /* ensure # of chars needed doesn't overflow int and # of bytes
7209     * needed doesn't overflow size_t
7210     */
7211    nchars = len * str->length;
7212    if (len && nchars / len != str->length) {
7213        PyErr_SetString(PyExc_OverflowError,
7214                        "repeated string is too long");
7215        return NULL;
7216    }
7217    nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7218    if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7219        PyErr_SetString(PyExc_OverflowError,
7220                        "repeated string is too long");
7221        return NULL;
7222    }
7223    u = _PyUnicode_New(nchars);
7224    if (!u)
7225        return NULL;
7226
7227    p = u->str;
7228
7229    if (str->length == 1 && len > 0) {
7230        Py_UNICODE_FILL(p, str->str[0], len);
7231    } else {
7232	Py_ssize_t done = 0; /* number of characters copied this far */
7233	if (done < nchars) {
7234            Py_UNICODE_COPY(p, str->str, str->length);
7235            done = str->length;
7236	}
7237	while (done < nchars) {
7238            int n = (done <= nchars-done) ? done : nchars-done;
7239            Py_UNICODE_COPY(p+done, p, n);
7240            done += n;
7241	}
7242    }
7243
7244    return (PyObject*) u;
7245}
7246
7247PyObject *PyUnicode_Replace(PyObject *obj,
7248			    PyObject *subobj,
7249			    PyObject *replobj,
7250			    Py_ssize_t maxcount)
7251{
7252    PyObject *self;
7253    PyObject *str1;
7254    PyObject *str2;
7255    PyObject *result;
7256
7257    self = PyUnicode_FromObject(obj);
7258    if (self == NULL)
7259	return NULL;
7260    str1 = PyUnicode_FromObject(subobj);
7261    if (str1 == NULL) {
7262	Py_DECREF(self);
7263	return NULL;
7264    }
7265    str2 = PyUnicode_FromObject(replobj);
7266    if (str2 == NULL) {
7267	Py_DECREF(self);
7268	Py_DECREF(str1);
7269	return NULL;
7270    }
7271    result = replace((PyUnicodeObject *)self,
7272		     (PyUnicodeObject *)str1,
7273		     (PyUnicodeObject *)str2,
7274		     maxcount);
7275    Py_DECREF(self);
7276    Py_DECREF(str1);
7277    Py_DECREF(str2);
7278    return result;
7279}
7280
7281PyDoc_STRVAR(replace__doc__,
7282"S.replace (old, new[, maxsplit]) -> unicode\n\
7283\n\
7284Return a copy of S with all occurrences of substring\n\
7285old replaced by new.  If the optional argument maxsplit is\n\
7286given, only the first maxsplit occurrences are replaced.");
7287
7288static PyObject*
7289unicode_replace(PyUnicodeObject *self, PyObject *args)
7290{
7291    PyUnicodeObject *str1;
7292    PyUnicodeObject *str2;
7293    Py_ssize_t maxcount = -1;
7294    PyObject *result;
7295
7296    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
7297        return NULL;
7298    str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7299    if (str1 == NULL)
7300	return NULL;
7301    str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
7302    if (str2 == NULL) {
7303	Py_DECREF(str1);
7304	return NULL;
7305    }
7306
7307    result = replace(self, str1, str2, maxcount);
7308
7309    Py_DECREF(str1);
7310    Py_DECREF(str2);
7311    return result;
7312}
7313
7314static
7315PyObject *unicode_repr(PyObject *unicode)
7316{
7317    PyObject *repr;
7318    Py_UNICODE *p;
7319    Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7320    Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7321
7322    /* XXX(nnorwitz): rather than over-allocating, it would be
7323       better to choose a different scheme.  Perhaps scan the
7324       first N-chars of the string and allocate based on that size.
7325    */
7326    /* Initial allocation is based on the longest-possible unichr
7327       escape.
7328
7329       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7330       unichr, so in this case it's the longest unichr escape. In
7331       narrow (UTF-16) builds this is five chars per source unichr
7332       since there are two unichrs in the surrogate pair, so in narrow
7333       (UTF-16) builds it's not the longest unichr escape.
7334
7335       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7336       so in the narrow (UTF-16) build case it's the longest unichr
7337       escape.
7338    */
7339
7340    repr = PyUnicode_FromUnicode(NULL,
7341        2 /* quotes */
7342#ifdef Py_UNICODE_WIDE
7343        + 10*size
7344#else
7345        + 6*size
7346#endif
7347        + 1);
7348    if (repr == NULL)
7349        return NULL;
7350
7351    p = PyUnicode_AS_UNICODE(repr);
7352
7353    /* Add quote */
7354    *p++ = (findchar(s, size, '\'') &&
7355            !findchar(s, size, '"')) ? '"' : '\'';
7356    while (size-- > 0) {
7357        Py_UNICODE ch = *s++;
7358
7359        /* Escape quotes and backslashes */
7360        if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
7361            *p++ = '\\';
7362            *p++ = ch;
7363            continue;
7364        }
7365
7366#ifdef Py_UNICODE_WIDE
7367        /* Map 21-bit characters to '\U00xxxxxx' */
7368        else if (ch >= 0x10000) {
7369            *p++ = '\\';
7370            *p++ = 'U';
7371            *p++ = hexdigits[(ch >> 28) & 0x0000000F];
7372            *p++ = hexdigits[(ch >> 24) & 0x0000000F];
7373            *p++ = hexdigits[(ch >> 20) & 0x0000000F];
7374            *p++ = hexdigits[(ch >> 16) & 0x0000000F];
7375            *p++ = hexdigits[(ch >> 12) & 0x0000000F];
7376            *p++ = hexdigits[(ch >> 8) & 0x0000000F];
7377            *p++ = hexdigits[(ch >> 4) & 0x0000000F];
7378            *p++ = hexdigits[ch & 0x0000000F];
7379	    continue;
7380        }
7381#else
7382	/* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
7383	else if (ch >= 0xD800 && ch < 0xDC00) {
7384	    Py_UNICODE ch2;
7385	    Py_UCS4 ucs;
7386
7387	    ch2 = *s++;
7388	    size--;
7389	    if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7390		ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7391		*p++ = '\\';
7392		*p++ = 'U';
7393		*p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7394		*p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7395		*p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7396		*p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7397		*p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7398		*p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7399		*p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7400		*p++ = hexdigits[ucs & 0x0000000F];
7401		continue;
7402	    }
7403	    /* Fall through: isolated surrogates are copied as-is */
7404	    s--;
7405	    size++;
7406	}
7407#endif
7408
7409        /* Map 16-bit characters to '\uxxxx' */
7410        if (ch >= 256) {
7411            *p++ = '\\';
7412            *p++ = 'u';
7413            *p++ = hexdigits[(ch >> 12) & 0x000F];
7414            *p++ = hexdigits[(ch >> 8) & 0x000F];
7415            *p++ = hexdigits[(ch >> 4) & 0x000F];
7416            *p++ = hexdigits[ch & 0x000F];
7417        }
7418
7419        /* Map special whitespace to '\t', \n', '\r' */
7420        else if (ch == '\t') {
7421            *p++ = '\\';
7422            *p++ = 't';
7423        }
7424        else if (ch == '\n') {
7425            *p++ = '\\';
7426            *p++ = 'n';
7427        }
7428        else if (ch == '\r') {
7429            *p++ = '\\';
7430            *p++ = 'r';
7431        }
7432
7433        /* Map non-printable US ASCII to '\xhh' */
7434        else if (ch < ' ' || ch >= 0x7F) {
7435            *p++ = '\\';
7436            *p++ = 'x';
7437            *p++ = hexdigits[(ch >> 4) & 0x000F];
7438            *p++ = hexdigits[ch & 0x000F];
7439        }
7440
7441        /* Copy everything else as-is */
7442        else
7443            *p++ = (char) ch;
7444    }
7445    /* Add quote */
7446    *p++ = PyUnicode_AS_UNICODE(repr)[0];
7447
7448    *p = '\0';
7449    _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
7450    return repr;
7451}
7452
7453PyDoc_STRVAR(rfind__doc__,
7454"S.rfind(sub [,start [,end]]) -> int\n\
7455\n\
7456Return the highest index in S where substring sub is found,\n\
7457such that sub is contained within s[start:end].  Optional\n\
7458arguments start and end are interpreted as in slice notation.\n\
7459\n\
7460Return -1 on failure.");
7461
7462static PyObject *
7463unicode_rfind(PyUnicodeObject *self, PyObject *args)
7464{
7465    PyObject *substring;
7466    Py_ssize_t start = 0;
7467    Py_ssize_t end = PY_SSIZE_T_MAX;
7468    Py_ssize_t result;
7469
7470    if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
7471		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7472        return NULL;
7473    substring = PyUnicode_FromObject(substring);
7474    if (!substring)
7475	return NULL;
7476
7477    result = stringlib_rfind_slice(
7478        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7479        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7480        start, end
7481        );
7482
7483    Py_DECREF(substring);
7484
7485    return PyInt_FromSsize_t(result);
7486}
7487
7488PyDoc_STRVAR(rindex__doc__,
7489"S.rindex(sub [,start [,end]]) -> int\n\
7490\n\
7491Like S.rfind() but raise ValueError when the substring is not found.");
7492
7493static PyObject *
7494unicode_rindex(PyUnicodeObject *self, PyObject *args)
7495{
7496    PyObject *substring;
7497    Py_ssize_t start = 0;
7498    Py_ssize_t end = PY_SSIZE_T_MAX;
7499    Py_ssize_t result;
7500
7501    if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
7502		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7503        return NULL;
7504    substring = PyUnicode_FromObject(substring);
7505    if (!substring)
7506	return NULL;
7507
7508    result = stringlib_rfind_slice(
7509        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7510        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7511        start, end
7512        );
7513
7514    Py_DECREF(substring);
7515
7516    if (result < 0) {
7517        PyErr_SetString(PyExc_ValueError, "substring not found");
7518        return NULL;
7519    }
7520    return PyInt_FromSsize_t(result);
7521}
7522
7523PyDoc_STRVAR(rjust__doc__,
7524"S.rjust(width[, fillchar]) -> unicode\n\
7525\n\
7526Return S right justified in a Unicode string of length width. Padding is\n\
7527done using the specified fill character (default is a space).");
7528
7529static PyObject *
7530unicode_rjust(PyUnicodeObject *self, PyObject *args)
7531{
7532    Py_ssize_t width;
7533    Py_UNICODE fillchar = ' ';
7534
7535    if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
7536        return NULL;
7537
7538    if (self->length >= width && PyUnicode_CheckExact(self)) {
7539        Py_INCREF(self);
7540        return (PyObject*) self;
7541    }
7542
7543    return (PyObject*) pad(self, width - self->length, 0, fillchar);
7544}
7545
7546PyObject *PyUnicode_Split(PyObject *s,
7547			  PyObject *sep,
7548			  Py_ssize_t maxsplit)
7549{
7550    PyObject *result;
7551
7552    s = PyUnicode_FromObject(s);
7553    if (s == NULL)
7554	return NULL;
7555    if (sep != NULL) {
7556	sep = PyUnicode_FromObject(sep);
7557	if (sep == NULL) {
7558	    Py_DECREF(s);
7559	    return NULL;
7560	}
7561    }
7562
7563    result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7564
7565    Py_DECREF(s);
7566    Py_XDECREF(sep);
7567    return result;
7568}
7569
7570PyDoc_STRVAR(split__doc__,
7571"S.split([sep [,maxsplit]]) -> list of strings\n\
7572\n\
7573Return a list of the words in S, using sep as the\n\
7574delimiter string.  If maxsplit is given, at most maxsplit\n\
7575splits are done. If sep is not specified or is None,\n\
7576any whitespace string is a separator.");
7577
7578static PyObject*
7579unicode_split(PyUnicodeObject *self, PyObject *args)
7580{
7581    PyObject *substring = Py_None;
7582    Py_ssize_t maxcount = -1;
7583
7584    if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
7585        return NULL;
7586
7587    if (substring == Py_None)
7588	return split(self, NULL, maxcount);
7589    else if (PyUnicode_Check(substring))
7590	return split(self, (PyUnicodeObject *)substring, maxcount);
7591    else
7592	return PyUnicode_Split((PyObject *)self, substring, maxcount);
7593}
7594
7595PyObject *
7596PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7597{
7598    PyObject* str_obj;
7599    PyObject* sep_obj;
7600    PyObject* out;
7601
7602    str_obj = PyUnicode_FromObject(str_in);
7603    if (!str_obj)
7604	return NULL;
7605    sep_obj = PyUnicode_FromObject(sep_in);
7606    if (!sep_obj) {
7607        Py_DECREF(str_obj);
7608        return NULL;
7609    }
7610
7611    out = stringlib_partition(
7612        str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7613        sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7614        );
7615
7616    Py_DECREF(sep_obj);
7617    Py_DECREF(str_obj);
7618
7619    return out;
7620}
7621
7622
7623PyObject *
7624PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7625{
7626    PyObject* str_obj;
7627    PyObject* sep_obj;
7628    PyObject* out;
7629
7630    str_obj = PyUnicode_FromObject(str_in);
7631    if (!str_obj)
7632	return NULL;
7633    sep_obj = PyUnicode_FromObject(sep_in);
7634    if (!sep_obj) {
7635        Py_DECREF(str_obj);
7636        return NULL;
7637    }
7638
7639    out = stringlib_rpartition(
7640        str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7641        sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7642        );
7643
7644    Py_DECREF(sep_obj);
7645    Py_DECREF(str_obj);
7646
7647    return out;
7648}
7649
PyDoc_STRVAR(partition__doc__,
"S.partition(sep) -> (head, sep, tail)\n\
\n\
Searches for the separator sep in S, and returns the part before it,\n\
the separator itself, and the part after it.  If the separator is not\n\
found, returns S and two empty strings.");

/* Method wrapper for S.partition(sep): forwards self and the single
   separator argument to PyUnicode_Partition(). */
static PyObject*
unicode_partition(PyUnicodeObject *self, PyObject *separator)
{
    return PyUnicode_Partition((PyObject *)self, separator);
}
7662
7663PyDoc_STRVAR(rpartition__doc__,
7664"S.rpartition(sep) -> (tail, sep, head)\n\
7665\n\
7666Searches for the separator sep in S, starting at the end of S, and returns\n\
7667the part before it, the separator itself, and the part after it.  If the\n\
7668separator is not found, returns two empty strings and S.");
7669
7670static PyObject*
7671unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7672{
7673    return PyUnicode_RPartition((PyObject *)self, separator);
7674}
7675
7676PyObject *PyUnicode_RSplit(PyObject *s,
7677			   PyObject *sep,
7678			   Py_ssize_t maxsplit)
7679{
7680    PyObject *result;
7681
7682    s = PyUnicode_FromObject(s);
7683    if (s == NULL)
7684	return NULL;
7685    if (sep != NULL) {
7686	sep = PyUnicode_FromObject(sep);
7687	if (sep == NULL) {
7688	    Py_DECREF(s);
7689	    return NULL;
7690	}
7691    }
7692
7693    result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7694
7695    Py_DECREF(s);
7696    Py_XDECREF(sep);
7697    return result;
7698}
7699
7700PyDoc_STRVAR(rsplit__doc__,
7701"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7702\n\
7703Return a list of the words in S, using sep as the\n\
7704delimiter string, starting at the end of the string and\n\
7705working to the front.  If maxsplit is given, at most maxsplit\n\
7706splits are done. If sep is not specified, any whitespace string\n\
7707is a separator.");
7708
7709static PyObject*
7710unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7711{
7712    PyObject *substring = Py_None;
7713    Py_ssize_t maxcount = -1;
7714
7715    if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
7716        return NULL;
7717
7718    if (substring == Py_None)
7719	return rsplit(self, NULL, maxcount);
7720    else if (PyUnicode_Check(substring))
7721	return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7722    else
7723	return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7724}
7725
7726PyDoc_STRVAR(splitlines__doc__,
7727"S.splitlines([keepends]]) -> list of strings\n\
7728\n\
7729Return a list of the lines in S, breaking at line boundaries.\n\
7730Line breaks are not included in the resulting list unless keepends\n\
7731is given and true.");
7732
7733static PyObject*
7734unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7735{
7736    int keepends = 0;
7737
7738    if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
7739        return NULL;
7740
7741    return PyUnicode_Splitlines((PyObject *)self, keepends);
7742}
7743
7744static
7745PyObject *unicode_str(PyObject *self)
7746{
7747    if (PyUnicode_CheckExact(self)) {
7748        Py_INCREF(self);
7749        return self;
7750    } else
7751        /* Subtype -- return genuine unicode string with the same value. */
7752        return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7753                                     PyUnicode_GET_SIZE(self));
7754}
7755
PyDoc_STRVAR(swapcase__doc__,
"S.swapcase() -> unicode\n\
\n\
Return a copy of S with uppercase characters converted to lowercase\n\
and vice versa.");

/* Method wrapper for S.swapcase(): delegates to fixup() with the
   fixswapcase transformation. */
static PyObject*
unicode_swapcase(PyUnicodeObject *self)
{
    return fixup(self, fixswapcase);
}
7767
PyDoc_STRVAR(translate__doc__,
"S.translate(table) -> unicode\n\
\n\
Return a copy of the string S, where all characters have been mapped\n\
through the given translation table, which must be a mapping of\n\
Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
Unmapped characters are left untouched. Characters mapped to None\n\
are deleted.");

/* Method wrapper for S.translate(table): hands the raw buffer and
   length of self to PyUnicode_TranslateCharmap(), always passing
   "ignore" as the error-handling argument. */
static PyObject*
unicode_translate(PyUnicodeObject *self, PyObject *table)
{
    return PyUnicode_TranslateCharmap(self->str,
				      self->length,
				      table,
				      "ignore");
}
7785
PyDoc_STRVAR(upper__doc__,
"S.upper() -> unicode\n\
\n\
Return a copy of S converted to uppercase.");

/* Method wrapper for S.upper(): delegates to fixup() with the
   fixupper transformation. */
static PyObject*
unicode_upper(PyUnicodeObject *self)
{
    return fixup(self, fixupper);
}
7796
7797PyDoc_STRVAR(zfill__doc__,
7798"S.zfill(width) -> unicode\n\
7799\n\
7800Pad a numeric string x with zeros on the left, to fill a field\n\
7801of the specified width. The string x is never truncated.");
7802
7803static PyObject *
7804unicode_zfill(PyUnicodeObject *self, PyObject *args)
7805{
7806    Py_ssize_t fill;
7807    PyUnicodeObject *u;
7808
7809    Py_ssize_t width;
7810    if (!PyArg_ParseTuple(args, "n:zfill", &width))
7811        return NULL;
7812
7813    if (self->length >= width) {
7814        if (PyUnicode_CheckExact(self)) {
7815            Py_INCREF(self);
7816            return (PyObject*) self;
7817        }
7818        else
7819            return PyUnicode_FromUnicode(
7820                PyUnicode_AS_UNICODE(self),
7821                PyUnicode_GET_SIZE(self)
7822            );
7823    }
7824
7825    fill = width - self->length;
7826
7827    u = pad(self, fill, 0, '0');
7828
7829    if (u == NULL)
7830        return NULL;
7831
7832    if (u->str[fill] == '+' || u->str[fill] == '-') {
7833        /* move sign to beginning of string */
7834        u->str[0] = u->str[fill];
7835        u->str[fill] = '0';
7836    }
7837
7838    return (PyObject*) u;
7839}
7840
#if 0
/* Compiled out: debugging helper that returns unicode_freelist_size
   as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyInt_FromLong(unicode_freelist_size);
}
#endif
7848
7849PyDoc_STRVAR(startswith__doc__,
7850"S.startswith(prefix[, start[, end]]) -> bool\n\
7851\n\
7852Return True if S starts with the specified prefix, False otherwise.\n\
7853With optional start, test S beginning at that position.\n\
7854With optional end, stop comparing S at that position.\n\
7855prefix can also be a tuple of strings to try.");
7856
7857static PyObject *
7858unicode_startswith(PyUnicodeObject *self,
7859		   PyObject *args)
7860{
7861    PyObject *subobj;
7862    PyUnicodeObject *substring;
7863    Py_ssize_t start = 0;
7864    Py_ssize_t end = PY_SSIZE_T_MAX;
7865    int result;
7866
7867    if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
7868		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7869	return NULL;
7870    if (PyTuple_Check(subobj)) {
7871        Py_ssize_t i;
7872        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7873            substring = (PyUnicodeObject *)PyUnicode_FromObject(
7874                            PyTuple_GET_ITEM(subobj, i));
7875            if (substring == NULL)
7876                return NULL;
7877            result = tailmatch(self, substring, start, end, -1);
7878            Py_DECREF(substring);
7879            if (result) {
7880                Py_RETURN_TRUE;
7881            }
7882        }
7883        /* nothing matched */
7884        Py_RETURN_FALSE;
7885    }
7886    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7887    if (substring == NULL)
7888         return NULL;
7889    result = tailmatch(self, substring, start, end, -1);
7890    Py_DECREF(substring);
7891    return PyBool_FromLong(result);
7892}
7893
7894
7895PyDoc_STRVAR(endswith__doc__,
7896"S.endswith(suffix[, start[, end]]) -> bool\n\
7897\n\
7898Return True if S ends with the specified suffix, False otherwise.\n\
7899With optional start, test S beginning at that position.\n\
7900With optional end, stop comparing S at that position.\n\
7901suffix can also be a tuple of strings to try.");
7902
7903static PyObject *
7904unicode_endswith(PyUnicodeObject *self,
7905		 PyObject *args)
7906{
7907    PyObject *subobj;
7908    PyUnicodeObject *substring;
7909    Py_ssize_t start = 0;
7910    Py_ssize_t end = PY_SSIZE_T_MAX;
7911    int result;
7912
7913    if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7914        _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7915	return NULL;
7916    if (PyTuple_Check(subobj)) {
7917        Py_ssize_t i;
7918        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7919            substring = (PyUnicodeObject *)PyUnicode_FromObject(
7920                            PyTuple_GET_ITEM(subobj, i));
7921            if (substring == NULL)
7922            return NULL;
7923            result = tailmatch(self, substring, start, end, +1);
7924            Py_DECREF(substring);
7925            if (result) {
7926                Py_RETURN_TRUE;
7927            }
7928        }
7929        Py_RETURN_FALSE;
7930    }
7931    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7932    if (substring == NULL)
7933    return NULL;
7934
7935    result = tailmatch(self, substring, start, end, +1);
7936    Py_DECREF(substring);
7937    return PyBool_FromLong(result);
7938}
7939
7940#include "stringlib/string_format.h"
7941
/* Docstrings for the "format" and "__format__" entries of
   unicode_methods.  Only the signature line is provided; the
   description part is currently empty. */
PyDoc_STRVAR(format__doc__,
"S.format(*args, **kwargs) -> unicode\n\
\n\
");

PyDoc_STRVAR(p_format__doc__,
"S.__format__(format_spec) -> unicode\n\
\n\
");
7951
/* __getnewargs__ implementation: returns a one-element tuple built
   with the "(u#)" format from the object's buffer and length. */
static PyObject *
unicode_getnewargs(PyUnicodeObject *v)
{
	return Py_BuildValue("(u#)", v->str, v->length);
}
7957
7958
/* Method table of the unicode type.  Each entry gives the Python-level
   name, the C implementation, the calling-convention flags and, where
   one exists, the docstring; entries with only three initializers have
   no docstring.  The table ends with a {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"__format__", (PyCFunction) unicode_unicode__format__, METH_VARARGS, p_format__doc__},
    /* Underscore-prefixed helpers implemented by the code included
       from stringlib/string_format.h; registered without docstrings. */
    {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
    {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
#endif

    {"__getnewargs__",	(PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
8019
8020static PyObject *
8021unicode_mod(PyObject *v, PyObject *w)
8022{
8023       if (!PyUnicode_Check(v)) {
8024               Py_INCREF(Py_NotImplemented);
8025               return Py_NotImplemented;
8026       }
8027       return PyUnicode_Format(v, w);
8028}
8029
/* Number protocol table for unicode objects: only the nb_remainder
   slot is filled in (with unicode_mod); the slots listed before it are
   0 and the remaining ones are left to default initialization. */
static PyNumberMethods unicode_as_number = {
	0,				/*nb_add*/
	0,				/*nb_subtract*/
	0,				/*nb_multiply*/
	unicode_mod,			/*nb_remainder*/
};
8036
/* Sequence protocol table for unicode objects.  Length, concatenation,
   repetition, item access and containment are provided; the slice and
   assignment slots are 0. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length, 		/* sq_length */
    PyUnicode_Concat,		 	/* sq_concat */
    (ssizeargfunc) unicode_repeat, 	/* sq_repeat */
    (ssizeargfunc) unicode_getitem, 	/* sq_item */
    0,				 	/* sq_slice */
    0, 					/* sq_ass_item */
    0, 					/* sq_ass_slice */
    PyUnicode_Contains, 		/* sq_contains */
};
8047
8048static PyObject*
8049unicode_subscript(PyUnicodeObject* self, PyObject* item)
8050{
8051    if (PyIndex_Check(item)) {
8052        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
8053        if (i == -1 && PyErr_Occurred())
8054            return NULL;
8055        if (i < 0)
8056            i += PyUnicode_GET_SIZE(self);
8057        return unicode_getitem(self, i);
8058    } else if (PySlice_Check(item)) {
8059        Py_ssize_t start, stop, step, slicelength, cur, i;
8060        Py_UNICODE* source_buf;
8061        Py_UNICODE* result_buf;
8062        PyObject* result;
8063
8064        if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
8065				 &start, &stop, &step, &slicelength) < 0) {
8066            return NULL;
8067        }
8068
8069        if (slicelength <= 0) {
8070            return PyUnicode_FromUnicode(NULL, 0);
8071        } else if (start == 0 && step == 1 && slicelength == self->length &&
8072                   PyUnicode_CheckExact(self)) {
8073            Py_INCREF(self);
8074            return (PyObject *)self;
8075        } else if (step == 1) {
8076            return PyUnicode_FromUnicode(self->str + start, slicelength);
8077        } else {
8078            source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
8079            result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
8080                                                    sizeof(Py_UNICODE));
8081
8082	    if (result_buf == NULL)
8083		    return PyErr_NoMemory();
8084
8085            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8086                result_buf[i] = source_buf[cur];
8087            }
8088
8089            result = PyUnicode_FromUnicode(result_buf, slicelength);
8090            PyMem_FREE(result_buf);
8091            return result;
8092        }
8093    } else {
8094        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8095        return NULL;
8096    }
8097}
8098
/* Mapping protocol table for unicode objects: length and subscript
   are provided; subscript assignment is not (slot is 0). */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,		/* mp_length */
    (binaryfunc)unicode_subscript,	/* mp_subscript */
    (objobjargproc)0,			/* mp_ass_subscript */
};
8104
8105
/* Buffer-protocol "get buffer" hook for unicode objects.

   Requests carrying PyBUF_CHARACTER are rejected with SystemError and
   -1.  Otherwise the view is filled by PyBuffer_FillInfo() with the
   object's internal Py_UNICODE buffer and its size in bytes, and that
   call's result is returned. */
static int
unicode_buffer_getbuffer(PyUnicodeObject *self, PyBuffer *view, int flags)
{

    if (flags & PyBUF_CHARACTER) {
        PyErr_SetString(PyExc_SystemError, "can't use str as char buffer");
        return -1;
    }
    /* NOTE(review): the literal 1 is presumably the "read-only"
       argument -- confirm against PyBuffer_FillInfo()'s signature. */
    return PyBuffer_FillInfo(view, (void *)self->str,
                             PyUnicode_GET_DATA_SIZE(self), 1, flags);
}
8117
8118
8119/* Helpers for PyUnicode_Format() */
8120
8121static PyObject *
8122getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
8123{
8124    Py_ssize_t argidx = *p_argidx;
8125    if (argidx < arglen) {
8126	(*p_argidx)++;
8127	if (arglen < 0)
8128	    return args;
8129	else
8130	    return PyTuple_GetItem(args, argidx);
8131    }
8132    PyErr_SetString(PyExc_TypeError,
8133		    "not enough arguments for format string");
8134    return NULL;
8135}
8136
/* Bit flags passed as the 'flags' argument of the format helpers
   below.  Of these, only F_ALT is tested in this part of the file
   (formatfloat() and formatint() use it to select the '#' form). */
#define F_LJUST (1<<0)
#define F_SIGN	(1<<1)
#define F_BLANK (1<<2)
#define F_ALT	(1<<3)
#define F_ZERO	(1<<4)
8142
8143static Py_ssize_t
8144strtounicode(Py_UNICODE *buffer, const char *charbuffer)
8145{
8146    register Py_ssize_t i;
8147    Py_ssize_t len = strlen(charbuffer);
8148    for (i = len - 1; i >= 0; i--)
8149	buffer[i] = (Py_UNICODE) charbuffer[i];
8150
8151    return len;
8152}
8153
8154static int
8155doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8156{
8157    Py_ssize_t result;
8158
8159    PyOS_ascii_formatd((char *)buffer, len, format, x);
8160    result = strtounicode(buffer, (char *)buffer);
8161    return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8162}
8163
8164static int
8165longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8166{
8167    Py_ssize_t result;
8168
8169    PyOS_snprintf((char *)buffer, len, format, x);
8170    result = strtounicode(buffer, (char *)buffer);
8171    return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8172}
8173
8174/* XXX To save some code duplication, formatfloat/long/int could have been
8175   shared with stringobject.c, converting from 8-bit to Unicode after the
8176   formatting is done. */
8177
8178static int
8179formatfloat(Py_UNICODE *buf,
8180	    size_t buflen,
8181	    int flags,
8182	    int prec,
8183	    int type,
8184	    PyObject *v)
8185{
8186    /* fmt = '%#.' + `prec` + `type`
8187       worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
8188    char fmt[20];
8189    double x;
8190
8191    x = PyFloat_AsDouble(v);
8192    if (x == -1.0 && PyErr_Occurred())
8193	return -1;
8194    if (prec < 0)
8195	prec = 6;
8196    if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8197	type = 'g';
8198    /* Worst case length calc to ensure no buffer overrun:
8199
8200       'g' formats:
8201	 fmt = %#.<prec>g
8202	 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8203	    for any double rep.)
8204	 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8205
8206       'f' formats:
8207	 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8208	 len = 1 + 50 + 1 + prec = 52 + prec
8209
8210       If prec=0 the effective precision is 1 (the leading digit is
8211       always given), therefore increase the length by one.
8212
8213    */
8214    if (((type == 'g' || type == 'G') &&
8215          buflen <= (size_t)10 + (size_t)prec) ||
8216	(type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
8217	PyErr_SetString(PyExc_OverflowError,
8218			"formatted float is too long (precision too large?)");
8219	return -1;
8220    }
8221    PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8222		  (flags&F_ALT) ? "#" : "",
8223		  prec, type);
8224    return doubletounicode(buf, buflen, fmt, x);
8225}
8226
8227static PyObject*
8228formatlong(PyObject *val, int flags, int prec, int type)
8229{
8230	char *buf;
8231	int len;
8232	PyObject *str; /* temporary string object. */
8233	PyObject *result;
8234
8235	str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8236	if (!str)
8237		return NULL;
8238	result = PyUnicode_FromStringAndSize(buf, len);
8239	Py_DECREF(str);
8240	return result;
8241}
8242
8243static int
8244formatint(Py_UNICODE *buf,
8245	  size_t buflen,
8246	  int flags,
8247	  int prec,
8248	  int type,
8249	  PyObject *v)
8250{
8251    /* fmt = '%#.' + `prec` + 'l' + `type`
8252     * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8253     *                     + 1 + 1
8254     *                   = 24
8255     */
8256    char fmt[64]; /* plenty big enough! */
8257    char *sign;
8258    long x;
8259
8260    x = PyInt_AsLong(v);
8261    if (x == -1 && PyErr_Occurred())
8262        return -1;
8263    if (x < 0 && type == 'u') {
8264        type = 'd';
8265    }
8266    if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8267        sign = "-";
8268    else
8269        sign = "";
8270    if (prec < 0)
8271        prec = 1;
8272
8273    /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8274     * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
8275     */
8276    if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
8277        PyErr_SetString(PyExc_OverflowError,
8278    	        "formatted integer is too long (precision too large?)");
8279        return -1;
8280    }
8281
8282    if ((flags & F_ALT) &&
8283        (type == 'x' || type == 'X' || type == 'o')) {
8284        /* When converting under %#o, %#x or %#X, there are a number
8285         * of issues that cause pain:
8286	 * - for %#o, we want a different base marker than C
8287         * - when 0 is being converted, the C standard leaves off
8288         *   the '0x' or '0X', which is inconsistent with other
8289         *   %#x/%#X conversions and inconsistent with Python's
8290         *   hex() function
8291         * - there are platforms that violate the standard and
8292         *   convert 0 with the '0x' or '0X'
8293         *   (Metrowerks, Compaq Tru64)
8294         * - there are platforms that give '0x' when converting
8295         *   under %#X, but convert 0 in accordance with the
8296         *   standard (OS/2 EMX)
8297         *
8298         * We can achieve the desired consistency by inserting our
8299         * own '0x' or '0X' prefix, and substituting %x/%X in place
8300         * of %#x/%#X.
8301         *
8302         * Note that this is the same approach as used in
8303         * formatint() in stringobject.c
8304         */
8305        PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8306                      sign, type, prec, type);
8307    }
8308    else {
8309        PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8310                      sign, (flags&F_ALT) ? "#" : "",
8311                      prec, type);
8312    }
8313    if (sign[0])
8314        return longtounicode(buf, buflen, fmt, -x);
8315    else
8316        return longtounicode(buf, buflen, fmt, x);
8317}
8318
8319static int
8320formatchar(Py_UNICODE *buf,
8321           size_t buflen,
8322           PyObject *v)
8323{
8324    /* presume that the buffer is at least 2 characters long */
8325    if (PyUnicode_Check(v)) {
8326	if (PyUnicode_GET_SIZE(v) != 1)
8327	    goto onError;
8328	buf[0] = PyUnicode_AS_UNICODE(v)[0];
8329    }
8330
8331    else if (PyString_Check(v)) {
8332	if (PyString_GET_SIZE(v) != 1)
8333	    goto onError;
8334	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8335    }
8336
8337    else {
8338	/* Integer input truncated to a character */
8339        long x;
8340	x = PyInt_AsLong(v);
8341	if (x == -1 && PyErr_Occurred())
8342	    goto onError;
8343#ifdef Py_UNICODE_WIDE
8344	if (x < 0 || x > 0x10ffff) {
8345	    PyErr_SetString(PyExc_OverflowError,
8346			    "%c arg not in range(0x110000) "
8347			    "(wide Python build)");
8348	    return -1;
8349	}
8350#else
8351	if (x < 0 || x > 0xffff) {
8352	    PyErr_SetString(PyExc_OverflowError,
8353			    "%c arg not in range(0x10000) "
8354			    "(narrow Python build)");
8355	    return -1;
8356	}
8357#endif
8358	buf[0] = (Py_UNICODE) x;
8359    }
8360    buf[1] = '\0';
8361    return 1;
8362
8363 onError:
8364    PyErr_SetString(PyExc_TypeError,
8365		    "%c requires int or char");
8366    return -1;
8367}
8368
8369/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8370
8371   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
8372   chars are formatted. XXX This is a magic number. Each formatting
8373   routine does bounds checking to ensure no overflow, but a better
8374   solution may be to malloc a buffer of appropriate size for each
8375   format. For now, the current solution is sufficient.
8376*/
8377#define FORMATBUFLEN (size_t)120
8378
8379PyObject *PyUnicode_Format(PyObject *format,
8380			   PyObject *args)
8381{
8382    Py_UNICODE *fmt, *res;
8383    Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
8384    int args_owned = 0;
8385    PyUnicodeObject *result = NULL;
8386    PyObject *dict = NULL;
8387    PyObject *uformat;
8388
8389    if (format == NULL || args == NULL) {
8390	PyErr_BadInternalCall();
8391	return NULL;
8392    }
8393    uformat = PyUnicode_FromObject(format);
8394    if (uformat == NULL)
8395	return NULL;
8396    fmt = PyUnicode_AS_UNICODE(uformat);
8397    fmtcnt = PyUnicode_GET_SIZE(uformat);
8398
8399    reslen = rescnt = fmtcnt + 100;
8400    result = _PyUnicode_New(reslen);
8401    if (result == NULL)
8402	goto onError;
8403    res = PyUnicode_AS_UNICODE(result);
8404
8405    if (PyTuple_Check(args)) {
8406	arglen = PyTuple_Size(args);
8407	argidx = 0;
8408    }
8409    else {
8410	arglen = -1;
8411	argidx = -2;
8412    }
8413    if (Py_Type(args)->tp_as_mapping && !PyTuple_Check(args) &&
8414        !PyObject_TypeCheck(args, &PyBaseString_Type))
8415	dict = args;
8416
8417    while (--fmtcnt >= 0) {
8418	if (*fmt != '%') {
8419	    if (--rescnt < 0) {
8420		rescnt = fmtcnt + 100;
8421		reslen += rescnt;
8422		if (_PyUnicode_Resize(&result, reslen) < 0)
8423		    goto onError;
8424		res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8425		--rescnt;
8426	    }
8427	    *res++ = *fmt++;
8428	}
8429	else {
8430	    /* Got a format specifier */
8431	    int flags = 0;
8432	    Py_ssize_t width = -1;
8433	    int prec = -1;
8434	    Py_UNICODE c = '\0';
8435	    Py_UNICODE fill;
8436	    PyObject *v = NULL;
8437	    PyObject *temp = NULL;
8438	    Py_UNICODE *pbuf;
8439	    Py_UNICODE sign;
8440	    Py_ssize_t len;
8441	    Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
8442
8443	    fmt++;
8444	    if (*fmt == '(') {
8445		Py_UNICODE *keystart;
8446		Py_ssize_t keylen;
8447		PyObject *key;
8448		int pcount = 1;
8449
8450		if (dict == NULL) {
8451		    PyErr_SetString(PyExc_TypeError,
8452				    "format requires a mapping");
8453		    goto onError;
8454		}
8455		++fmt;
8456		--fmtcnt;
8457		keystart = fmt;
8458		/* Skip over balanced parentheses */
8459		while (pcount > 0 && --fmtcnt >= 0) {
8460		    if (*fmt == ')')
8461			--pcount;
8462		    else if (*fmt == '(')
8463			++pcount;
8464		    fmt++;
8465		}
8466		keylen = fmt - keystart - 1;
8467		if (fmtcnt < 0 || pcount > 0) {
8468		    PyErr_SetString(PyExc_ValueError,
8469				    "incomplete format key");
8470		    goto onError;
8471		}
8472#if 0
8473		/* keys are converted to strings using UTF-8 and
8474		   then looked up since Python uses strings to hold
8475		   variables names etc. in its namespaces and we
8476		   wouldn't want to break common idioms. */
8477		key = PyUnicode_EncodeUTF8(keystart,
8478					   keylen,
8479					   NULL);
8480#else
8481		key = PyUnicode_FromUnicode(keystart, keylen);
8482#endif
8483		if (key == NULL)
8484		    goto onError;
8485		if (args_owned) {
8486		    Py_DECREF(args);
8487		    args_owned = 0;
8488		}
8489		args = PyObject_GetItem(dict, key);
8490		Py_DECREF(key);
8491		if (args == NULL) {
8492		    goto onError;
8493		}
8494		args_owned = 1;
8495		arglen = -1;
8496		argidx = -2;
8497	    }
8498	    while (--fmtcnt >= 0) {
8499		switch (c = *fmt++) {
8500		case '-': flags |= F_LJUST; continue;
8501		case '+': flags |= F_SIGN; continue;
8502		case ' ': flags |= F_BLANK; continue;
8503		case '#': flags |= F_ALT; continue;
8504		case '0': flags |= F_ZERO; continue;
8505		}
8506		break;
8507	    }
8508	    if (c == '*') {
8509		v = getnextarg(args, arglen, &argidx);
8510		if (v == NULL)
8511		    goto onError;
8512		if (!PyInt_Check(v)) {
8513		    PyErr_SetString(PyExc_TypeError,
8514				    "* wants int");
8515		    goto onError;
8516		}
8517		width = PyInt_AsLong(v);
8518		if (width == -1 && PyErr_Occurred())
8519			goto onError;
8520		if (width < 0) {
8521		    flags |= F_LJUST;
8522		    width = -width;
8523		}
8524		if (--fmtcnt >= 0)
8525		    c = *fmt++;
8526	    }
8527	    else if (c >= '0' && c <= '9') {
8528		width = c - '0';
8529		while (--fmtcnt >= 0) {
8530		    c = *fmt++;
8531		    if (c < '0' || c > '9')
8532			break;
8533		    if ((width*10) / 10 != width) {
8534			PyErr_SetString(PyExc_ValueError,
8535					"width too big");
8536			goto onError;
8537		    }
8538		    width = width*10 + (c - '0');
8539		}
8540	    }
8541	    if (c == '.') {
8542		prec = 0;
8543		if (--fmtcnt >= 0)
8544		    c = *fmt++;
8545		if (c == '*') {
8546		    v = getnextarg(args, arglen, &argidx);
8547		    if (v == NULL)
8548			goto onError;
8549		    if (!PyInt_Check(v)) {
8550			PyErr_SetString(PyExc_TypeError,
8551					"* wants int");
8552			goto onError;
8553		    }
8554		    prec = PyInt_AsLong(v);
8555		    if (prec == -1 && PyErr_Occurred())
8556			goto onError;
8557		    if (prec < 0)
8558			prec = 0;
8559		    if (--fmtcnt >= 0)
8560			c = *fmt++;
8561		}
8562		else if (c >= '0' && c <= '9') {
8563		    prec = c - '0';
8564		    while (--fmtcnt >= 0) {
8565			c = Py_CHARMASK(*fmt++);
8566			if (c < '0' || c > '9')
8567			    break;
8568			if ((prec*10) / 10 != prec) {
8569			    PyErr_SetString(PyExc_ValueError,
8570					    "prec too big");
8571			    goto onError;
8572			}
8573			prec = prec*10 + (c - '0');
8574		    }
8575		}
8576	    } /* prec */
8577	    if (fmtcnt >= 0) {
8578		if (c == 'h' || c == 'l' || c == 'L') {
8579		    if (--fmtcnt >= 0)
8580			c = *fmt++;
8581		}
8582	    }
8583	    if (fmtcnt < 0) {
8584		PyErr_SetString(PyExc_ValueError,
8585				"incomplete format");
8586		goto onError;
8587	    }
8588	    if (c != '%') {
8589		v = getnextarg(args, arglen, &argidx);
8590		if (v == NULL)
8591		    goto onError;
8592	    }
8593	    sign = 0;
8594	    fill = ' ';
8595	    switch (c) {
8596
8597	    case '%':
8598		pbuf = formatbuf;
8599		/* presume that buffer length is at least 1 */
8600		pbuf[0] = '%';
8601		len = 1;
8602		break;
8603
8604	    case 's':
8605	    case 'r':
8606		if (PyUnicode_Check(v) && c == 's') {
8607		    temp = v;
8608		    Py_INCREF(temp);
8609		}
8610		else {
8611		    PyObject *unicode;
8612		    if (c == 's')
8613			temp = PyObject_Unicode(v);
8614		    else
8615			temp = PyObject_Repr(v);
8616		    if (temp == NULL)
8617			goto onError;
8618                    if (PyUnicode_Check(temp))
8619                        /* nothing to do */;
8620                    else if (PyString_Check(temp)) {
8621                        /* convert to string to Unicode */
8622		        unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8623						   PyString_GET_SIZE(temp),
8624						   NULL,
8625						   "strict");
8626		        Py_DECREF(temp);
8627		        temp = unicode;
8628		        if (temp == NULL)
8629			    goto onError;
8630		    }
8631		    else {
8632			Py_DECREF(temp);
8633			PyErr_SetString(PyExc_TypeError,
8634					"%s argument has non-string str()");
8635			goto onError;
8636		    }
8637		}
8638		pbuf = PyUnicode_AS_UNICODE(temp);
8639		len = PyUnicode_GET_SIZE(temp);
8640		if (prec >= 0 && len > prec)
8641		    len = prec;
8642		break;
8643
8644	    case 'i':
8645	    case 'd':
8646	    case 'u':
8647	    case 'o':
8648	    case 'x':
8649	    case 'X':
8650		if (c == 'i')
8651		    c = 'd';
8652		if (PyLong_Check(v)) {
8653		    temp = formatlong(v, flags, prec, c);
8654		    if (!temp)
8655			goto onError;
8656		    pbuf = PyUnicode_AS_UNICODE(temp);
8657		    len = PyUnicode_GET_SIZE(temp);
8658		    sign = 1;
8659		}
8660		else {
8661		    pbuf = formatbuf;
8662		    len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8663				    flags, prec, c, v);
8664		    if (len < 0)
8665			goto onError;
8666		    sign = 1;
8667		}
8668		if (flags & F_ZERO)
8669		    fill = '0';
8670		break;
8671
8672	    case 'e':
8673	    case 'E':
8674	    case 'f':
8675	    case 'F':
8676	    case 'g':
8677	    case 'G':
8678		if (c == 'F')
8679			c = 'f';
8680		pbuf = formatbuf;
8681		len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8682			flags, prec, c, v);
8683		if (len < 0)
8684		    goto onError;
8685		sign = 1;
8686		if (flags & F_ZERO)
8687		    fill = '0';
8688		break;
8689
8690	    case 'c':
8691		pbuf = formatbuf;
8692		len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8693		if (len < 0)
8694		    goto onError;
8695		break;
8696
8697	    default:
8698		PyErr_Format(PyExc_ValueError,
8699			     "unsupported format character '%c' (0x%x) "
8700			     "at index %zd",
8701			     (31<=c && c<=126) ? (char)c : '?',
8702                             (int)c,
8703			     (Py_ssize_t)(fmt - 1 -
8704					  PyUnicode_AS_UNICODE(uformat)));
8705		goto onError;
8706	    }
8707	    if (sign) {
8708		if (*pbuf == '-' || *pbuf == '+') {
8709		    sign = *pbuf++;
8710		    len--;
8711		}
8712		else if (flags & F_SIGN)
8713		    sign = '+';
8714		else if (flags & F_BLANK)
8715		    sign = ' ';
8716		else
8717		    sign = 0;
8718	    }
8719	    if (width < len)
8720		width = len;
8721	    if (rescnt - (sign != 0) < width) {
8722		reslen -= rescnt;
8723		rescnt = width + fmtcnt + 100;
8724		reslen += rescnt;
8725		if (reslen < 0) {
8726		    Py_XDECREF(temp);
8727		    PyErr_NoMemory();
8728		    goto onError;
8729		}
8730		if (_PyUnicode_Resize(&result, reslen) < 0) {
8731		    Py_XDECREF(temp);
8732		    goto onError;
8733		}
8734		res = PyUnicode_AS_UNICODE(result)
8735		    + reslen - rescnt;
8736	    }
8737	    if (sign) {
8738		if (fill != ' ')
8739		    *res++ = sign;
8740		rescnt--;
8741		if (width > len)
8742		    width--;
8743	    }
8744	    if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
8745		assert(pbuf[0] == '0');
8746		assert(pbuf[1] == c);
8747		if (fill != ' ') {
8748		    *res++ = *pbuf++;
8749		    *res++ = *pbuf++;
8750		}
8751		rescnt -= 2;
8752		width -= 2;
8753		if (width < 0)
8754		    width = 0;
8755		len -= 2;
8756	    }
8757	    if (width > len && !(flags & F_LJUST)) {
8758		do {
8759		    --rescnt;
8760		    *res++ = fill;
8761		} while (--width > len);
8762	    }
8763	    if (fill == ' ') {
8764		if (sign)
8765		    *res++ = sign;
8766		if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
8767		    assert(pbuf[0] == '0');
8768		    assert(pbuf[1] == c);
8769		    *res++ = *pbuf++;
8770		    *res++ = *pbuf++;
8771		}
8772	    }
8773	    Py_UNICODE_COPY(res, pbuf, len);
8774	    res += len;
8775	    rescnt -= len;
8776	    while (--width >= len) {
8777		--rescnt;
8778		*res++ = ' ';
8779	    }
8780	    if (dict && (argidx < arglen) && c != '%') {
8781		PyErr_SetString(PyExc_TypeError,
8782				"not all arguments converted during string formatting");
8783                Py_XDECREF(temp);
8784		goto onError;
8785	    }
8786	    Py_XDECREF(temp);
8787	} /* '%' */
8788    } /* until end */
8789    if (argidx < arglen && !dict) {
8790	PyErr_SetString(PyExc_TypeError,
8791			"not all arguments converted during string formatting");
8792	goto onError;
8793    }
8794
8795    if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8796	goto onError;
8797    if (args_owned) {
8798	Py_DECREF(args);
8799    }
8800    Py_DECREF(uformat);
8801    return (PyObject *)result;
8802
8803 onError:
8804    Py_XDECREF(result);
8805    Py_DECREF(uformat);
8806    if (args_owned) {
8807	Py_DECREF(args);
8808    }
8809    return NULL;
8810}
8811
/* Buffer slot table for the unicode type; PyUnicode_Type points its
   tp_as_buffer slot at it.  Only the first entry (the getbuffer hook)
   is provided; the second entry is left NULL (presumably the release
   hook -- confirm against the PyBufferProcs declaration). */
static PyBufferProcs unicode_as_buffer = {
    (getbufferproc) unicode_buffer_getbuffer,
    NULL,
};
8816
8817static PyObject *
8818unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8819
8820static PyObject *
8821unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8822{
8823        PyObject *x = NULL;
8824	static char *kwlist[] = {"object", "encoding", "errors", 0};
8825	char *encoding = NULL;
8826	char *errors = NULL;
8827
8828	if (type != &PyUnicode_Type)
8829		return unicode_subtype_new(type, args, kwds);
8830	if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8831					  kwlist, &x, &encoding, &errors))
8832	    return NULL;
8833	if (x == NULL)
8834		return (PyObject *)_PyUnicode_New(0);
8835	if (encoding == NULL && errors == NULL)
8836	    return PyObject_Unicode(x);
8837	else
8838	return PyUnicode_FromEncodedObject(x, encoding, errors);
8839}
8840
8841static PyObject *
8842unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8843{
8844	PyUnicodeObject *tmp, *pnew;
8845	Py_ssize_t n;
8846
8847	assert(PyType_IsSubtype(type, &PyUnicode_Type));
8848	tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8849	if (tmp == NULL)
8850		return NULL;
8851	assert(PyUnicode_Check(tmp));
8852	pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8853	if (pnew == NULL) {
8854		Py_DECREF(tmp);
8855		return NULL;
8856	}
8857	pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8858	if (pnew->str == NULL) {
8859		_Py_ForgetReference((PyObject *)pnew);
8860		PyObject_Del(pnew);
8861		Py_DECREF(tmp);
8862		return PyErr_NoMemory();
8863	}
8864	Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8865	pnew->length = n;
8866	pnew->hash = tmp->hash;
8867	Py_DECREF(tmp);
8868	return (PyObject *)pnew;
8869}
8870
/* Docstring of the unicode type; installed as tp_doc of PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"str(string [, encoding[, errors]]) -> object\n\
\n\
Create a new string object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");

/* Forward declaration so the type object can reference the tp_iter
   implementation, which is defined further down in this file. */
static PyObject *unicode_iter(PyObject *seq);
8878static PyObject *unicode_iter(PyObject *seq);
8879
8880PyTypeObject PyUnicode_Type = {
8881    PyVarObject_HEAD_INIT(&PyType_Type, 0)
8882    "str", 				/* tp_name */
8883    sizeof(PyUnicodeObject), 		/* tp_size */
8884    0, 					/* tp_itemsize */
8885    /* Slots */
8886    (destructor)unicode_dealloc, 	/* tp_dealloc */
8887    0, 					/* tp_print */
8888    0,				 	/* tp_getattr */
8889    0, 					/* tp_setattr */
8890    0, 					/* tp_compare */
8891    unicode_repr, 			/* tp_repr */
8892    &unicode_as_number, 		/* tp_as_number */
8893    &unicode_as_sequence, 		/* tp_as_sequence */
8894    &unicode_as_mapping, 		/* tp_as_mapping */
8895    (hashfunc) unicode_hash, 		/* tp_hash*/
8896    0, 					/* tp_call*/
8897    (reprfunc) unicode_str,	 	/* tp_str */
8898    PyObject_GenericGetAttr, 		/* tp_getattro */
8899    0,			 		/* tp_setattro */
8900    &unicode_as_buffer,			/* tp_as_buffer */
8901    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
8902        Py_TPFLAGS_UNICODE_SUBCLASS,	/* tp_flags */
8903    unicode_doc,			/* tp_doc */
8904    0,					/* tp_traverse */
8905    0,					/* tp_clear */
8906    PyUnicode_RichCompare,		/* tp_richcompare */
8907    0,					/* tp_weaklistoffset */
8908    unicode_iter,			/* tp_iter */
8909    0,					/* tp_iternext */
8910    unicode_methods,			/* tp_methods */
8911    0,					/* tp_members */
8912    0,					/* tp_getset */
8913    &PyBaseString_Type,			/* tp_base */
8914    0,					/* tp_dict */
8915    0,					/* tp_descr_get */
8916    0,					/* tp_descr_set */
8917    0,					/* tp_dictoffset */
8918    0,					/* tp_init */
8919    0,					/* tp_alloc */
8920    unicode_new,			/* tp_new */
8921    PyObject_Del,      		/* tp_free */
8922};
8923
8924/* Initialize the Unicode implementation */
8925
8926void _PyUnicode_Init(void)
8927{
8928    int i;
8929
8930    /* XXX - move this array to unicodectype.c ? */
8931    Py_UNICODE linebreak[] = {
8932        0x000A, /* LINE FEED */
8933        0x000D, /* CARRIAGE RETURN */
8934        0x001C, /* FILE SEPARATOR */
8935        0x001D, /* GROUP SEPARATOR */
8936        0x001E, /* RECORD SEPARATOR */
8937        0x0085, /* NEXT LINE */
8938        0x2028, /* LINE SEPARATOR */
8939        0x2029, /* PARAGRAPH SEPARATOR */
8940    };
8941
8942    /* Init the implementation */
8943    unicode_freelist = NULL;
8944    unicode_freelist_size = 0;
8945    unicode_empty = _PyUnicode_New(0);
8946    if (!unicode_empty)
8947	return;
8948
8949    for (i = 0; i < 256; i++)
8950	unicode_latin1[i] = NULL;
8951    if (PyType_Ready(&PyUnicode_Type) < 0)
8952	Py_FatalError("Can't initialize 'unicode'");
8953
8954    /* initialize the linebreak bloom filter */
8955    bloom_linebreak = make_bloom_mask(
8956        linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8957        );
8958
8959    PyType_Ready(&EncodingMapType);
8960}
8961
8962/* Finalize the Unicode implementation */
8963
8964void
8965_PyUnicode_Fini(void)
8966{
8967    PyUnicodeObject *u;
8968    int i;
8969
8970    Py_XDECREF(unicode_empty);
8971    unicode_empty = NULL;
8972
8973    for (i = 0; i < 256; i++) {
8974	if (unicode_latin1[i]) {
8975	    Py_DECREF(unicode_latin1[i]);
8976	    unicode_latin1[i] = NULL;
8977	}
8978    }
8979
8980    for (u = unicode_freelist; u != NULL;) {
8981	PyUnicodeObject *v = u;
8982	u = *(PyUnicodeObject **)u;
8983	if (v->str)
8984	    PyMem_DEL(v->str);
8985	Py_XDECREF(v->defenc);
8986	PyObject_Del(v);
8987    }
8988    unicode_freelist = NULL;
8989    unicode_freelist_size = 0;
8990}
8991
/* Intern the unicode object *p in place.
 *
 * If an equal string is already stored in the `interned` dict, *p is
 * replaced by that object (the reference to the original is dropped and
 * a new reference to the stored object is taken).  Otherwise *p itself
 * is stored in the dict and marked SSTATE_INTERNED_MORTAL.
 *
 * Nothing happens for instances of subclasses and for strings that are
 * already interned.  Failures to create or update the dict are
 * swallowed: the error is cleared and *p is left unchanged.
 * Passing NULL or a non-unicode object is a fatal error.
 */
void
PyUnicode_InternInPlace(PyObject **p)
{
	register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
	PyObject *t;
	if (s == NULL || !PyUnicode_Check(s))
		Py_FatalError(
		    "PyUnicode_InternInPlace: unicode strings only please!");
	/* If it's a subclass, we don't really know what putting
	   it in the interned dict might do. */
	if (!PyUnicode_CheckExact(s))
		return;
	if (PyUnicode_CHECK_INTERNED(s))
		return;
	/* The dict is created lazily on first use. */
	if (interned == NULL) {
		interned = PyDict_New();
		if (interned == NULL) {
			PyErr_Clear(); /* Don't leave an exception */
			return;
		}
	}
	/* It might be that the GetItem call fails even
	   though the key is present in the dictionary,
	   namely when this happens during a stack overflow. */
	Py_ALLOW_RECURSION
	t = PyDict_GetItem(interned, (PyObject *)s);
	Py_END_ALLOW_RECURSION

	if (t) {
		/* Already interned: hand the caller the stored object and
		   release the reference to the one passed in. */
		Py_INCREF(t);
		Py_DECREF(*p);
		*p = t;
		return;
	}

	/* Flag the insertion as recursion-critical, and clear the flag
	   again on both the failure and the success path. */
	PyThreadState_GET()->recursion_critical = 1;
	if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
		PyErr_Clear();
		PyThreadState_GET()->recursion_critical = 0;
		return;
	}
	PyThreadState_GET()->recursion_critical = 0;
	/* The two references in interned are not counted by refcnt.
	   The deallocator will take care of this */
	Py_Refcnt(s) -= 2;
	PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
}
9039
9040void
9041PyUnicode_InternImmortal(PyObject **p)
9042{
9043	PyUnicode_InternInPlace(p);
9044	if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9045		PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9046		Py_INCREF(*p);
9047	}
9048}
9049
9050PyObject *
9051PyUnicode_InternFromString(const char *cp)
9052{
9053	PyObject *s = PyUnicode_FromString(cp);
9054	if (s == NULL)
9055		return NULL;
9056	PyUnicode_InternInPlace(&s);
9057	return s;
9058}
9059
9060void _Py_ReleaseInternedUnicodeStrings(void)
9061{
9062	PyObject *keys;
9063	PyUnicodeObject *s;
9064	Py_ssize_t i, n;
9065	Py_ssize_t immortal_size = 0, mortal_size = 0;
9066
9067	if (interned == NULL || !PyDict_Check(interned))
9068		return;
9069	keys = PyDict_Keys(interned);
9070	if (keys == NULL || !PyList_Check(keys)) {
9071		PyErr_Clear();
9072		return;
9073	}
9074
9075	/* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9076	   detector, interned unicode strings are not forcibly deallocated;
9077	   rather, we give them their stolen references back, and then clear
9078	   and DECREF the interned dict. */
9079
9080	n = PyList_GET_SIZE(keys);
9081	fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9082		n);
9083	for (i = 0; i < n; i++) {
9084		s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9085		switch (s->state) {
9086		case SSTATE_NOT_INTERNED:
9087			/* XXX Shouldn't happen */
9088			break;
9089		case SSTATE_INTERNED_IMMORTAL:
9090			Py_Refcnt(s) += 1;
9091			immortal_size += s->length;
9092			break;
9093		case SSTATE_INTERNED_MORTAL:
9094			Py_Refcnt(s) += 2;
9095			mortal_size += s->length;
9096			break;
9097		default:
9098			Py_FatalError("Inconsistent interned string state.");
9099		}
9100		s->state = SSTATE_NOT_INTERNED;
9101	}
9102	fprintf(stderr, "total size of all interned strings: "
9103			"%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9104			"mortal/immortal\n", mortal_size, immortal_size);
9105	Py_DECREF(keys);
9106	PyDict_Clear(interned);
9107	Py_DECREF(interned);
9108	interned = NULL;
9109}
9110
9111
9112/********************* Unicode Iterator **************************/
9113
/* Instance layout of the iterator returned by unicode_iter(). */
typedef struct {
	PyObject_HEAD
	/* Index into it_seq of the next character to hand out. */
	Py_ssize_t it_index;
	PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
9119
9120static void
9121unicodeiter_dealloc(unicodeiterobject *it)
9122{
9123	_PyObject_GC_UNTRACK(it);
9124	Py_XDECREF(it->it_seq);
9125	PyObject_GC_Del(it);
9126}
9127
9128static int
9129unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9130{
9131	Py_VISIT(it->it_seq);
9132	return 0;
9133}
9134
9135static PyObject *
9136unicodeiter_next(unicodeiterobject *it)
9137{
9138	PyUnicodeObject *seq;
9139	PyObject *item;
9140
9141	assert(it != NULL);
9142	seq = it->it_seq;
9143	if (seq == NULL)
9144		return NULL;
9145	assert(PyUnicode_Check(seq));
9146
9147	if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9148		item = PyUnicode_FromUnicode(
9149                    PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
9150		if (item != NULL)
9151			++it->it_index;
9152		return item;
9153	}
9154
9155	Py_DECREF(seq);
9156	it->it_seq = NULL;
9157	return NULL;
9158}
9159
9160static PyObject *
9161unicodeiter_len(unicodeiterobject *it)
9162{
9163	Py_ssize_t len = 0;
9164	if (it->it_seq)
9165		len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9166	return PyInt_FromSsize_t(len);
9167}
9168
/* Docstring shared by the __length_hint__ method below. */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table of the unicode iterator type; it exposes only
   __length_hint__, implemented by unicodeiter_len(). */
static PyMethodDef unicodeiter_methods[] = {
	{"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
         length_hint_doc},
 	{NULL,		NULL}		/* sentinel */
};
9176
/* Type object of the iterator returned by unicode_iter().  Instances
   take part in garbage collection (Py_TPFLAGS_HAVE_GC) and are their
   own iterator (tp_iter is PyObject_SelfIter). */
PyTypeObject PyUnicodeIter_Type = {
	PyVarObject_HEAD_INIT(&PyType_Type, 0)
	"unicodeiterator",			/* tp_name */
	sizeof(unicodeiterobject),		/* tp_basicsize */
	0,					/* tp_itemsize */
	/* methods */
	(destructor)unicodeiter_dealloc,	/* tp_dealloc */
	0,					/* tp_print */
	0,					/* tp_getattr */
	0,					/* tp_setattr */
	0,					/* tp_compare */
	0,					/* tp_repr */
	0,					/* tp_as_number */
	0,					/* tp_as_sequence */
	0,					/* tp_as_mapping */
	0,					/* tp_hash */
	0,					/* tp_call */
	0,					/* tp_str */
	PyObject_GenericGetAttr,		/* tp_getattro */
	0,					/* tp_setattro */
	0,					/* tp_as_buffer */
	Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
	0,					/* tp_doc */
	(traverseproc)unicodeiter_traverse,	/* tp_traverse */
	0,					/* tp_clear */
	0,					/* tp_richcompare */
	0,					/* tp_weaklistoffset */
	PyObject_SelfIter,			/* tp_iter */
	(iternextfunc)unicodeiter_next,		/* tp_iternext */
	unicodeiter_methods,			/* tp_methods */
	0,					/* tp_members */
};
9209
9210static PyObject *
9211unicode_iter(PyObject *seq)
9212{
9213	unicodeiterobject *it;
9214
9215	if (!PyUnicode_Check(seq)) {
9216		PyErr_BadInternalCall();
9217		return NULL;
9218	}
9219	it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9220	if (it == NULL)
9221		return NULL;
9222	it->it_index = 0;
9223	Py_INCREF(seq);
9224	it->it_seq = (PyUnicodeObject *)seq;
9225	_PyObject_GC_TRACK(it);
9226	return (PyObject *)it;
9227}
9228
/* Return the number of Py_UNICODE elements in u before the
   terminating zero element.  u must be zero-terminated. */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    /* Count in size_t, the return type: an int counter would overflow
       for buffers longer than INT_MAX elements. */
    size_t res = 0;
    while (*u++)
        res++;
    return res;
}
9237
/* Copy the zero-terminated string s2, including its terminator, to s1
   and return s1.  The caller must provide enough room in s1. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;

    for (;;) {
        Py_UNICODE ch = *s2++;
        *dst++ = ch;
        if (ch == 0)
            break;      /* terminator has been copied */
    }
    return s1;
}
9245
/* Copy at most n elements of the zero-terminated string s2 to s1 and
 * return s1.
 *
 * Copying stops after the terminator has been copied or after n
 * elements have been written, whichever comes first.  As with C's
 * strncpy(), s1 is NOT zero-terminated when s2 has n or more
 * characters.  Unlike strncpy(), the remainder of s1 is not padded
 * with zeros.  Nothing is written when n is 0.
 */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    size_t i;

    /* Test the bound before every store so that no more than n
       elements of s1 are ever touched. */
    for (i = 0; i < n; i++) {
        if ((s1[i] = s2[i]) == 0)
            break;
    }
    return s1;
}
9255
/* Compare two zero-terminated Py_UNICODE strings element by element.
   Returns -1, 0 or +1 when s1 sorts before, equal to or after s2; a
   string that is a proper prefix of the other sorts first. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    const Py_UNICODE *a = s1;
    const Py_UNICODE *b = s2;

    for (; *a != 0 && *b != 0; a++, b++) {
        if (*a != *b)
            return (*a < *b) ? -1 : +1;
    }
    /* At least one string has ended; the longer one is greater. */
    if (*a != 0)
        return 1;
    if (*b != 0)
        return -1;
    return 0;
}
9269
/* Return a pointer to the first occurrence of c in the zero-terminated
   string s, or NULL if c does not occur.  The terminator itself is
   never matched, so searching for 0 yields NULL. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    const Py_UNICODE *cursor = s;

    while (*cursor != 0) {
        if (*cursor == c)
            return (Py_UNICODE*)cursor;
        cursor++;
    }
    return NULL;
}
9279
9280
9281#ifdef __cplusplus
9282}
9283#endif
9284
9285
9286/*
9287Local variables:
9288c-basic-offset: 4
9289indent-tabs-mode: nil
9290End:
9291*/
9292