unicodeobject.c revision 70a237179f1213b0c180898b6e1f0b6c4e9cd11c
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5Unicode Integration Proposal (see file Misc/unicode.txt).
6
7Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
10Copyright (c) Corporation for National Research Initiatives.
11
12--------------------------------------------------------------------
13The original string type implementation is:
14
15    Copyright (c) 1999 by Secret Labs AB
16    Copyright (c) 1999 by Fredrik Lundh
17
18By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
41
42#define PY_SSIZE_T_CLEAN
43#include "Python.h"
44
45#include "unicodeobject.h"
46#include "ucnhash.h"
47
48#include "formatter_unicode.h"
49
50#ifdef MS_WINDOWS
51#include <windows.h>
52#endif
53
/* Maximum number of dead Unicode objects parked on the free list. */

#define MAX_UNICODE_FREELIST_SIZE       1024

/* Threshold for the free list "stay alive" optimization.

   Objects placed on the free list keep their character buffer
   allocated as long as their size is below this limit, which saves
   malloc() calls for small Unicode objects.

   The worst case cost is MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   A limit of 0 effectively disables the feature.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Byte order selection; little endian unless WORDS_BIGENDIAN is defined */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
84
85/* --- Globals ------------------------------------------------------------
86
87   The globals are initialized by the _PyUnicode_Init() API and should
88   not be used before calling that API.
89
90*/
91
92
93#ifdef __cplusplus
94extern "C" {
95#endif
96
97/* This dictionary holds all interned unicode strings.  Note that references
98   to strings in this dictionary are *not* counted in the string's ob_refcnt.
99   When the interned string reaches a refcnt of 0 the string deallocation
100   function will delete the reference from this dictionary.
101
102   Another way to look at this is that to say that the actual reference
103   count of a string is:  s->ob_refcnt + (s->ob_sstate?2:0)
104*/
105static PyObject *interned;
106
107/* Free list for Unicode objects */
108static PyUnicodeObject *unicode_freelist;
109static int unicode_freelist_size;
110
111/* The empty Unicode object is shared to improve performance. */
112static PyUnicodeObject *unicode_empty;
113
114/* Single character Unicode strings in the Latin-1 range are being
115   shared as well. */
116static PyUnicodeObject *unicode_latin1[256];
117
118/* Default encoding to use and assume when NULL is passed as encoding
119   parameter; it is fixed to "utf-8".  Always use the
120   PyUnicode_GetDefaultEncoding() API to access this global. */
121static const char unicode_default_encoding[] = "utf-8";
122
123Py_UNICODE
124PyUnicode_GetMax(void)
125{
126#ifdef Py_UNICODE_WIDE
127	return 0x10FFFF;
128#else
129	/* This is actually an illegal character, so it should
130	   not be passed to unichr. */
131	return 0xFFFF;
132#endif
133}
134
135/* --- Bloom Filters ----------------------------------------------------- */
136
/* Simple "bloom filters" for Unicode characters.  To keep things
   simple, a single bitmask is used; the 5 least-significant bits of
   each Unicode character select the bit index. */

/* the linebreak mask is set up by Unicode_Init below */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by ch is set in mask.  The shifted
   constant is unsigned long (1UL): with a plain int constant, selecting
   bit 31 overflows a signed int and the result would sign-extend into
   the upper bits of a wider BLOOM_MASK. */
#define BLOOM(mask, ch) (((mask) & (1UL << ((ch) & 0x1F))))

/* Cheap mask test first, exact Py_UNICODE_ISLINEBREAK() check second. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
151
152Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
153{
154    /* calculate simple bloom-style bitmask for a given unicode string */
155
156    long mask;
157    Py_ssize_t i;
158
159    mask = 0;
160    for (i = 0; i < len; i++)
161        mask |= (1 << (ptr[i] & 0x1F));
162
163    return mask;
164}
165
166Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
167{
168    Py_ssize_t i;
169
170    for (i = 0; i < setlen; i++)
171        if (set[i] == chr)
172            return 1;
173
174    return 0;
175}
176
/* True when chr is a member of set: the bloom mask is tested first and
   the exact search only runs when the mask test passes.  The expansion
   is fully parenthesized so that it can be negated or combined with
   other operators safely. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
179
180/* --- Unicode Object ----------------------------------------------------- */
181
182static
183int unicode_resize(register PyUnicodeObject *unicode,
184                      Py_ssize_t length)
185{
186    void *oldstr;
187
188    /* Shortcut if there's nothing much to do. */
189    if (unicode->length == length)
190	goto reset;
191
192    /* Resizing shared object (unicode_empty or single character
193       objects) in-place is not allowed. Use PyUnicode_Resize()
194       instead ! */
195
196    if (unicode == unicode_empty ||
197	(unicode->length == 1 &&
198	 unicode->str[0] < 256U &&
199	 unicode_latin1[unicode->str[0]] == unicode)) {
200        PyErr_SetString(PyExc_SystemError,
201                        "can't resize shared unicode objects");
202        return -1;
203    }
204
205    /* We allocate one more byte to make sure the string is Ux0000 terminated.
206       The overallocation is also used by fastsearch, which assumes that it's
207       safe to look at str[length] (without making any assumptions about what
208       it contains). */
209
210    oldstr = unicode->str;
211    PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
212    if (!unicode->str) {
213	unicode->str = (Py_UNICODE *)oldstr;
214        PyErr_NoMemory();
215        return -1;
216    }
217    unicode->str[length] = 0;
218    unicode->length = length;
219
220 reset:
221    /* Reset the object caches */
222    if (unicode->defenc) {
223        Py_DECREF(unicode->defenc);
224        unicode->defenc = NULL;
225    }
226    unicode->hash = -1;
227
228    return 0;
229}
230
231/* We allocate one more byte to make sure the string is
232   Ux0000 terminated; some code (e.g. new_identifier)
233   relies on that.
234
235   XXX This allocator could further be enhanced by assuring that the
236       free list never reduces its size below 1.
237
238*/
239
240static
241PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
242{
243    register PyUnicodeObject *unicode;
244
245    /* Optimization for empty strings */
246    if (length == 0 && unicode_empty != NULL) {
247        Py_INCREF(unicode_empty);
248        return unicode_empty;
249    }
250
251    /* Unicode freelist & memory allocation */
252    if (unicode_freelist) {
253        unicode = unicode_freelist;
254        unicode_freelist = *(PyUnicodeObject **)unicode;
255        unicode_freelist_size--;
256	if (unicode->str) {
257	    /* Keep-Alive optimization: we only upsize the buffer,
258	       never downsize it. */
259	    if ((unicode->length < length) &&
260                unicode_resize(unicode, length) < 0) {
261		PyMem_DEL(unicode->str);
262		goto onError;
263	    }
264	}
265        else {
266	    unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
267        }
268        PyObject_INIT(unicode, &PyUnicode_Type);
269    }
270    else {
271        unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
272        if (unicode == NULL)
273            return NULL;
274	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
275    }
276
277    if (!unicode->str) {
278	PyErr_NoMemory();
279	goto onError;
280    }
281    /* Initialize the first element to guard against cases where
282     * the caller fails before initializing str -- unicode_resize()
283     * reads str[0], and the Keep-Alive optimization can keep memory
284     * allocated for str alive across a call to unicode_dealloc(unicode).
285     * We don't want unicode_resize to read uninitialized memory in
286     * that case.
287     */
288    unicode->str[0] = 0;
289    unicode->str[length] = 0;
290    unicode->length = length;
291    unicode->hash = -1;
292    unicode->state = 0;
293    unicode->defenc = NULL;
294    return unicode;
295
296 onError:
297    _Py_ForgetReference((PyObject *)unicode);
298    PyObject_Del(unicode);
299    return NULL;
300}
301
302static
303void unicode_dealloc(register PyUnicodeObject *unicode)
304{
305    switch (PyUnicode_CHECK_INTERNED(unicode)) {
306        case SSTATE_NOT_INTERNED:
307            break;
308
309        case SSTATE_INTERNED_MORTAL:
310            /* revive dead object temporarily for DelItem */
311            Py_Refcnt(unicode) = 3;
312            if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
313                Py_FatalError(
314                    "deletion of interned unicode string failed");
315            break;
316
317        case SSTATE_INTERNED_IMMORTAL:
318            Py_FatalError("Immortal interned unicode string died.");
319
320        default:
321            Py_FatalError("Inconsistent interned unicode string state.");
322    }
323
324    if (PyUnicode_CheckExact(unicode) &&
325	unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
326        /* Keep-Alive optimization */
327	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
328	    PyMem_DEL(unicode->str);
329	    unicode->str = NULL;
330	    unicode->length = 0;
331	}
332	if (unicode->defenc) {
333	    Py_DECREF(unicode->defenc);
334	    unicode->defenc = NULL;
335	}
336	/* Add to free list */
337        *(PyUnicodeObject **)unicode = unicode_freelist;
338        unicode_freelist = unicode;
339        unicode_freelist_size++;
340    }
341    else {
342	PyMem_DEL(unicode->str);
343	Py_XDECREF(unicode->defenc);
344	Py_Type(unicode)->tp_free((PyObject *)unicode);
345    }
346}
347
348int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
349{
350    register PyUnicodeObject *v;
351
352    /* Argument checks */
353    if (unicode == NULL) {
354	PyErr_BadInternalCall();
355	return -1;
356    }
357    v = (PyUnicodeObject *)*unicode;
358    if (v == NULL || !PyUnicode_Check(v) || Py_Refcnt(v) != 1 || length < 0) {
359	PyErr_BadInternalCall();
360	return -1;
361    }
362
363    /* Resizing unicode_empty and single character objects is not
364       possible since these are being shared. We simply return a fresh
365       copy with the same Unicode content. */
366    if (v->length != length &&
367	(v == unicode_empty || v->length == 1)) {
368	PyUnicodeObject *w = _PyUnicode_New(length);
369	if (w == NULL)
370	    return -1;
371	Py_UNICODE_COPY(w->str, v->str,
372			length < v->length ? length : v->length);
373	Py_DECREF(*unicode);
374	*unicode = (PyObject *)w;
375	return 0;
376    }
377
378    /* Note that we don't have to modify *unicode for unshared Unicode
379       objects, since we can modify them in-place. */
380    return unicode_resize(v, length);
381}
382
383/* Internal API for use in unicodeobject.c only ! */
384#define _PyUnicode_Resize(unicodevar, length) \
385        PyUnicode_Resize(((PyObject **)(unicodevar)), length)
386
387PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
388				Py_ssize_t size)
389{
390    PyUnicodeObject *unicode;
391
392    /* If the Unicode data is known at construction time, we can apply
393       some optimizations which share commonly used objects. */
394    if (u != NULL) {
395
396	/* Optimization for empty strings */
397	if (size == 0 && unicode_empty != NULL) {
398	    Py_INCREF(unicode_empty);
399	    return (PyObject *)unicode_empty;
400	}
401
402	/* Single character Unicode objects in the Latin-1 range are
403	   shared when using this constructor */
404	if (size == 1 && *u < 256) {
405	    unicode = unicode_latin1[*u];
406	    if (!unicode) {
407		unicode = _PyUnicode_New(1);
408		if (!unicode)
409		    return NULL;
410		unicode->str[0] = *u;
411		unicode_latin1[*u] = unicode;
412	    }
413	    Py_INCREF(unicode);
414	    return (PyObject *)unicode;
415	}
416    }
417
418    unicode = _PyUnicode_New(size);
419    if (!unicode)
420        return NULL;
421
422    /* Copy the Unicode data into the new object */
423    if (u != NULL)
424	Py_UNICODE_COPY(unicode->str, u, size);
425
426    return (PyObject *)unicode;
427}
428
429PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
430{
431    PyUnicodeObject *unicode;
432    /* If the Unicode data is known at construction time, we can apply
433       some optimizations which share commonly used objects.
434       Also, this means the input must be UTF-8, so fall back to the
435       UTF-8 decoder at the end. */
436    if (u != NULL) {
437
438	/* Optimization for empty strings */
439	if (size == 0 && unicode_empty != NULL) {
440	    Py_INCREF(unicode_empty);
441	    return (PyObject *)unicode_empty;
442	}
443
444	/* Single characters are shared when using this constructor.
445           Restrict to ASCII, since the input must be UTF-8. */
446	if (size == 1 && Py_CHARMASK(*u) < 128) {
447	    unicode = unicode_latin1[Py_CHARMASK(*u)];
448	    if (!unicode) {
449		unicode = _PyUnicode_New(1);
450		if (!unicode)
451		    return NULL;
452		unicode->str[0] = Py_CHARMASK(*u);
453		unicode_latin1[Py_CHARMASK(*u)] = unicode;
454	    }
455	    Py_INCREF(unicode);
456	    return (PyObject *)unicode;
457	}
458
459        return PyUnicode_DecodeUTF8(u, size, NULL);
460    }
461
462    unicode = _PyUnicode_New(size);
463    if (!unicode)
464        return NULL;
465
466    return (PyObject *)unicode;
467}
468
469PyObject *PyUnicode_FromString(const char *u)
470{
471    size_t size = strlen(u);
472    if (size > PY_SSIZE_T_MAX) {
473        PyErr_SetString(PyExc_OverflowError, "input too long");
474        return NULL;
475    }
476
477    return PyUnicode_FromStringAndSize(u, size);
478}
479
480#ifdef HAVE_WCHAR_H
481
482PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
483				 Py_ssize_t size)
484{
485    PyUnicodeObject *unicode;
486
487    if (w == NULL) {
488	PyErr_BadInternalCall();
489	return NULL;
490    }
491
492    unicode = _PyUnicode_New(size);
493    if (!unicode)
494        return NULL;
495
496    /* Copy the wchar_t data into the new object */
497#ifdef HAVE_USABLE_WCHAR_T
498    memcpy(unicode->str, w, size * sizeof(wchar_t));
499#else
500    {
501	register Py_UNICODE *u;
502	register Py_ssize_t i;
503	u = PyUnicode_AS_UNICODE(unicode);
504	for (i = size; i > 0; i--)
505	    *u++ = *w++;
506    }
507#endif
508
509    return (PyObject *)unicode;
510}
511
512static void
513makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
514{
515	*fmt++ = '%';
516	if (width) {
517		if (zeropad)
518			*fmt++ = '0';
519		fmt += sprintf(fmt, "%d", width);
520	}
521	if (precision)
522		fmt += sprintf(fmt, ".%d", precision);
523	if (longflag)
524		*fmt++ = 'l';
525	else if (size_tflag) {
526		char *f = PY_FORMAT_SIZE_T;
527		while (*f)
528			*fmt++ = *f++;
529	}
530	*fmt++ = c;
531	*fmt = '\0';
532}
533
534#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
535
536PyObject *
537PyUnicode_FromFormatV(const char *format, va_list vargs)
538{
539	va_list count;
540	Py_ssize_t callcount = 0;
541	PyObject **callresults = NULL;
542	PyObject **callresult = NULL;
543	Py_ssize_t n = 0;
544	int width = 0;
545	int precision = 0;
546	int zeropad;
547	const char* f;
548	Py_UNICODE *s;
549	PyObject *string;
550	/* used by sprintf */
551	char buffer[21];
552	/* use abuffer instead of buffer, if we need more space
553	 * (which can happen if there's a format specifier with width). */
554	char *abuffer = NULL;
555	char *realbuffer;
556	Py_ssize_t abuffersize = 0;
557	char fmt[60]; /* should be enough for %0width.precisionld */
558	const char *copy;
559
560#ifdef VA_LIST_IS_ARRAY
561	Py_MEMCPY(count, vargs, sizeof(va_list));
562#else
563#ifdef  __va_copy
564	__va_copy(count, vargs);
565#else
566	count = vargs;
567#endif
568#endif
569	/* step 1: count the number of %S/%R format specifications
570	 * (we call PyObject_Unicode()/PyObject_Repr() for these objects
571	 * once during step 3 and put the result in an array) */
572	for (f = format; *f; f++) {
573		if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
574			++callcount;
575	}
576	/* step 2: allocate memory for the results of
577	 * PyObject_Unicode()/PyObject_Repr() calls */
578	if (callcount) {
579		callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
580		if (!callresults) {
581			PyErr_NoMemory();
582			return NULL;
583		}
584		callresult = callresults;
585	}
586	/* step 3: figure out how large a buffer we need */
587	for (f = format; *f; f++) {
588		if (*f == '%') {
589			const char* p = f;
590			width = 0;
591			while (isdigit(Py_CHARMASK(*f)))
592				width = (width*10) + *f++ - '0';
593			while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
594				;
595
596			/* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
597			 * they don't affect the amount of space we reserve.
598			 */
599			if ((*f == 'l' || *f == 'z') &&
600					(f[1] == 'd' || f[1] == 'u'))
601                                ++f;
602
603			switch (*f) {
604			case 'c':
605				(void)va_arg(count, int);
606				/* fall through... */
607			case '%':
608				n++;
609				break;
610			case 'd': case 'u': case 'i': case 'x':
611				(void) va_arg(count, int);
612				/* 20 bytes is enough to hold a 64-bit
613				   integer.  Decimal takes the most space.
614				   This isn't enough for octal.
615				   If a width is specified we need more
616				   (which we allocate later). */
617				if (width < 20)
618					width = 20;
619				n += width;
620				if (abuffersize < width)
621					abuffersize = width;
622				break;
623			case 's':
624			{
625				/* UTF-8 */
626				unsigned char*s;
627				s = va_arg(count, unsigned char*);
628				while (*s) {
629					if (*s < 128) {
630						n++; s++;
631					} else if (*s < 0xc0) {
632						/* invalid UTF-8 */
633						n++; s++;
634					} else if (*s < 0xc0) {
635						n++;
636						s++; if(!*s)break;
637						s++;
638					} else if (*s < 0xe0) {
639						n++;
640						s++; if(!*s)break;
641						s++; if(!*s)break;
642						s++;
643					} else {
644						#ifdef Py_UNICODE_WIDE
645						n++;
646						#else
647						n+=2;
648						#endif
649						s++; if(!*s)break;
650						s++; if(!*s)break;
651						s++; if(!*s)break;
652						s++;
653					}
654				}
655				break;
656			}
657			case 'U':
658			{
659				PyObject *obj = va_arg(count, PyObject *);
660				assert(obj && PyUnicode_Check(obj));
661				n += PyUnicode_GET_SIZE(obj);
662				break;
663			}
664			case 'V':
665			{
666				PyObject *obj = va_arg(count, PyObject *);
667				const char *str = va_arg(count, const char *);
668				assert(obj || str);
669				assert(!obj || PyUnicode_Check(obj));
670				if (obj)
671					n += PyUnicode_GET_SIZE(obj);
672				else
673					n += strlen(str);
674				break;
675			}
676			case 'S':
677			{
678				PyObject *obj = va_arg(count, PyObject *);
679				PyObject *str;
680				assert(obj);
681				str = PyObject_Unicode(obj);
682				if (!str)
683					goto fail;
684				n += PyUnicode_GET_SIZE(str);
685				/* Remember the str and switch to the next slot */
686				*callresult++ = str;
687				break;
688			}
689			case 'R':
690			{
691				PyObject *obj = va_arg(count, PyObject *);
692				PyObject *repr;
693				assert(obj);
694				repr = PyObject_Repr(obj);
695				if (!repr)
696					goto fail;
697				n += PyUnicode_GET_SIZE(repr);
698				/* Remember the repr and switch to the next slot */
699				*callresult++ = repr;
700				break;
701			}
702			case 'p':
703				(void) va_arg(count, int);
704				/* maximum 64-bit pointer representation:
705				 * 0xffffffffffffffff
706				 * so 19 characters is enough.
707				 * XXX I count 18 -- what's the extra for?
708				 */
709				n += 19;
710				break;
711			default:
712				/* if we stumble upon an unknown
713				   formatting code, copy the rest of
714				   the format string to the output
715				   string. (we cannot just skip the
716				   code, since there's no way to know
717				   what's in the argument list) */
718				n += strlen(p);
719				goto expand;
720			}
721		} else
722			n++;
723	}
724 expand:
725	if (abuffersize > 20) {
726		abuffer = PyMem_Malloc(abuffersize);
727		if (!abuffer) {
728			PyErr_NoMemory();
729			goto fail;
730		}
731		realbuffer = abuffer;
732	}
733	else
734		realbuffer = buffer;
735	/* step 4: fill the buffer */
736	/* Since we've analyzed how much space we need for the worst case,
737	   we don't have to resize the string.
738	   There can be no errors beyond this point. */
739	string = PyUnicode_FromUnicode(NULL, n);
740	if (!string)
741		goto fail;
742
743	s = PyUnicode_AS_UNICODE(string);
744	callresult = callresults;
745
746	for (f = format; *f; f++) {
747		if (*f == '%') {
748			const char* p = f++;
749			int longflag = 0;
750			int size_tflag = 0;
751			zeropad = (*f == '0');
752			/* parse the width.precision part */
753			width = 0;
754			while (isdigit(Py_CHARMASK(*f)))
755				width = (width*10) + *f++ - '0';
756			precision = 0;
757			if (*f == '.') {
758				f++;
759				while (isdigit(Py_CHARMASK(*f)))
760					precision = (precision*10) + *f++ - '0';
761			}
762			/* handle the long flag, but only for %ld and %lu.
763			   others can be added when necessary. */
764			if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
765				longflag = 1;
766				++f;
767			}
768			/* handle the size_t flag. */
769			if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
770				size_tflag = 1;
771				++f;
772			}
773
774			switch (*f) {
775			case 'c':
776				*s++ = va_arg(vargs, int);
777				break;
778			case 'd':
779				makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
780				if (longflag)
781					sprintf(realbuffer, fmt, va_arg(vargs, long));
782				else if (size_tflag)
783					sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
784				else
785					sprintf(realbuffer, fmt, va_arg(vargs, int));
786				appendstring(realbuffer);
787				break;
788			case 'u':
789				makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
790				if (longflag)
791					sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
792				else if (size_tflag)
793					sprintf(realbuffer, fmt, va_arg(vargs, size_t));
794				else
795					sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
796				appendstring(realbuffer);
797				break;
798			case 'i':
799				makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
800				sprintf(realbuffer, fmt, va_arg(vargs, int));
801				appendstring(realbuffer);
802				break;
803			case 'x':
804				makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
805				sprintf(realbuffer, fmt, va_arg(vargs, int));
806				appendstring(realbuffer);
807				break;
808			case 's':
809			{
810				/* Parameter must be UTF-8 encoded.
811				   In case of encoding errors, use
812				   the replacement character. */
813				PyObject *u;
814				p = va_arg(vargs, char*);
815				u = PyUnicode_DecodeUTF8(p, strlen(p),
816							 "replace");
817				if (!u)
818					goto fail;
819				Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
820						PyUnicode_GET_SIZE(u));
821				s += PyUnicode_GET_SIZE(u);
822				Py_DECREF(u);
823				break;
824			}
825			case 'U':
826			{
827				PyObject *obj = va_arg(vargs, PyObject *);
828				Py_ssize_t size = PyUnicode_GET_SIZE(obj);
829				Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
830				s += size;
831				break;
832			}
833			case 'V':
834			{
835				PyObject *obj = va_arg(vargs, PyObject *);
836				const char *str = va_arg(vargs, const char *);
837				if (obj) {
838					Py_ssize_t size = PyUnicode_GET_SIZE(obj);
839					Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
840					s += size;
841				} else {
842					appendstring(str);
843				}
844				break;
845			}
846			case 'S':
847			case 'R':
848			{
849				Py_UNICODE *ucopy;
850				Py_ssize_t usize;
851				Py_ssize_t upos;
852				/* unused, since we already have the result */
853				(void) va_arg(vargs, PyObject *);
854				ucopy = PyUnicode_AS_UNICODE(*callresult);
855				usize = PyUnicode_GET_SIZE(*callresult);
856				for (upos = 0; upos<usize;)
857					*s++ = ucopy[upos++];
858				/* We're done with the unicode()/repr() => forget it */
859				Py_DECREF(*callresult);
860				/* switch to next unicode()/repr() result */
861				++callresult;
862				break;
863			}
864			case 'p':
865				sprintf(buffer, "%p", va_arg(vargs, void*));
866				/* %p is ill-defined:  ensure leading 0x. */
867				if (buffer[1] == 'X')
868					buffer[1] = 'x';
869				else if (buffer[1] != 'x') {
870					memmove(buffer+2, buffer, strlen(buffer)+1);
871					buffer[0] = '0';
872					buffer[1] = 'x';
873				}
874				appendstring(buffer);
875				break;
876			case '%':
877				*s++ = '%';
878				break;
879			default:
880				appendstring(p);
881				goto end;
882			}
883		} else
884			*s++ = *f;
885	}
886
887 end:
888	if (callresults)
889		PyMem_Free(callresults);
890	if (abuffer)
891		PyMem_Free(abuffer);
892	_PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
893	return string;
894 fail:
895	if (callresults) {
896		PyObject **callresult2 = callresults;
897		while (callresult2 < callresult) {
898			Py_DECREF(*callresult2);
899			++callresult2;
900		}
901		PyMem_Free(callresults);
902	}
903	if (abuffer)
904		PyMem_Free(abuffer);
905	return NULL;
906}
907
908#undef appendstring
909
910PyObject *
911PyUnicode_FromFormat(const char *format, ...)
912{
913	PyObject* ret;
914	va_list vargs;
915
916#ifdef HAVE_STDARG_PROTOTYPES
917	va_start(vargs, format);
918#else
919	va_start(vargs);
920#endif
921	ret = PyUnicode_FromFormatV(format, vargs);
922	va_end(vargs);
923	return ret;
924}
925
926Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
927				wchar_t *w,
928				Py_ssize_t size)
929{
930    if (unicode == NULL) {
931	PyErr_BadInternalCall();
932	return -1;
933    }
934
935    /* If possible, try to copy the 0-termination as well */
936    if (size > PyUnicode_GET_SIZE(unicode))
937	size = PyUnicode_GET_SIZE(unicode) + 1;
938
939#ifdef HAVE_USABLE_WCHAR_T
940    memcpy(w, unicode->str, size * sizeof(wchar_t));
941#else
942    {
943	register Py_UNICODE *u;
944	register Py_ssize_t i;
945	u = PyUnicode_AS_UNICODE(unicode);
946	for (i = size; i > 0; i--)
947	    *w++ = *u++;
948    }
949#endif
950
951    if (size > PyUnicode_GET_SIZE(unicode))
952        return PyUnicode_GET_SIZE(unicode);
953    else
954    return size;
955}
956
957#endif
958
959PyObject *PyUnicode_FromOrdinal(int ordinal)
960{
961    Py_UNICODE s[2];
962
963    if (ordinal < 0 || ordinal > 0x10ffff) {
964	PyErr_SetString(PyExc_ValueError,
965			"chr() arg not in range(0x110000)");
966	return NULL;
967    }
968
969#ifndef Py_UNICODE_WIDE
970    if (ordinal > 0xffff) {
971        ordinal -= 0x10000;
972        s[0] = 0xD800 | (ordinal >> 10);
973        s[1] = 0xDC00 | (ordinal & 0x3FF);
974        return PyUnicode_FromUnicode(s, 2);
975    }
976#endif
977
978    s[0] = (Py_UNICODE)ordinal;
979    return PyUnicode_FromUnicode(s, 1);
980}
981
982PyObject *PyUnicode_FromObject(register PyObject *obj)
983{
984    /* XXX Perhaps we should make this API an alias of
985           PyObject_Unicode() instead ?! */
986    if (PyUnicode_CheckExact(obj)) {
987	Py_INCREF(obj);
988	return obj;
989    }
990    if (PyUnicode_Check(obj)) {
991	/* For a Unicode subtype that's not a Unicode object,
992	   return a true Unicode object with the same data. */
993	return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
994				     PyUnicode_GET_SIZE(obj));
995    }
996    return PyUnicode_FromEncodedObject(obj, NULL, "strict");
997}
998
999PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1000				      const char *encoding,
1001				      const char *errors)
1002{
1003    const char *s = NULL;
1004    Py_ssize_t len;
1005    PyObject *v;
1006
1007    if (obj == NULL) {
1008	PyErr_BadInternalCall();
1009	return NULL;
1010    }
1011
1012    if (PyUnicode_Check(obj)) {
1013	PyErr_SetString(PyExc_TypeError,
1014			"decoding Unicode is not supported");
1015	return NULL;
1016	}
1017
1018    /* Coerce object */
1019    if (PyString_Check(obj)) {
1020	    s = PyString_AS_STRING(obj);
1021	    len = PyString_GET_SIZE(obj);
1022	    }
1023    else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1024	/* Overwrite the error message with something more useful in
1025	   case of a TypeError. */
1026	if (PyErr_ExceptionMatches(PyExc_TypeError))
1027	PyErr_Format(PyExc_TypeError,
1028			 "coercing to Unicode: need string or buffer, "
1029			 "%.80s found",
1030		     Py_Type(obj)->tp_name);
1031	goto onError;
1032    }
1033
1034    /* Convert to Unicode */
1035    if (len == 0) {
1036	Py_INCREF(unicode_empty);
1037	v = (PyObject *)unicode_empty;
1038    }
1039    else
1040	v = PyUnicode_Decode(s, len, encoding, errors);
1041
1042    return v;
1043
1044 onError:
1045    return NULL;
1046}
1047
1048PyObject *PyUnicode_Decode(const char *s,
1049			   Py_ssize_t size,
1050			   const char *encoding,
1051			   const char *errors)
1052{
1053    PyObject *buffer = NULL, *unicode;
1054    Py_buffer info;
1055
1056    if (encoding == NULL)
1057	encoding = PyUnicode_GetDefaultEncoding();
1058
1059    /* Shortcuts for common default encodings */
1060    if (strcmp(encoding, "utf-8") == 0)
1061        return PyUnicode_DecodeUTF8(s, size, errors);
1062    else if (strcmp(encoding, "latin-1") == 0)
1063        return PyUnicode_DecodeLatin1(s, size, errors);
1064#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1065    else if (strcmp(encoding, "mbcs") == 0)
1066        return PyUnicode_DecodeMBCS(s, size, errors);
1067#endif
1068    else if (strcmp(encoding, "ascii") == 0)
1069        return PyUnicode_DecodeASCII(s, size, errors);
1070
1071    /* Decode via the codec registry */
1072    buffer = NULL;
1073    if (PyBuffer_FillInfo(&info, (void *)s, size, 1, PyBUF_SIMPLE) < 0)
1074        goto onError;
1075    buffer = PyMemoryView_FromMemory(&info);
1076    if (buffer == NULL)
1077        goto onError;
1078    unicode = PyCodec_Decode(buffer, encoding, errors);
1079    if (unicode == NULL)
1080        goto onError;
1081    if (!PyUnicode_Check(unicode)) {
1082        PyErr_Format(PyExc_TypeError,
1083                     "decoder did not return an unicode object (type=%.400s)",
1084                     Py_Type(unicode)->tp_name);
1085        Py_DECREF(unicode);
1086        goto onError;
1087    }
1088    Py_DECREF(buffer);
1089    return unicode;
1090
1091 onError:
1092    Py_XDECREF(buffer);
1093    return NULL;
1094}
1095
1096PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1097                                    const char *encoding,
1098                                    const char *errors)
1099{
1100    PyObject *v;
1101
1102    if (!PyUnicode_Check(unicode)) {
1103        PyErr_BadArgument();
1104        goto onError;
1105    }
1106
1107    if (encoding == NULL)
1108	encoding = PyUnicode_GetDefaultEncoding();
1109
1110    /* Decode via the codec registry */
1111    v = PyCodec_Decode(unicode, encoding, errors);
1112    if (v == NULL)
1113        goto onError;
1114    return v;
1115
1116 onError:
1117    return NULL;
1118}
1119
1120PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1121			   Py_ssize_t size,
1122			   const char *encoding,
1123			   const char *errors)
1124{
1125    PyObject *v, *unicode;
1126
1127    unicode = PyUnicode_FromUnicode(s, size);
1128    if (unicode == NULL)
1129	return NULL;
1130    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1131    Py_DECREF(unicode);
1132    return v;
1133}
1134
1135PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1136                                    const char *encoding,
1137                                    const char *errors)
1138{
1139    PyObject *v;
1140
1141    if (!PyUnicode_Check(unicode)) {
1142        PyErr_BadArgument();
1143        goto onError;
1144    }
1145
1146    if (encoding == NULL)
1147	encoding = PyUnicode_GetDefaultEncoding();
1148
1149    /* Encode via the codec registry */
1150    v = PyCodec_Encode(unicode, encoding, errors);
1151    if (v == NULL)
1152        goto onError;
1153    return v;
1154
1155 onError:
1156    return NULL;
1157}
1158
1159PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1160                                    const char *encoding,
1161                                    const char *errors)
1162{
1163    PyObject *v;
1164
1165    if (!PyUnicode_Check(unicode)) {
1166        PyErr_BadArgument();
1167        goto onError;
1168    }
1169
1170    if (encoding == NULL)
1171	encoding = PyUnicode_GetDefaultEncoding();
1172
1173    /* Shortcuts for common default encodings */
1174    if (errors == NULL) {
1175	if (strcmp(encoding, "utf-8") == 0)
1176	    return PyUnicode_AsUTF8String(unicode);
1177	else if (strcmp(encoding, "latin-1") == 0)
1178	    return PyUnicode_AsLatin1String(unicode);
1179#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1180	else if (strcmp(encoding, "mbcs") == 0)
1181	    return PyUnicode_AsMBCSString(unicode);
1182#endif
1183	else if (strcmp(encoding, "ascii") == 0)
1184	    return PyUnicode_AsASCIIString(unicode);
1185    }
1186
1187    /* Encode via the codec registry */
1188    v = PyCodec_Encode(unicode, encoding, errors);
1189    if (v == NULL)
1190        goto onError;
1191    if (!PyBytes_Check(v)) {
1192        if (PyString_Check(v)) {
1193            /* Old codec, turn it into bytes */
1194            PyObject *b = PyBytes_FromObject(v);
1195            Py_DECREF(v);
1196            return b;
1197        }
1198        PyErr_Format(PyExc_TypeError,
1199                     "encoder did not return a bytes object "
1200                     "(type=%.400s, encoding=%.20s, errors=%.20s)",
1201                     v->ob_type->tp_name,
1202                     encoding ? encoding : "NULL",
1203                     errors ? errors : "NULL");
1204        Py_DECREF(v);
1205        goto onError;
1206    }
1207    return v;
1208
1209 onError:
1210    return NULL;
1211}
1212
1213PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1214					    const char *errors)
1215{
1216    PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1217    PyObject *b;
1218    if (v)
1219        return v;
1220    if (errors != NULL)
1221        Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
1222    b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1223                             PyUnicode_GET_SIZE(unicode),
1224                             NULL);
1225    if (!b)
1226        return NULL;
1227    v = PyString_FromStringAndSize(PyBytes_AsString(b),
1228                                   PyBytes_Size(b));
1229    Py_DECREF(b);
1230    ((PyUnicodeObject *)unicode)->defenc = v;
1231    return v;
1232}
1233
1234char*
1235PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
1236{
1237    PyObject *str8;
1238    if (!PyUnicode_Check(unicode)) {
1239        PyErr_BadArgument();
1240        return NULL;
1241    }
1242    str8 = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1243    if (str8 == NULL)
1244        return NULL;
1245    if (psize != NULL)
1246        *psize = PyString_GET_SIZE(str8);
1247    return PyString_AS_STRING(str8);
1248}
1249
1250char*
1251PyUnicode_AsString(PyObject *unicode)
1252{
1253    return PyUnicode_AsStringAndSize(unicode, NULL);
1254}
1255
1256Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1257{
1258    if (!PyUnicode_Check(unicode)) {
1259        PyErr_BadArgument();
1260        goto onError;
1261    }
1262    return PyUnicode_AS_UNICODE(unicode);
1263
1264 onError:
1265    return NULL;
1266}
1267
1268Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1269{
1270    if (!PyUnicode_Check(unicode)) {
1271        PyErr_BadArgument();
1272        goto onError;
1273    }
1274    return PyUnicode_GET_SIZE(unicode);
1275
1276 onError:
1277    return -1;
1278}
1279
1280const char *PyUnicode_GetDefaultEncoding(void)
1281{
1282    return unicode_default_encoding;
1283}
1284
1285int PyUnicode_SetDefaultEncoding(const char *encoding)
1286{
1287    if (strcmp(encoding, unicode_default_encoding) != 0) {
1288        PyErr_Format(PyExc_ValueError,
1289                     "Can only set default encoding to %s",
1290                     unicode_default_encoding);
1291        return -1;
1292    }
1293    return 0;
1294}
1295
1296/* error handling callback helper:
1297   build arguments, call the callback and check the arguments,
1298   if no exception occurred, copy the replacement to the output
1299   and adjust various state variables.
1300   return 0 on success, -1 on error
1301*/
1302
1303static
1304int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1305                 const char *encoding, const char *reason,
1306                 const char **input, const char **inend, Py_ssize_t *startinpos,
1307                 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1308                 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
1309{
1310    static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
1311
1312    PyObject *restuple = NULL;
1313    PyObject *repunicode = NULL;
1314    Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1315    Py_ssize_t insize;
1316    Py_ssize_t requiredsize;
1317    Py_ssize_t newpos;
1318    Py_UNICODE *repptr;
1319    PyObject *inputobj = NULL;
1320    Py_ssize_t repsize;
1321    int res = -1;
1322
1323    if (*errorHandler == NULL) {
1324	*errorHandler = PyCodec_LookupError(errors);
1325	if (*errorHandler == NULL)
1326	   goto onError;
1327    }
1328
1329    if (*exceptionObject == NULL) {
1330    	*exceptionObject = PyUnicodeDecodeError_Create(
1331	    encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
1332	if (*exceptionObject == NULL)
1333	   goto onError;
1334    }
1335    else {
1336	if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1337	    goto onError;
1338	if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1339	    goto onError;
1340	if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1341	    goto onError;
1342    }
1343
1344    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1345    if (restuple == NULL)
1346	goto onError;
1347    if (!PyTuple_Check(restuple)) {
1348	PyErr_Format(PyExc_TypeError, &argparse[4]);
1349	goto onError;
1350    }
1351    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1352	goto onError;
1353
1354    /* Copy back the bytes variables, which might have been modified by the
1355       callback */
1356    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1357    if (!inputobj)
1358        goto onError;
1359    if (!PyBytes_Check(inputobj)) {
1360	PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1361    }
1362    *input = PyBytes_AS_STRING(inputobj);
1363    insize = PyBytes_GET_SIZE(inputobj);
1364    *inend = *input + insize;
1365    /* we can DECREF safely, as the exception has another reference,
1366       so the object won't go away. */
1367    Py_DECREF(inputobj);
1368
1369    if (newpos<0)
1370	newpos = insize+newpos;
1371    if (newpos<0 || newpos>insize) {
1372	PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1373	goto onError;
1374    }
1375
1376    /* need more space? (at least enough for what we
1377       have+the replacement+the rest of the string (starting
1378       at the new input position), so we won't have to check space
1379       when there are no errors in the rest of the string) */
1380    repptr = PyUnicode_AS_UNICODE(repunicode);
1381    repsize = PyUnicode_GET_SIZE(repunicode);
1382    requiredsize = *outpos + repsize + insize-newpos;
1383    if (requiredsize > outsize) {
1384	if (requiredsize<2*outsize)
1385	    requiredsize = 2*outsize;
1386	if (PyUnicode_Resize(output, requiredsize) < 0)
1387	    goto onError;
1388	*outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1389    }
1390    *endinpos = newpos;
1391    *inptr = *input + newpos;
1392    Py_UNICODE_COPY(*outptr, repptr, repsize);
1393    *outptr += repsize;
1394    *outpos += repsize;
1395
1396    /* we made it! */
1397    res = 0;
1398
1399    onError:
1400    Py_XDECREF(restuple);
1401    return res;
1402}
1403
1404/* --- UTF-7 Codec -------------------------------------------------------- */
1405
1406/* see RFC2152 for details */
1407
static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
	   0 - not special
	   1 - special
	   2 - whitespace (optional)
	   3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};

/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true  */

/* True if c must be base64-encoded: anything outside 1..127, table class 1,
   class 2 when encodeWS is set, or class 3 when encodeO is set. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* Map the low 6 bits of n to its base64 alphabet character. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is a character of the base64 alphabet.  Spelled out as ASCII
   range tests rather than isalnum(): the argument may be a Py_UNICODE
   value outside the unsigned char range (undefined for <ctype.h>), and
   isalnum() is locale dependent.  Note that c is evaluated several times. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* Inverse of B64 for a character already known to satisfy B64CHAR. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 characters for the top bits of ch while at least 6 of its
   `bits` pending bits remain; advances out and decrements bits. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* Emit 16-bit units from the pending bits in ch while at least 16 remain.
   Relies on the expanding function providing an `errmsg` variable and a
   `utf7Error` label. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
1469
1470PyObject *PyUnicode_DecodeUTF7(const char *s,
1471			       Py_ssize_t size,
1472			       const char *errors)
1473{
1474    const char *starts = s;
1475    Py_ssize_t startinpos;
1476    Py_ssize_t endinpos;
1477    Py_ssize_t outpos;
1478    const char *e;
1479    PyUnicodeObject *unicode;
1480    Py_UNICODE *p;
1481    const char *errmsg = "";
1482    int inShift = 0;
1483    unsigned int bitsleft = 0;
1484    unsigned long charsleft = 0;
1485    int surrogate = 0;
1486    PyObject *errorHandler = NULL;
1487    PyObject *exc = NULL;
1488
1489    unicode = _PyUnicode_New(size);
1490    if (!unicode)
1491        return NULL;
1492    if (size == 0)
1493        return (PyObject *)unicode;
1494
1495    p = unicode->str;
1496    e = s + size;
1497
1498    while (s < e) {
1499        Py_UNICODE ch;
1500        restart:
1501        ch = *s;
1502
1503        if (inShift) {
1504            if ((ch == '-') || !B64CHAR(ch)) {
1505                inShift = 0;
1506                s++;
1507
1508                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1509                if (bitsleft >= 6) {
1510                    /* The shift sequence has a partial character in it. If
1511                       bitsleft < 6 then we could just classify it as padding
1512                       but that is not the case here */
1513
1514                    errmsg = "partial character in shift sequence";
1515                    goto utf7Error;
1516                }
1517                /* According to RFC2152 the remaining bits should be zero. We
1518                   choose to signal an error/insert a replacement character
1519                   here so indicate the potential of a misencoded character. */
1520
1521                /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1522                if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1523                    errmsg = "non-zero padding bits in shift sequence";
1524                    goto utf7Error;
1525                }
1526
1527                if (ch == '-') {
1528                    if ((s < e) && (*(s) == '-')) {
1529                        *p++ = '-';
1530                        inShift = 1;
1531                    }
1532                } else if (SPECIAL(ch,0,0)) {
1533                    errmsg = "unexpected special character";
1534	                goto utf7Error;
1535                } else  {
1536                    *p++ = ch;
1537                }
1538            } else {
1539                charsleft = (charsleft << 6) | UB64(ch);
1540                bitsleft += 6;
1541                s++;
1542                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1543            }
1544        }
1545        else if ( ch == '+' ) {
1546            startinpos = s-starts;
1547            s++;
1548            if (s < e && *s == '-') {
1549                s++;
1550                *p++ = '+';
1551            } else
1552            {
1553                inShift = 1;
1554                bitsleft = 0;
1555            }
1556        }
1557        else if (SPECIAL(ch,0,0)) {
1558            startinpos = s-starts;
1559            errmsg = "unexpected special character";
1560            s++;
1561            goto utf7Error;
1562        }
1563        else {
1564            *p++ = ch;
1565            s++;
1566        }
1567        continue;
1568    utf7Error:
1569        outpos = p-PyUnicode_AS_UNICODE(unicode);
1570        endinpos = s-starts;
1571        if (unicode_decode_call_errorhandler(
1572             errors, &errorHandler,
1573             "utf7", errmsg,
1574             &starts, &e, &startinpos, &endinpos, &exc, &s,
1575             (PyObject **)&unicode, &outpos, &p))
1576        goto onError;
1577    }
1578
1579    if (inShift) {
1580        outpos = p-PyUnicode_AS_UNICODE(unicode);
1581        endinpos = size;
1582        if (unicode_decode_call_errorhandler(
1583             errors, &errorHandler,
1584             "utf7", "unterminated shift sequence",
1585             &starts, &e, &startinpos, &endinpos, &exc, &s,
1586             (PyObject **)&unicode, &outpos, &p))
1587            goto onError;
1588        if (s < e)
1589           goto restart;
1590    }
1591
1592    if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
1593        goto onError;
1594
1595    Py_XDECREF(errorHandler);
1596    Py_XDECREF(exc);
1597    return (PyObject *)unicode;
1598
1599onError:
1600    Py_XDECREF(errorHandler);
1601    Py_XDECREF(exc);
1602    Py_DECREF(unicode);
1603    return NULL;
1604}
1605
1606
1607PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1608                   Py_ssize_t size,
1609                   int encodeSetO,
1610                   int encodeWhiteSpace,
1611                   const char *errors)
1612{
1613    PyObject *v;
1614    /* It might be possible to tighten this worst case */
1615    Py_ssize_t cbAllocated = 5 * size;
1616    int inShift = 0;
1617    Py_ssize_t i = 0;
1618    unsigned int bitsleft = 0;
1619    unsigned long charsleft = 0;
1620    char * out;
1621    char * start;
1622
1623    if (size == 0)
1624	return PyBytes_FromStringAndSize(NULL, 0);
1625
1626    v = PyBytes_FromStringAndSize(NULL, cbAllocated);
1627    if (v == NULL)
1628        return NULL;
1629
1630    start = out = PyBytes_AS_STRING(v);
1631    for (;i < size; ++i) {
1632        Py_UNICODE ch = s[i];
1633
1634        if (!inShift) {
1635            if (ch == '+') {
1636                *out++ = '+';
1637                *out++ = '-';
1638            } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1639                charsleft = ch;
1640                bitsleft = 16;
1641                *out++ = '+';
1642                /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1643                inShift = bitsleft > 0;
1644            } else {
1645                *out++ = (char) ch;
1646            }
1647        } else {
1648            if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1649                *out++ = B64(charsleft << (6-bitsleft));
1650                charsleft = 0;
1651                bitsleft = 0;
1652                /* Characters not in the BASE64 set implicitly unshift the sequence
1653                   so no '-' is required, except if the character is itself a '-' */
1654                if (B64CHAR(ch) || ch == '-') {
1655                    *out++ = '-';
1656                }
1657                inShift = 0;
1658                *out++ = (char) ch;
1659            } else {
1660                bitsleft += 16;
1661                charsleft = (charsleft << 16) | ch;
1662                /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1663
1664                /* If the next character is special then we dont' need to terminate
1665                   the shift sequence. If the next character is not a BASE64 character
1666                   or '-' then the shift sequence will be terminated implicitly and we
1667                   don't have to insert a '-'. */
1668
1669                if (bitsleft == 0) {
1670                    if (i + 1 < size) {
1671                        Py_UNICODE ch2 = s[i+1];
1672
1673                        if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
1674
1675                        } else if (B64CHAR(ch2) || ch2 == '-') {
1676                            *out++ = '-';
1677                            inShift = 0;
1678                        } else {
1679                            inShift = 0;
1680                        }
1681
1682                    }
1683                    else {
1684                        *out++ = '-';
1685                        inShift = 0;
1686                    }
1687                }
1688            }
1689        }
1690    }
1691    if (bitsleft) {
1692        *out++= B64(charsleft << (6-bitsleft) );
1693        *out++ = '-';
1694    }
1695
1696    if (PyBytes_Resize(v, out - start)) {
1697        Py_DECREF(v);
1698        return NULL;
1699    }
1700    return v;
1701}
1702
1703#undef SPECIAL
1704#undef B64
1705#undef B64CHAR
1706#undef UB64
1707#undef ENCODE
1708#undef DECODE
1709
1710/* --- UTF-8 Codec -------------------------------------------------------- */
1711
static
char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details */
    /* 0x00-0x7F: single-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: not valid as the first byte of a sequence */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xF7: four bytes; 0xF8-0xFB: five; 0xFC-0xFD: six;
       0xFE-0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1733
1734PyObject *PyUnicode_DecodeUTF8(const char *s,
1735			       Py_ssize_t size,
1736			       const char *errors)
1737{
1738    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1739}
1740
1741PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1742			                Py_ssize_t size,
1743			                const char *errors,
1744			                Py_ssize_t *consumed)
1745{
1746    const char *starts = s;
1747    int n;
1748    Py_ssize_t startinpos;
1749    Py_ssize_t endinpos;
1750    Py_ssize_t outpos;
1751    const char *e;
1752    PyUnicodeObject *unicode;
1753    Py_UNICODE *p;
1754    const char *errmsg = "";
1755    PyObject *errorHandler = NULL;
1756    PyObject *exc = NULL;
1757
1758    /* Note: size will always be longer than the resulting Unicode
1759       character count */
1760    unicode = _PyUnicode_New(size);
1761    if (!unicode)
1762        return NULL;
1763    if (size == 0) {
1764        if (consumed)
1765            *consumed = 0;
1766        return (PyObject *)unicode;
1767    }
1768
1769    /* Unpack UTF-8 encoded data */
1770    p = unicode->str;
1771    e = s + size;
1772
1773    while (s < e) {
1774        Py_UCS4 ch = (unsigned char)*s;
1775
1776        if (ch < 0x80) {
1777            *p++ = (Py_UNICODE)ch;
1778            s++;
1779            continue;
1780        }
1781
1782        n = utf8_code_length[ch];
1783
1784        if (s + n > e) {
1785	    if (consumed)
1786		break;
1787	    else {
1788		errmsg = "unexpected end of data";
1789		startinpos = s-starts;
1790		endinpos = size;
1791		goto utf8Error;
1792	    }
1793	}
1794
1795        switch (n) {
1796
1797        case 0:
1798            errmsg = "unexpected code byte";
1799	    startinpos = s-starts;
1800	    endinpos = startinpos+1;
1801	    goto utf8Error;
1802
1803        case 1:
1804            errmsg = "internal error";
1805	    startinpos = s-starts;
1806	    endinpos = startinpos+1;
1807	    goto utf8Error;
1808
1809        case 2:
1810            if ((s[1] & 0xc0) != 0x80) {
1811                errmsg = "invalid data";
1812		startinpos = s-starts;
1813		endinpos = startinpos+2;
1814		goto utf8Error;
1815	    }
1816            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1817            if (ch < 0x80) {
1818		startinpos = s-starts;
1819		endinpos = startinpos+2;
1820                errmsg = "illegal encoding";
1821		goto utf8Error;
1822	    }
1823	    else
1824		*p++ = (Py_UNICODE)ch;
1825            break;
1826
1827        case 3:
1828            if ((s[1] & 0xc0) != 0x80 ||
1829                (s[2] & 0xc0) != 0x80) {
1830                errmsg = "invalid data";
1831		startinpos = s-starts;
1832		endinpos = startinpos+3;
1833		goto utf8Error;
1834	    }
1835            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
1836            if (ch < 0x0800) {
1837		/* Note: UTF-8 encodings of surrogates are considered
1838		   legal UTF-8 sequences;
1839
1840		   XXX For wide builds (UCS-4) we should probably try
1841		       to recombine the surrogates into a single code
1842		       unit.
1843		*/
1844                errmsg = "illegal encoding";
1845		startinpos = s-starts;
1846		endinpos = startinpos+3;
1847		goto utf8Error;
1848	    }
1849	    else
1850		*p++ = (Py_UNICODE)ch;
1851            break;
1852
1853        case 4:
1854            if ((s[1] & 0xc0) != 0x80 ||
1855                (s[2] & 0xc0) != 0x80 ||
1856                (s[3] & 0xc0) != 0x80) {
1857                errmsg = "invalid data";
1858		startinpos = s-starts;
1859		endinpos = startinpos+4;
1860		goto utf8Error;
1861	    }
1862            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1863                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1864            /* validate and convert to UTF-16 */
1865            if ((ch < 0x10000)        /* minimum value allowed for 4
1866					 byte encoding */
1867                || (ch > 0x10ffff))   /* maximum value allowed for
1868					 UTF-16 */
1869	    {
1870                errmsg = "illegal encoding";
1871		startinpos = s-starts;
1872		endinpos = startinpos+4;
1873		goto utf8Error;
1874	    }
1875#ifdef Py_UNICODE_WIDE
1876	    *p++ = (Py_UNICODE)ch;
1877#else
1878            /*  compute and append the two surrogates: */
1879
1880            /*  translate from 10000..10FFFF to 0..FFFF */
1881            ch -= 0x10000;
1882
1883            /*  high surrogate = top 10 bits added to D800 */
1884            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1885
1886            /*  low surrogate = bottom 10 bits added to DC00 */
1887            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
1888#endif
1889            break;
1890
1891        default:
1892            /* Other sizes are only needed for UCS-4 */
1893            errmsg = "unsupported Unicode code range";
1894	    startinpos = s-starts;
1895	    endinpos = startinpos+n;
1896	    goto utf8Error;
1897        }
1898        s += n;
1899	continue;
1900
1901    utf8Error:
1902    outpos = p-PyUnicode_AS_UNICODE(unicode);
1903    if (unicode_decode_call_errorhandler(
1904	     errors, &errorHandler,
1905	     "utf8", errmsg,
1906	     &starts, &e, &startinpos, &endinpos, &exc, &s,
1907	     (PyObject **)&unicode, &outpos, &p))
1908	goto onError;
1909    }
1910    if (consumed)
1911	*consumed = s-starts;
1912
1913    /* Adjust length */
1914    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
1915        goto onError;
1916
1917    Py_XDECREF(errorHandler);
1918    Py_XDECREF(exc);
1919    return (PyObject *)unicode;
1920
1921onError:
1922    Py_XDECREF(errorHandler);
1923    Py_XDECREF(exc);
1924    Py_DECREF(unicode);
1925    return NULL;
1926}
1927
1928/* Allocation strategy:  if the string is short, convert into a stack buffer
1929   and allocate exactly as much space needed at the end.  Else allocate the
1930   maximum possible needed (4 result bytes per Unicode character), and return
1931   the excess memory at the end.
1932*/
1933PyObject *
1934PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1935		     Py_ssize_t size,
1936		     const char *errors)
1937{
1938#define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
1939
1940    Py_ssize_t i;           /* index into s of next input byte */
1941    PyObject *v;        /* result string object */
1942    char *p;            /* next free byte in output buffer */
1943    Py_ssize_t nallocated;  /* number of result bytes allocated */
1944    Py_ssize_t nneeded;        /* number of result bytes needed */
1945    char stackbuf[MAX_SHORT_UNICHARS * 4];
1946
1947    assert(s != NULL);
1948    assert(size >= 0);
1949
1950    if (size <= MAX_SHORT_UNICHARS) {
1951        /* Write into the stack buffer; nallocated can't overflow.
1952         * At the end, we'll allocate exactly as much heap space as it
1953         * turns out we need.
1954         */
1955        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1956        v = NULL;   /* will allocate after we're done */
1957        p = stackbuf;
1958    }
1959    else {
1960        /* Overallocate on the heap, and give the excess back at the end. */
1961        nallocated = size * 4;
1962        if (nallocated / 4 != size)  /* overflow! */
1963            return PyErr_NoMemory();
1964        v = PyBytes_FromStringAndSize(NULL, nallocated);
1965        if (v == NULL)
1966            return NULL;
1967        p = PyBytes_AS_STRING(v);
1968    }
1969
1970    for (i = 0; i < size;) {
1971        Py_UCS4 ch = s[i++];
1972
1973        if (ch < 0x80)
1974            /* Encode ASCII */
1975            *p++ = (char) ch;
1976
1977        else if (ch < 0x0800) {
1978            /* Encode Latin-1 */
1979            *p++ = (char)(0xc0 | (ch >> 6));
1980            *p++ = (char)(0x80 | (ch & 0x3f));
1981        }
1982        else {
1983            /* Encode UCS2 Unicode ordinals */
1984            if (ch < 0x10000) {
1985                /* Special case: check for high surrogate */
1986                if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1987                    Py_UCS4 ch2 = s[i];
1988                    /* Check for low surrogate and combine the two to
1989                       form a UCS4 value */
1990                    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1991                        ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
1992                        i++;
1993                        goto encodeUCS4;
1994                    }
1995                    /* Fall through: handles isolated high surrogates */
1996                }
1997                *p++ = (char)(0xe0 | (ch >> 12));
1998                *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1999                *p++ = (char)(0x80 | (ch & 0x3f));
2000                continue;
2001    	    }
2002encodeUCS4:
2003            /* Encode UCS4 Unicode ordinals */
2004            *p++ = (char)(0xf0 | (ch >> 18));
2005            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2006            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2007            *p++ = (char)(0x80 | (ch & 0x3f));
2008        }
2009    }
2010
2011    if (v == NULL) {
2012        /* This was stack allocated. */
2013        nneeded = p - stackbuf;
2014        assert(nneeded <= nallocated);
2015        v = PyBytes_FromStringAndSize(stackbuf, nneeded);
2016    }
2017    else {
2018    	/* Cut back to size actually needed. */
2019        nneeded = p - PyBytes_AS_STRING(v);
2020        assert(nneeded <= nallocated);
2021        PyBytes_Resize(v, nneeded);
2022    }
2023    return v;
2024
2025#undef MAX_SHORT_UNICHARS
2026}
2027
2028PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2029{
2030    if (!PyUnicode_Check(unicode)) {
2031        PyErr_BadArgument();
2032        return NULL;
2033    }
2034    return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2035				PyUnicode_GET_SIZE(unicode),
2036				NULL);
2037}
2038
2039/* --- UTF-32 Codec ------------------------------------------------------- */
2040
2041PyObject *
2042PyUnicode_DecodeUTF32(const char *s,
2043		      Py_ssize_t size,
2044		      const char *errors,
2045		      int *byteorder)
2046{
2047    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2048}
2049
2050PyObject *
2051PyUnicode_DecodeUTF32Stateful(const char *s,
2052			      Py_ssize_t size,
2053			      const char *errors,
2054			      int *byteorder,
2055			      Py_ssize_t *consumed)
2056{
2057    const char *starts = s;
2058    Py_ssize_t startinpos;
2059    Py_ssize_t endinpos;
2060    Py_ssize_t outpos;
2061    PyUnicodeObject *unicode;
2062    Py_UNICODE *p;
2063#ifndef Py_UNICODE_WIDE
2064    int i, pairs;
2065#else
2066    const int pairs = 0;
2067#endif
2068    const unsigned char *q, *e;
2069    int bo = 0;       /* assume native ordering by default */
2070    const char *errmsg = "";
2071    /* Offsets from q for retrieving bytes in the right order. */
2072#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2073    int iorder[] = {0, 1, 2, 3};
2074#else
2075    int iorder[] = {3, 2, 1, 0};
2076#endif
2077    PyObject *errorHandler = NULL;
2078    PyObject *exc = NULL;
2079    /* On narrow builds we split characters outside the BMP into two
2080       codepoints => count how much extra space we need. */
2081#ifndef Py_UNICODE_WIDE
2082    for (i = pairs = 0; i < size/4; i++)
2083	if (((Py_UCS4 *)s)[i] >= 0x10000)
2084	    pairs++;
2085#endif
2086
2087    /* This might be one to much, because of a BOM */
2088    unicode = _PyUnicode_New((size+3)/4+pairs);
2089    if (!unicode)
2090        return NULL;
2091    if (size == 0)
2092        return (PyObject *)unicode;
2093
2094    /* Unpack UTF-32 encoded data */
2095    p = unicode->str;
2096    q = (unsigned char *)s;
2097    e = q + size;
2098
2099    if (byteorder)
2100        bo = *byteorder;
2101
2102    /* Check for BOM marks (U+FEFF) in the input and adjust current
2103       byte order setting accordingly. In native mode, the leading BOM
2104       mark is skipped, in all other modes, it is copied to the output
2105       stream as-is (giving a ZWNBSP character). */
2106    if (bo == 0) {
2107        if (size >= 4) {
2108            const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2109                                (q[iorder[1]] << 8) | q[iorder[0]];
2110#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2111	    if (bom == 0x0000FEFF) {
2112		q += 4;
2113		bo = -1;
2114	    }
2115	    else if (bom == 0xFFFE0000) {
2116		q += 4;
2117		bo = 1;
2118	    }
2119#else
2120	    if (bom == 0x0000FEFF) {
2121		q += 4;
2122		bo = 1;
2123	    }
2124	    else if (bom == 0xFFFE0000) {
2125		q += 4;
2126		bo = -1;
2127	    }
2128#endif
2129	}
2130    }
2131
2132    if (bo == -1) {
2133        /* force LE */
2134        iorder[0] = 0;
2135        iorder[1] = 1;
2136        iorder[2] = 2;
2137        iorder[3] = 3;
2138    }
2139    else if (bo == 1) {
2140        /* force BE */
2141        iorder[0] = 3;
2142        iorder[1] = 2;
2143        iorder[2] = 1;
2144        iorder[3] = 0;
2145    }
2146
2147    while (q < e) {
2148	Py_UCS4 ch;
2149	/* remaining bytes at the end? (size should be divisible by 4) */
2150	if (e-q<4) {
2151	    if (consumed)
2152		break;
2153	    errmsg = "truncated data";
2154	    startinpos = ((const char *)q)-starts;
2155	    endinpos = ((const char *)e)-starts;
2156	    goto utf32Error;
2157	    /* The remaining input chars are ignored if the callback
2158	       chooses to skip the input */
2159	}
2160	ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2161	     (q[iorder[1]] << 8) | q[iorder[0]];
2162
2163	if (ch >= 0x110000)
2164	{
2165	    errmsg = "codepoint not in range(0x110000)";
2166	    startinpos = ((const char *)q)-starts;
2167	    endinpos = startinpos+4;
2168	    goto utf32Error;
2169	}
2170#ifndef Py_UNICODE_WIDE
2171	if (ch >= 0x10000)
2172	{
2173	    *p++ = 0xD800 | ((ch-0x10000) >> 10);
2174	    *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2175	}
2176	else
2177#endif
2178	    *p++ = ch;
2179	q += 4;
2180	continue;
2181    utf32Error:
2182	outpos = p-PyUnicode_AS_UNICODE(unicode);
2183	if (unicode_decode_call_errorhandler(
2184	         errors, &errorHandler,
2185	         "utf32", errmsg,
2186	         &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2187	         (PyObject **)&unicode, &outpos, &p))
2188	    goto onError;
2189    }
2190
2191    if (byteorder)
2192        *byteorder = bo;
2193
2194    if (consumed)
2195	*consumed = (const char *)q-starts;
2196
2197    /* Adjust length */
2198    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2199        goto onError;
2200
2201    Py_XDECREF(errorHandler);
2202    Py_XDECREF(exc);
2203    return (PyObject *)unicode;
2204
2205onError:
2206    Py_DECREF(unicode);
2207    Py_XDECREF(errorHandler);
2208    Py_XDECREF(exc);
2209    return NULL;
2210}
2211
2212PyObject *
2213PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2214		      Py_ssize_t size,
2215		      const char *errors,
2216		      int byteorder)
2217{
2218    PyObject *v;
2219    unsigned char *p;
2220#ifndef Py_UNICODE_WIDE
2221    int i, pairs;
2222#else
2223    const int pairs = 0;
2224#endif
2225    /* Offsets from p for storing byte pairs in the right order. */
2226#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2227    int iorder[] = {0, 1, 2, 3};
2228#else
2229    int iorder[] = {3, 2, 1, 0};
2230#endif
2231
2232#define STORECHAR(CH)                       \
2233    do {                                    \
2234        p[iorder[3]] = ((CH) >> 24) & 0xff; \
2235        p[iorder[2]] = ((CH) >> 16) & 0xff; \
2236        p[iorder[1]] = ((CH) >> 8) & 0xff;  \
2237        p[iorder[0]] = (CH) & 0xff;         \
2238        p += 4;                             \
2239    } while(0)
2240
2241    /* In narrow builds we can output surrogate pairs as one codepoint,
2242       so we need less space. */
2243#ifndef Py_UNICODE_WIDE
2244    for (i = pairs = 0; i < size-1; i++)
2245	if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2246	    0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2247	    pairs++;
2248#endif
2249    v = PyBytes_FromStringAndSize(NULL,
2250		  4 * (size - pairs + (byteorder == 0)));
2251    if (v == NULL)
2252        return NULL;
2253
2254    p = (unsigned char *)PyBytes_AS_STRING(v);
2255    if (byteorder == 0)
2256	STORECHAR(0xFEFF);
2257    if (size == 0)
2258        return v;
2259
2260    if (byteorder == -1) {
2261        /* force LE */
2262        iorder[0] = 0;
2263        iorder[1] = 1;
2264        iorder[2] = 2;
2265        iorder[3] = 3;
2266    }
2267    else if (byteorder == 1) {
2268        /* force BE */
2269        iorder[0] = 3;
2270        iorder[1] = 2;
2271        iorder[2] = 1;
2272        iorder[3] = 0;
2273    }
2274
2275    while (size-- > 0) {
2276	Py_UCS4 ch = *s++;
2277#ifndef Py_UNICODE_WIDE
2278	if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2279	    Py_UCS4 ch2 = *s;
2280	    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2281		ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2282		s++;
2283		size--;
2284	    }
2285	}
2286#endif
2287        STORECHAR(ch);
2288    }
2289    return v;
2290#undef STORECHAR
2291}
2292
2293PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2294{
2295    if (!PyUnicode_Check(unicode)) {
2296        PyErr_BadArgument();
2297        return NULL;
2298    }
2299    return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2300				 PyUnicode_GET_SIZE(unicode),
2301				 NULL,
2302				 0);
2303}
2304
2305/* --- UTF-16 Codec ------------------------------------------------------- */
2306
2307PyObject *
2308PyUnicode_DecodeUTF16(const char *s,
2309		      Py_ssize_t size,
2310		      const char *errors,
2311		      int *byteorder)
2312{
2313    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2314}
2315
2316PyObject *
2317PyUnicode_DecodeUTF16Stateful(const char *s,
2318			      Py_ssize_t size,
2319			      const char *errors,
2320			      int *byteorder,
2321			      Py_ssize_t *consumed)
2322{
2323    const char *starts = s;
2324    Py_ssize_t startinpos;
2325    Py_ssize_t endinpos;
2326    Py_ssize_t outpos;
2327    PyUnicodeObject *unicode;
2328    Py_UNICODE *p;
2329    const unsigned char *q, *e;
2330    int bo = 0;       /* assume native ordering by default */
2331    const char *errmsg = "";
2332    /* Offsets from q for retrieving byte pairs in the right order. */
2333#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2334    int ihi = 1, ilo = 0;
2335#else
2336    int ihi = 0, ilo = 1;
2337#endif
2338    PyObject *errorHandler = NULL;
2339    PyObject *exc = NULL;
2340
2341    /* Note: size will always be longer than the resulting Unicode
2342       character count */
2343    unicode = _PyUnicode_New(size);
2344    if (!unicode)
2345        return NULL;
2346    if (size == 0)
2347        return (PyObject *)unicode;
2348
2349    /* Unpack UTF-16 encoded data */
2350    p = unicode->str;
2351    q = (unsigned char *)s;
2352    e = q + size;
2353
2354    if (byteorder)
2355        bo = *byteorder;
2356
2357    /* Check for BOM marks (U+FEFF) in the input and adjust current
2358       byte order setting accordingly. In native mode, the leading BOM
2359       mark is skipped, in all other modes, it is copied to the output
2360       stream as-is (giving a ZWNBSP character). */
2361    if (bo == 0) {
2362        if (size >= 2) {
2363            const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
2364#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2365	    if (bom == 0xFEFF) {
2366		q += 2;
2367		bo = -1;
2368	    }
2369	    else if (bom == 0xFFFE) {
2370		q += 2;
2371		bo = 1;
2372	    }
2373#else
2374	    if (bom == 0xFEFF) {
2375		q += 2;
2376		bo = 1;
2377	    }
2378	    else if (bom == 0xFFFE) {
2379		q += 2;
2380		bo = -1;
2381	    }
2382#endif
2383	}
2384    }
2385
2386    if (bo == -1) {
2387        /* force LE */
2388        ihi = 1;
2389        ilo = 0;
2390    }
2391    else if (bo == 1) {
2392        /* force BE */
2393        ihi = 0;
2394        ilo = 1;
2395    }
2396
2397    while (q < e) {
2398	Py_UNICODE ch;
2399	/* remaining bytes at the end? (size should be even) */
2400	if (e-q<2) {
2401	    if (consumed)
2402		break;
2403	    errmsg = "truncated data";
2404	    startinpos = ((const char *)q)-starts;
2405	    endinpos = ((const char *)e)-starts;
2406	    goto utf16Error;
2407	    /* The remaining input chars are ignored if the callback
2408	       chooses to skip the input */
2409	}
2410	ch = (q[ihi] << 8) | q[ilo];
2411
2412	q += 2;
2413
2414	if (ch < 0xD800 || ch > 0xDFFF) {
2415	    *p++ = ch;
2416	    continue;
2417	}
2418
2419	/* UTF-16 code pair: */
2420	if (q >= e) {
2421	    errmsg = "unexpected end of data";
2422	    startinpos = (((const char *)q)-2)-starts;
2423	    endinpos = ((const char *)e)-starts;
2424	    goto utf16Error;
2425	}
2426	if (0xD800 <= ch && ch <= 0xDBFF) {
2427	    Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2428	    q += 2;
2429	    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2430#ifndef Py_UNICODE_WIDE
2431		*p++ = ch;
2432		*p++ = ch2;
2433#else
2434		*p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2435#endif
2436		continue;
2437	    }
2438	    else {
2439                errmsg = "illegal UTF-16 surrogate";
2440		startinpos = (((const char *)q)-4)-starts;
2441		endinpos = startinpos+2;
2442		goto utf16Error;
2443	    }
2444
2445	}
2446	errmsg = "illegal encoding";
2447	startinpos = (((const char *)q)-2)-starts;
2448	endinpos = startinpos+2;
2449	/* Fall through to report the error */
2450
2451    utf16Error:
2452	outpos = p-PyUnicode_AS_UNICODE(unicode);
2453	if (unicode_decode_call_errorhandler(
2454	         errors, &errorHandler,
2455	         "utf16", errmsg,
2456	         &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2457	         (PyObject **)&unicode, &outpos, &p))
2458	    goto onError;
2459    }
2460
2461    if (byteorder)
2462        *byteorder = bo;
2463
2464    if (consumed)
2465	*consumed = (const char *)q-starts;
2466
2467    /* Adjust length */
2468    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2469        goto onError;
2470
2471    Py_XDECREF(errorHandler);
2472    Py_XDECREF(exc);
2473    return (PyObject *)unicode;
2474
2475onError:
2476    Py_DECREF(unicode);
2477    Py_XDECREF(errorHandler);
2478    Py_XDECREF(exc);
2479    return NULL;
2480}
2481
2482PyObject *
2483PyUnicode_EncodeUTF16(const Py_UNICODE *s,
2484		      Py_ssize_t size,
2485		      const char *errors,
2486		      int byteorder)
2487{
2488    PyObject *v;
2489    unsigned char *p;
2490#ifdef Py_UNICODE_WIDE
2491    int i, pairs;
2492#else
2493    const int pairs = 0;
2494#endif
2495    /* Offsets from p for storing byte pairs in the right order. */
2496#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2497    int ihi = 1, ilo = 0;
2498#else
2499    int ihi = 0, ilo = 1;
2500#endif
2501
2502#define STORECHAR(CH)                   \
2503    do {                                \
2504        p[ihi] = ((CH) >> 8) & 0xff;    \
2505        p[ilo] = (CH) & 0xff;           \
2506        p += 2;                         \
2507    } while(0)
2508
2509#ifdef Py_UNICODE_WIDE
2510    for (i = pairs = 0; i < size; i++)
2511	if (s[i] >= 0x10000)
2512	    pairs++;
2513#endif
2514    v = PyBytes_FromStringAndSize(NULL,
2515		  2 * (size + pairs + (byteorder == 0)));
2516    if (v == NULL)
2517        return NULL;
2518
2519    p = (unsigned char *)PyBytes_AS_STRING(v);
2520    if (byteorder == 0)
2521	STORECHAR(0xFEFF);
2522    if (size == 0)
2523        return v;
2524
2525    if (byteorder == -1) {
2526        /* force LE */
2527        ihi = 1;
2528        ilo = 0;
2529    }
2530    else if (byteorder == 1) {
2531        /* force BE */
2532        ihi = 0;
2533        ilo = 1;
2534    }
2535
2536    while (size-- > 0) {
2537	Py_UNICODE ch = *s++;
2538	Py_UNICODE ch2 = 0;
2539#ifdef Py_UNICODE_WIDE
2540	if (ch >= 0x10000) {
2541	    ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2542	    ch  = 0xD800 | ((ch-0x10000) >> 10);
2543	}
2544#endif
2545        STORECHAR(ch);
2546        if (ch2)
2547            STORECHAR(ch2);
2548    }
2549    return v;
2550#undef STORECHAR
2551}
2552
2553PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2554{
2555    if (!PyUnicode_Check(unicode)) {
2556        PyErr_BadArgument();
2557        return NULL;
2558    }
2559    return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2560				 PyUnicode_GET_SIZE(unicode),
2561				 NULL,
2562				 0);
2563}
2564
2565/* --- Unicode Escape Codec ----------------------------------------------- */
2566
2567static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
2568
2569PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
2570					Py_ssize_t size,
2571					const char *errors)
2572{
2573    const char *starts = s;
2574    Py_ssize_t startinpos;
2575    Py_ssize_t endinpos;
2576    Py_ssize_t outpos;
2577    int i;
2578    PyUnicodeObject *v;
2579    Py_UNICODE *p;
2580    const char *end;
2581    char* message;
2582    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
2583    PyObject *errorHandler = NULL;
2584    PyObject *exc = NULL;
2585
2586    /* Escaped strings will always be longer than the resulting
2587       Unicode string, so we start with size here and then reduce the
2588       length after conversion to the true value.
2589       (but if the error callback returns a long replacement string
2590       we'll have to allocate more space) */
2591    v = _PyUnicode_New(size);
2592    if (v == NULL)
2593        goto onError;
2594    if (size == 0)
2595        return (PyObject *)v;
2596
2597    p = PyUnicode_AS_UNICODE(v);
2598    end = s + size;
2599
2600    while (s < end) {
2601        unsigned char c;
2602        Py_UNICODE x;
2603        int digits;
2604
2605        /* Non-escape characters are interpreted as Unicode ordinals */
2606        if (*s != '\\') {
2607            *p++ = (unsigned char) *s++;
2608            continue;
2609        }
2610
2611        startinpos = s-starts;
2612        /* \ - Escapes */
2613        s++;
2614        switch (*s++) {
2615
2616        /* \x escapes */
2617        case '\n': break;
2618        case '\\': *p++ = '\\'; break;
2619        case '\'': *p++ = '\''; break;
2620        case '\"': *p++ = '\"'; break;
2621        case 'b': *p++ = '\b'; break;
2622        case 'f': *p++ = '\014'; break; /* FF */
2623        case 't': *p++ = '\t'; break;
2624        case 'n': *p++ = '\n'; break;
2625        case 'r': *p++ = '\r'; break;
2626        case 'v': *p++ = '\013'; break; /* VT */
2627        case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2628
2629        /* \OOO (octal) escapes */
2630        case '0': case '1': case '2': case '3':
2631        case '4': case '5': case '6': case '7':
2632            x = s[-1] - '0';
2633            if ('0' <= *s && *s <= '7') {
2634                x = (x<<3) + *s++ - '0';
2635                if ('0' <= *s && *s <= '7')
2636                    x = (x<<3) + *s++ - '0';
2637            }
2638            *p++ = x;
2639            break;
2640
2641        /* hex escapes */
2642        /* \xXX */
2643        case 'x':
2644            digits = 2;
2645            message = "truncated \\xXX escape";
2646            goto hexescape;
2647
2648        /* \uXXXX */
2649        case 'u':
2650            digits = 4;
2651            message = "truncated \\uXXXX escape";
2652            goto hexescape;
2653
2654        /* \UXXXXXXXX */
2655        case 'U':
2656            digits = 8;
2657            message = "truncated \\UXXXXXXXX escape";
2658        hexescape:
2659            chr = 0;
2660            outpos = p-PyUnicode_AS_UNICODE(v);
2661            if (s+digits>end) {
2662                endinpos = size;
2663                if (unicode_decode_call_errorhandler(
2664                    errors, &errorHandler,
2665                    "unicodeescape", "end of string in escape sequence",
2666                    &starts, &end, &startinpos, &endinpos, &exc, &s,
2667                    (PyObject **)&v, &outpos, &p))
2668                    goto onError;
2669                goto nextByte;
2670            }
2671            for (i = 0; i < digits; ++i) {
2672                c = (unsigned char) s[i];
2673                if (!isxdigit(c)) {
2674                    endinpos = (s+i+1)-starts;
2675                    if (unicode_decode_call_errorhandler(
2676                        errors, &errorHandler,
2677                        "unicodeescape", message,
2678                        &starts, &end, &startinpos, &endinpos, &exc, &s,
2679                        (PyObject **)&v, &outpos, &p))
2680                        goto onError;
2681                    goto nextByte;
2682                }
2683                chr = (chr<<4) & ~0xF;
2684                if (c >= '0' && c <= '9')
2685                    chr += c - '0';
2686                else if (c >= 'a' && c <= 'f')
2687                    chr += 10 + c - 'a';
2688                else
2689                    chr += 10 + c - 'A';
2690            }
2691            s += i;
2692            if (chr == 0xffffffff && PyErr_Occurred())
2693                /* _decoding_error will have already written into the
2694                   target buffer. */
2695                break;
2696        store:
2697            /* when we get here, chr is a 32-bit unicode character */
2698            if (chr <= 0xffff)
2699                /* UCS-2 character */
2700                *p++ = (Py_UNICODE) chr;
2701            else if (chr <= 0x10ffff) {
2702                /* UCS-4 character. Either store directly, or as
2703                   surrogate pair. */
2704#ifdef Py_UNICODE_WIDE
2705                *p++ = chr;
2706#else
2707                chr -= 0x10000L;
2708                *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
2709                *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
2710#endif
2711            } else {
2712                endinpos = s-starts;
2713                outpos = p-PyUnicode_AS_UNICODE(v);
2714                if (unicode_decode_call_errorhandler(
2715                    errors, &errorHandler,
2716                    "unicodeescape", "illegal Unicode character",
2717                    &starts, &end, &startinpos, &endinpos, &exc, &s,
2718                    (PyObject **)&v, &outpos, &p))
2719                    goto onError;
2720            }
2721            break;
2722
2723        /* \N{name} */
2724        case 'N':
2725            message = "malformed \\N character escape";
2726            if (ucnhash_CAPI == NULL) {
2727                /* load the unicode data module */
2728                PyObject *m, *api;
2729                m = PyImport_ImportModule("unicodedata");
2730                if (m == NULL)
2731                    goto ucnhashError;
2732                api = PyObject_GetAttrString(m, "ucnhash_CAPI");
2733                Py_DECREF(m);
2734                if (api == NULL)
2735                    goto ucnhashError;
2736                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
2737                Py_DECREF(api);
2738                if (ucnhash_CAPI == NULL)
2739                    goto ucnhashError;
2740            }
2741            if (*s == '{') {
2742                const char *start = s+1;
2743                /* look for the closing brace */
2744                while (*s != '}' && s < end)
2745                    s++;
2746                if (s > start && s < end && *s == '}') {
2747                    /* found a name.  look it up in the unicode database */
2748                    message = "unknown Unicode character name";
2749                    s++;
2750                    if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
2751                        goto store;
2752                }
2753            }
2754            endinpos = s-starts;
2755            outpos = p-PyUnicode_AS_UNICODE(v);
2756            if (unicode_decode_call_errorhandler(
2757                errors, &errorHandler,
2758                "unicodeescape", message,
2759                &starts, &end, &startinpos, &endinpos, &exc, &s,
2760                (PyObject **)&v, &outpos, &p))
2761                goto onError;
2762            break;
2763
2764        default:
2765            if (s > end) {
2766                message = "\\ at end of string";
2767                s--;
2768                endinpos = s-starts;
2769                outpos = p-PyUnicode_AS_UNICODE(v);
2770                if (unicode_decode_call_errorhandler(
2771                    errors, &errorHandler,
2772                    "unicodeescape", message,
2773                    &starts, &end, &startinpos, &endinpos, &exc, &s,
2774                    (PyObject **)&v, &outpos, &p))
2775                    goto onError;
2776            }
2777            else {
2778                *p++ = '\\';
2779                *p++ = (unsigned char)s[-1];
2780            }
2781            break;
2782        }
2783        nextByte:
2784        ;
2785    }
2786    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2787        goto onError;
2788    Py_XDECREF(errorHandler);
2789    Py_XDECREF(exc);
2790    return (PyObject *)v;
2791
2792ucnhashError:
2793    PyErr_SetString(
2794        PyExc_UnicodeError,
2795        "\\N escapes not supported (can't load unicodedata module)"
2796        );
2797    Py_XDECREF(v);
2798    Py_XDECREF(errorHandler);
2799    Py_XDECREF(exc);
2800    return NULL;
2801
2802onError:
2803    Py_XDECREF(v);
2804    Py_XDECREF(errorHandler);
2805    Py_XDECREF(exc);
2806    return NULL;
2807}
2808
2809/* Return a Unicode-Escape string version of the Unicode object.
2810
2811   If quotes is true, the string is enclosed in u"" or u'' quotes as
2812   appropriate.
2813
2814*/
2815
2816Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2817                                      Py_ssize_t size,
2818                                      Py_UNICODE ch)
2819{
2820    /* like wcschr, but doesn't stop at NULL characters */
2821
2822    while (size-- > 0) {
2823        if (*s == ch)
2824            return s;
2825        s++;
2826    }
2827
2828    return NULL;
2829}
2830
2831static const char *hexdigits = "0123456789abcdef";
2832
2833PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2834					Py_ssize_t size)
2835{
2836    PyObject *repr;
2837    char *p;
2838
2839    /* XXX(nnorwitz): rather than over-allocating, it would be
2840       better to choose a different scheme.  Perhaps scan the
2841       first N-chars of the string and allocate based on that size.
2842    */
2843    /* Initial allocation is based on the longest-possible unichr
2844       escape.
2845
2846       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2847       unichr, so in this case it's the longest unichr escape. In
2848       narrow (UTF-16) builds this is five chars per source unichr
2849       since there are two unichrs in the surrogate pair, so in narrow
2850       (UTF-16) builds it's not the longest unichr escape.
2851
2852       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2853       so in the narrow (UTF-16) build case it's the longest unichr
2854       escape.
2855    */
2856
2857    repr = PyBytes_FromStringAndSize(NULL,
2858#ifdef Py_UNICODE_WIDE
2859        + 10*size
2860#else
2861        + 6*size
2862#endif
2863        + 1);
2864    if (repr == NULL)
2865        return NULL;
2866
2867    p = PyBytes_AS_STRING(repr);
2868
2869    while (size-- > 0) {
2870        Py_UNICODE ch = *s++;
2871
2872        /* Escape backslashes */
2873        if (ch == '\\') {
2874            *p++ = '\\';
2875            *p++ = (char) ch;
2876            continue;
2877        }
2878
2879#ifdef Py_UNICODE_WIDE
2880        /* Map 21-bit characters to '\U00xxxxxx' */
2881        else if (ch >= 0x10000) {
2882            *p++ = '\\';
2883            *p++ = 'U';
2884            *p++ = hexdigits[(ch >> 28) & 0x0000000F];
2885            *p++ = hexdigits[(ch >> 24) & 0x0000000F];
2886            *p++ = hexdigits[(ch >> 20) & 0x0000000F];
2887            *p++ = hexdigits[(ch >> 16) & 0x0000000F];
2888            *p++ = hexdigits[(ch >> 12) & 0x0000000F];
2889            *p++ = hexdigits[(ch >> 8) & 0x0000000F];
2890            *p++ = hexdigits[(ch >> 4) & 0x0000000F];
2891            *p++ = hexdigits[ch & 0x0000000F];
2892	    continue;
2893        }
2894#else
2895	/* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
2896	else if (ch >= 0xD800 && ch < 0xDC00) {
2897	    Py_UNICODE ch2;
2898	    Py_UCS4 ucs;
2899
2900	    ch2 = *s++;
2901	    size--;
2902	    if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2903		ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2904		*p++ = '\\';
2905		*p++ = 'U';
2906		*p++ = hexdigits[(ucs >> 28) & 0x0000000F];
2907		*p++ = hexdigits[(ucs >> 24) & 0x0000000F];
2908		*p++ = hexdigits[(ucs >> 20) & 0x0000000F];
2909		*p++ = hexdigits[(ucs >> 16) & 0x0000000F];
2910		*p++ = hexdigits[(ucs >> 12) & 0x0000000F];
2911		*p++ = hexdigits[(ucs >> 8) & 0x0000000F];
2912		*p++ = hexdigits[(ucs >> 4) & 0x0000000F];
2913		*p++ = hexdigits[ucs & 0x0000000F];
2914		continue;
2915	    }
2916	    /* Fall through: isolated surrogates are copied as-is */
2917	    s--;
2918	    size++;
2919	}
2920#endif
2921
2922        /* Map 16-bit characters to '\uxxxx' */
2923        if (ch >= 256) {
2924            *p++ = '\\';
2925            *p++ = 'u';
2926            *p++ = hexdigits[(ch >> 12) & 0x000F];
2927            *p++ = hexdigits[(ch >> 8) & 0x000F];
2928            *p++ = hexdigits[(ch >> 4) & 0x000F];
2929            *p++ = hexdigits[ch & 0x000F];
2930        }
2931
2932        /* Map special whitespace to '\t', \n', '\r' */
2933        else if (ch == '\t') {
2934            *p++ = '\\';
2935            *p++ = 't';
2936        }
2937        else if (ch == '\n') {
2938            *p++ = '\\';
2939            *p++ = 'n';
2940        }
2941        else if (ch == '\r') {
2942            *p++ = '\\';
2943            *p++ = 'r';
2944        }
2945
2946        /* Map non-printable US ASCII to '\xhh' */
2947        else if (ch < ' ' || ch >= 0x7F) {
2948            *p++ = '\\';
2949            *p++ = 'x';
2950            *p++ = hexdigits[(ch >> 4) & 0x000F];
2951            *p++ = hexdigits[ch & 0x000F];
2952        }
2953
2954        /* Copy everything else as-is */
2955        else
2956            *p++ = (char) ch;
2957    }
2958
2959    *p = '\0';
2960    if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
2961        Py_DECREF(repr);
2962        return NULL;
2963    }
2964    return repr;
2965}
2966
2967PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2968{
2969    PyObject *s, *result;
2970    if (!PyUnicode_Check(unicode)) {
2971        PyErr_BadArgument();
2972        return NULL;
2973    }
2974    s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2975                                      PyUnicode_GET_SIZE(unicode));
2976
2977    if (!s)
2978        return NULL;
2979    result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2980                                        PyBytes_GET_SIZE(s));
2981    Py_DECREF(s);
2982    return result;
2983}
2984
2985/* --- Raw Unicode Escape Codec ------------------------------------------- */
2986
2987PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2988					   Py_ssize_t size,
2989					   const char *errors)
2990{
2991    const char *starts = s;
2992    Py_ssize_t startinpos;
2993    Py_ssize_t endinpos;
2994    Py_ssize_t outpos;
2995    PyUnicodeObject *v;
2996    Py_UNICODE *p;
2997    const char *end;
2998    const char *bs;
2999    PyObject *errorHandler = NULL;
3000    PyObject *exc = NULL;
3001
3002    /* Escaped strings will always be longer than the resulting
3003       Unicode string, so we start with size here and then reduce the
3004       length after conversion to the true value. (But decoding error
3005       handler might have to resize the string) */
3006    v = _PyUnicode_New(size);
3007    if (v == NULL)
3008	goto onError;
3009    if (size == 0)
3010	return (PyObject *)v;
3011    p = PyUnicode_AS_UNICODE(v);
3012    end = s + size;
3013    while (s < end) {
3014	unsigned char c;
3015	Py_UCS4 x;
3016	int i;
3017        int count;
3018
3019	/* Non-escape characters are interpreted as Unicode ordinals */
3020	if (*s != '\\') {
3021	    *p++ = (unsigned char)*s++;
3022	    continue;
3023	}
3024	startinpos = s-starts;
3025
3026	/* \u-escapes are only interpreted iff the number of leading
3027	   backslashes if odd */
3028	bs = s;
3029	for (;s < end;) {
3030	    if (*s != '\\')
3031		break;
3032	    *p++ = (unsigned char)*s++;
3033	}
3034	if (((s - bs) & 1) == 0 ||
3035	    s >= end ||
3036	    (*s != 'u' && *s != 'U')) {
3037	    continue;
3038	}
3039	p--;
3040        count = *s=='u' ? 4 : 8;
3041	s++;
3042
3043	/* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3044	outpos = p-PyUnicode_AS_UNICODE(v);
3045	for (x = 0, i = 0; i < count; ++i, ++s) {
3046	    c = (unsigned char)*s;
3047	    if (!isxdigit(c)) {
3048		endinpos = s-starts;
3049		if (unicode_decode_call_errorhandler(
3050		    errors, &errorHandler,
3051		    "rawunicodeescape", "truncated \\uXXXX",
3052		    &starts, &end, &startinpos, &endinpos, &exc, &s,
3053		    (PyObject **)&v, &outpos, &p))
3054		    goto onError;
3055		goto nextByte;
3056	    }
3057	    x = (x<<4) & ~0xF;
3058	    if (c >= '0' && c <= '9')
3059		x += c - '0';
3060	    else if (c >= 'a' && c <= 'f')
3061		x += 10 + c - 'a';
3062	    else
3063		x += 10 + c - 'A';
3064	}
3065#ifndef Py_UNICODE_WIDE
3066        if (x > 0x10000) {
3067            if (unicode_decode_call_errorhandler(
3068                    errors, &errorHandler,
3069                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
3070		    &starts, &end, &startinpos, &endinpos, &exc, &s,
3071		    (PyObject **)&v, &outpos, &p))
3072		    goto onError;
3073        }
3074#endif
3075	*p++ = x;
3076	nextByte:
3077	;
3078    }
3079    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3080	goto onError;
3081    Py_XDECREF(errorHandler);
3082    Py_XDECREF(exc);
3083    return (PyObject *)v;
3084
3085 onError:
3086    Py_XDECREF(v);
3087    Py_XDECREF(errorHandler);
3088    Py_XDECREF(exc);
3089    return NULL;
3090}
3091
3092PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
3093					   Py_ssize_t size)
3094{
3095    PyObject *repr;
3096    char *p;
3097    char *q;
3098
3099#ifdef Py_UNICODE_WIDE
3100    repr = PyBytes_FromStringAndSize(NULL, 10 * size);
3101#else
3102    repr = PyBytes_FromStringAndSize(NULL, 6 * size);
3103#endif
3104    if (repr == NULL)
3105        return NULL;
3106    if (size == 0)
3107	return repr;
3108
3109    p = q = PyBytes_AS_STRING(repr);
3110    while (size-- > 0) {
3111        Py_UNICODE ch = *s++;
3112#ifdef Py_UNICODE_WIDE
3113	/* Map 32-bit characters to '\Uxxxxxxxx' */
3114	if (ch >= 0x10000) {
3115            *p++ = '\\';
3116            *p++ = 'U';
3117            *p++ = hexdigits[(ch >> 28) & 0xf];
3118            *p++ = hexdigits[(ch >> 24) & 0xf];
3119            *p++ = hexdigits[(ch >> 20) & 0xf];
3120            *p++ = hexdigits[(ch >> 16) & 0xf];
3121            *p++ = hexdigits[(ch >> 12) & 0xf];
3122            *p++ = hexdigits[(ch >> 8) & 0xf];
3123            *p++ = hexdigits[(ch >> 4) & 0xf];
3124            *p++ = hexdigits[ch & 15];
3125        }
3126        else
3127#endif
3128	/* Map 16-bit characters to '\uxxxx' */
3129	if (ch >= 256) {
3130            *p++ = '\\';
3131            *p++ = 'u';
3132            *p++ = hexdigits[(ch >> 12) & 0xf];
3133            *p++ = hexdigits[(ch >> 8) & 0xf];
3134            *p++ = hexdigits[(ch >> 4) & 0xf];
3135            *p++ = hexdigits[ch & 15];
3136        }
3137	/* Copy everything else as-is */
3138	else
3139            *p++ = (char) ch;
3140    }
3141    *p = '\0';
3142    if (PyBytes_Resize(repr, p - q)) {
3143        Py_DECREF(repr);
3144        return NULL;
3145    }
3146    return repr;
3147}
3148
3149PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3150{
3151    PyObject *s, *result;
3152    if (!PyUnicode_Check(unicode)) {
3153        PyErr_BadArgument();
3154        return NULL;
3155    }
3156    s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3157                                         PyUnicode_GET_SIZE(unicode));
3158
3159    if (!s)
3160        return NULL;
3161    result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
3162                                        PyBytes_GET_SIZE(s));
3163    Py_DECREF(s);
3164    return result;
3165}
3166
3167/* --- Unicode Internal Codec ------------------------------------------- */
3168
3169PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
3170					   Py_ssize_t size,
3171					   const char *errors)
3172{
3173    const char *starts = s;
3174    Py_ssize_t startinpos;
3175    Py_ssize_t endinpos;
3176    Py_ssize_t outpos;
3177    PyUnicodeObject *v;
3178    Py_UNICODE *p;
3179    const char *end;
3180    const char *reason;
3181    PyObject *errorHandler = NULL;
3182    PyObject *exc = NULL;
3183
3184#ifdef Py_UNICODE_WIDE
3185    Py_UNICODE unimax = PyUnicode_GetMax();
3186#endif
3187
3188    /* XXX overflow detection missing */
3189    v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3190    if (v == NULL)
3191	goto onError;
3192    if (PyUnicode_GetSize((PyObject *)v) == 0)
3193	return (PyObject *)v;
3194    p = PyUnicode_AS_UNICODE(v);
3195    end = s + size;
3196
3197    while (s < end) {
3198        memcpy(p, s, sizeof(Py_UNICODE));
3199        /* We have to sanity check the raw data, otherwise doom looms for
3200           some malformed UCS-4 data. */
3201        if (
3202            #ifdef Py_UNICODE_WIDE
3203            *p > unimax || *p < 0 ||
3204            #endif
3205            end-s < Py_UNICODE_SIZE
3206            )
3207            {
3208            startinpos = s - starts;
3209            if (end-s < Py_UNICODE_SIZE) {
3210                endinpos = end-starts;
3211                reason = "truncated input";
3212            }
3213            else {
3214                endinpos = s - starts + Py_UNICODE_SIZE;
3215                reason = "illegal code point (> 0x10FFFF)";
3216            }
3217            outpos = p - PyUnicode_AS_UNICODE(v);
3218            if (unicode_decode_call_errorhandler(
3219                    errors, &errorHandler,
3220                    "unicode_internal", reason,
3221                    &starts, &end, &startinpos, &endinpos, &exc, &s,
3222                    (PyObject **)&v, &outpos, &p)) {
3223                goto onError;
3224            }
3225        }
3226        else {
3227            p++;
3228            s += Py_UNICODE_SIZE;
3229        }
3230    }
3231
3232    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3233        goto onError;
3234    Py_XDECREF(errorHandler);
3235    Py_XDECREF(exc);
3236    return (PyObject *)v;
3237
3238 onError:
3239    Py_XDECREF(v);
3240    Py_XDECREF(errorHandler);
3241    Py_XDECREF(exc);
3242    return NULL;
3243}
3244
3245/* --- Latin-1 Codec ------------------------------------------------------ */
3246
3247PyObject *PyUnicode_DecodeLatin1(const char *s,
3248				 Py_ssize_t size,
3249				 const char *errors)
3250{
3251    PyUnicodeObject *v;
3252    Py_UNICODE *p;
3253
3254    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3255    if (size == 1) {
3256	Py_UNICODE r = *(unsigned char*)s;
3257	return PyUnicode_FromUnicode(&r, 1);
3258    }
3259
3260    v = _PyUnicode_New(size);
3261    if (v == NULL)
3262	goto onError;
3263    if (size == 0)
3264	return (PyObject *)v;
3265    p = PyUnicode_AS_UNICODE(v);
3266    while (size-- > 0)
3267	*p++ = (unsigned char)*s++;
3268    return (PyObject *)v;
3269
3270 onError:
3271    Py_XDECREF(v);
3272    return NULL;
3273}
3274
3275/* create or adjust a UnicodeEncodeError */
3276static void make_encode_exception(PyObject **exceptionObject,
3277    const char *encoding,
3278    const Py_UNICODE *unicode, Py_ssize_t size,
3279    Py_ssize_t startpos, Py_ssize_t endpos,
3280    const char *reason)
3281{
3282    if (*exceptionObject == NULL) {
3283	*exceptionObject = PyUnicodeEncodeError_Create(
3284	    encoding, unicode, size, startpos, endpos, reason);
3285    }
3286    else {
3287	if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3288	    goto onError;
3289	if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3290	    goto onError;
3291	if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3292	    goto onError;
3293	return;
3294	onError:
3295	Py_DECREF(*exceptionObject);
3296	*exceptionObject = NULL;
3297    }
3298}
3299
3300/* raises a UnicodeEncodeError */
3301static void raise_encode_exception(PyObject **exceptionObject,
3302    const char *encoding,
3303    const Py_UNICODE *unicode, Py_ssize_t size,
3304    Py_ssize_t startpos, Py_ssize_t endpos,
3305    const char *reason)
3306{
3307    make_encode_exception(exceptionObject,
3308	encoding, unicode, size, startpos, endpos, reason);
3309    if (*exceptionObject != NULL)
3310	PyCodec_StrictErrors(*exceptionObject);
3311}
3312
3313/* error handling callback helper:
3314   build arguments, call the callback and check the arguments,
3315   put the result into newpos and return the replacement string, which
3316   has to be freed by the caller */
3317static PyObject *unicode_encode_call_errorhandler(const char *errors,
3318    PyObject **errorHandler,
3319    const char *encoding, const char *reason,
3320    const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3321    Py_ssize_t startpos, Py_ssize_t endpos,
3322    Py_ssize_t *newpos)
3323{
3324    static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
3325
3326    PyObject *restuple;
3327    PyObject *resunicode;
3328
3329    if (*errorHandler == NULL) {
3330	*errorHandler = PyCodec_LookupError(errors);
3331        if (*errorHandler == NULL)
3332	    return NULL;
3333    }
3334
3335    make_encode_exception(exceptionObject,
3336	encoding, unicode, size, startpos, endpos, reason);
3337    if (*exceptionObject == NULL)
3338	return NULL;
3339
3340    restuple = PyObject_CallFunctionObjArgs(
3341	*errorHandler, *exceptionObject, NULL);
3342    if (restuple == NULL)
3343	return NULL;
3344    if (!PyTuple_Check(restuple)) {
3345	PyErr_Format(PyExc_TypeError, &argparse[4]);
3346	Py_DECREF(restuple);
3347	return NULL;
3348    }
3349    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3350	&resunicode, newpos)) {
3351	Py_DECREF(restuple);
3352	return NULL;
3353    }
3354    if (*newpos<0)
3355	*newpos = size+*newpos;
3356    if (*newpos<0 || *newpos>size) {
3357	PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3358	Py_DECREF(restuple);
3359	return NULL;
3360    }
3361    Py_INCREF(resunicode);
3362    Py_DECREF(restuple);
3363    return resunicode;
3364}
3365
3366static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
3367				 Py_ssize_t size,
3368				 const char *errors,
3369				 int limit)
3370{
3371    /* output object */
3372    PyObject *res;
3373    /* pointers to the beginning and end+1 of input */
3374    const Py_UNICODE *startp = p;
3375    const Py_UNICODE *endp = p + size;
3376    /* pointer to the beginning of the unencodable characters */
3377    /* const Py_UNICODE *badp = NULL; */
3378    /* pointer into the output */
3379    char *str;
3380    /* current output position */
3381    Py_ssize_t respos = 0;
3382    Py_ssize_t ressize;
3383    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3384    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3385    PyObject *errorHandler = NULL;
3386    PyObject *exc = NULL;
3387    /* the following variable is used for caching string comparisons
3388     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3389    int known_errorHandler = -1;
3390
3391    /* allocate enough for a simple encoding without
3392       replacements, if we need more, we'll resize */
3393    res = PyBytes_FromStringAndSize(NULL, size);
3394    if (res == NULL)
3395        goto onError;
3396    if (size == 0)
3397	return res;
3398    str = PyBytes_AS_STRING(res);
3399    ressize = size;
3400
3401    while (p<endp) {
3402	Py_UNICODE c = *p;
3403
3404	/* can we encode this? */
3405	if (c<limit) {
3406	    /* no overflow check, because we know that the space is enough */
3407	    *str++ = (char)c;
3408	    ++p;
3409	}
3410	else {
3411	    Py_ssize_t unicodepos = p-startp;
3412	    Py_ssize_t requiredsize;
3413	    PyObject *repunicode;
3414	    Py_ssize_t repsize;
3415	    Py_ssize_t newpos;
3416	    Py_ssize_t respos;
3417	    Py_UNICODE *uni2;
3418	    /* startpos for collecting unencodable chars */
3419	    const Py_UNICODE *collstart = p;
3420	    const Py_UNICODE *collend = p;
3421	    /* find all unecodable characters */
3422	    while ((collend < endp) && ((*collend)>=limit))
3423		++collend;
3424	    /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3425	    if (known_errorHandler==-1) {
3426		if ((errors==NULL) || (!strcmp(errors, "strict")))
3427		    known_errorHandler = 1;
3428		else if (!strcmp(errors, "replace"))
3429		    known_errorHandler = 2;
3430		else if (!strcmp(errors, "ignore"))
3431		    known_errorHandler = 3;
3432		else if (!strcmp(errors, "xmlcharrefreplace"))
3433		    known_errorHandler = 4;
3434		else
3435		    known_errorHandler = 0;
3436	    }
3437	    switch (known_errorHandler) {
3438		case 1: /* strict */
3439		    raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3440		    goto onError;
3441		case 2: /* replace */
3442		    while (collstart++<collend)
3443			*str++ = '?'; /* fall through */
3444		case 3: /* ignore */
3445		    p = collend;
3446		    break;
3447		case 4: /* xmlcharrefreplace */
3448		    respos = str - PyBytes_AS_STRING(res);
3449		    /* determine replacement size (temporarily (mis)uses p) */
3450		    for (p = collstart, repsize = 0; p < collend; ++p) {
3451			if (*p<10)
3452			    repsize += 2+1+1;
3453			else if (*p<100)
3454			    repsize += 2+2+1;
3455			else if (*p<1000)
3456			    repsize += 2+3+1;
3457			else if (*p<10000)
3458			    repsize += 2+4+1;
3459#ifndef Py_UNICODE_WIDE
3460			else
3461			    repsize += 2+5+1;
3462#else
3463			else if (*p<100000)
3464			    repsize += 2+5+1;
3465			else if (*p<1000000)
3466			    repsize += 2+6+1;
3467			else
3468			    repsize += 2+7+1;
3469#endif
3470		    }
3471		    requiredsize = respos+repsize+(endp-collend);
3472		    if (requiredsize > ressize) {
3473			if (requiredsize<2*ressize)
3474			    requiredsize = 2*ressize;
3475			if (PyBytes_Resize(res, requiredsize))
3476			    goto onError;
3477			str = PyBytes_AS_STRING(res) + respos;
3478			ressize = requiredsize;
3479		    }
3480		    /* generate replacement (temporarily (mis)uses p) */
3481		    for (p = collstart; p < collend; ++p) {
3482			str += sprintf(str, "&#%d;", (int)*p);
3483		    }
3484		    p = collend;
3485		    break;
3486		default:
3487		    repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3488			encoding, reason, startp, size, &exc,
3489			collstart-startp, collend-startp, &newpos);
3490		    if (repunicode == NULL)
3491			goto onError;
3492		    /* need more space? (at least enough for what we
3493		       have+the replacement+the rest of the string, so
3494		       we won't have to check space for encodable characters) */
3495		    respos = str - PyBytes_AS_STRING(res);
3496		    repsize = PyUnicode_GET_SIZE(repunicode);
3497		    requiredsize = respos+repsize+(endp-collend);
3498		    if (requiredsize > ressize) {
3499			if (requiredsize<2*ressize)
3500			    requiredsize = 2*ressize;
3501			if (PyBytes_Resize(res, requiredsize)) {
3502			    Py_DECREF(repunicode);
3503			    goto onError;
3504			}
3505			str = PyBytes_AS_STRING(res) + respos;
3506			ressize = requiredsize;
3507		    }
3508		    /* check if there is anything unencodable in the replacement
3509		       and copy it to the output */
3510		    for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3511			c = *uni2;
3512			if (c >= limit) {
3513			    raise_encode_exception(&exc, encoding, startp, size,
3514				unicodepos, unicodepos+1, reason);
3515			    Py_DECREF(repunicode);
3516			    goto onError;
3517			}
3518			*str = (char)c;
3519		    }
3520		    p = startp + newpos;
3521		    Py_DECREF(repunicode);
3522	    }
3523	}
3524    }
3525    /* Resize if we allocated to much */
3526    respos = str - PyBytes_AS_STRING(res);
3527    if (respos<ressize)
3528       /* If this falls res will be NULL */
3529	PyBytes_Resize(res, respos);
3530    Py_XDECREF(errorHandler);
3531    Py_XDECREF(exc);
3532    return res;
3533
3534    onError:
3535    Py_XDECREF(res);
3536    Py_XDECREF(errorHandler);
3537    Py_XDECREF(exc);
3538    return NULL;
3539}
3540
3541PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
3542				 Py_ssize_t size,
3543				 const char *errors)
3544{
3545    return unicode_encode_ucs1(p, size, errors, 256);
3546}
3547
3548PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3549{
3550    if (!PyUnicode_Check(unicode)) {
3551	PyErr_BadArgument();
3552	return NULL;
3553    }
3554    return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3555				  PyUnicode_GET_SIZE(unicode),
3556				  NULL);
3557}
3558
3559/* --- 7-bit ASCII Codec -------------------------------------------------- */
3560
3561PyObject *PyUnicode_DecodeASCII(const char *s,
3562				Py_ssize_t size,
3563				const char *errors)
3564{
3565    const char *starts = s;
3566    PyUnicodeObject *v;
3567    Py_UNICODE *p;
3568    Py_ssize_t startinpos;
3569    Py_ssize_t endinpos;
3570    Py_ssize_t outpos;
3571    const char *e;
3572    PyObject *errorHandler = NULL;
3573    PyObject *exc = NULL;
3574
3575    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3576    if (size == 1 && *(unsigned char*)s < 128) {
3577	Py_UNICODE r = *(unsigned char*)s;
3578	return PyUnicode_FromUnicode(&r, 1);
3579    }
3580
3581    v = _PyUnicode_New(size);
3582    if (v == NULL)
3583	goto onError;
3584    if (size == 0)
3585	return (PyObject *)v;
3586    p = PyUnicode_AS_UNICODE(v);
3587    e = s + size;
3588    while (s < e) {
3589	register unsigned char c = (unsigned char)*s;
3590	if (c < 128) {
3591	    *p++ = c;
3592	    ++s;
3593	}
3594	else {
3595	    startinpos = s-starts;
3596	    endinpos = startinpos + 1;
3597	    outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3598	    if (unicode_decode_call_errorhandler(
3599		 errors, &errorHandler,
3600		 "ascii", "ordinal not in range(128)",
3601		 &starts, &e, &startinpos, &endinpos, &exc, &s,
3602		 (PyObject **)&v, &outpos, &p))
3603		goto onError;
3604	}
3605    }
3606    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
3607	if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3608	    goto onError;
3609    Py_XDECREF(errorHandler);
3610    Py_XDECREF(exc);
3611    return (PyObject *)v;
3612
3613 onError:
3614    Py_XDECREF(v);
3615    Py_XDECREF(errorHandler);
3616    Py_XDECREF(exc);
3617    return NULL;
3618}
3619
3620PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
3621				Py_ssize_t size,
3622				const char *errors)
3623{
3624    return unicode_encode_ucs1(p, size, errors, 128);
3625}
3626
3627PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3628{
3629    if (!PyUnicode_Check(unicode)) {
3630	PyErr_BadArgument();
3631	return NULL;
3632    }
3633    return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3634				 PyUnicode_GET_SIZE(unicode),
3635				 NULL);
3636}
3637
3638#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3639
3640/* --- MBCS codecs for Windows -------------------------------------------- */
3641
3642#if SIZEOF_INT < SIZEOF_SSIZE_T
3643#define NEED_RETRY
3644#endif
3645
3646/* XXX This code is limited to "true" double-byte encodings, as
3647   a) it assumes an incomplete character consists of a single byte, and
3648   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3649      encodings, see IsDBCSLeadByteEx documentation. */
3650
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.  A byte that IsDBCSLeadByte accepts only counts
   when the character before it (as found by CharPrev) is the byte
   itself, is not a lead byte, or is two bytes wide. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3661
3662/*
3663 * Decode MBCS string into unicode object. If 'final' is set, converts
3664 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3665 */
3666static int decode_mbcs(PyUnicodeObject **v,
3667			const char *s, /* MBCS string */
3668			int size, /* sizeof MBCS string */
3669			int final)
3670{
3671    Py_UNICODE *p;
3672    Py_ssize_t n = 0;
3673    int usize = 0;
3674
3675    assert(size >= 0);
3676
3677    /* Skip trailing lead-byte unless 'final' is set */
3678    if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3679	--size;
3680
3681    /* First get the size of the result */
3682    if (size > 0) {
3683	usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3684	if (usize == 0) {
3685	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
3686	    return -1;
3687	}
3688    }
3689
3690    if (*v == NULL) {
3691	/* Create unicode object */
3692	*v = _PyUnicode_New(usize);
3693	if (*v == NULL)
3694	    return -1;
3695    }
3696    else {
3697	/* Extend unicode object */
3698	n = PyUnicode_GET_SIZE(*v);
3699	if (_PyUnicode_Resize(v, n + usize) < 0)
3700	    return -1;
3701    }
3702
3703    /* Do the conversion */
3704    if (size > 0) {
3705	p = PyUnicode_AS_UNICODE(*v) + n;
3706	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3707	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
3708	    return -1;
3709	}
3710    }
3711
3712    return size;
3713}
3714
3715PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3716					Py_ssize_t size,
3717					const char *errors,
3718					Py_ssize_t *consumed)
3719{
3720    PyUnicodeObject *v = NULL;
3721    int done;
3722
3723    if (consumed)
3724	*consumed = 0;
3725
3726#ifdef NEED_RETRY
3727  retry:
3728    if (size > INT_MAX)
3729	done = decode_mbcs(&v, s, INT_MAX, 0);
3730    else
3731#endif
3732	done = decode_mbcs(&v, s, (int)size, !consumed);
3733
3734    if (done < 0) {
3735        Py_XDECREF(v);
3736	return NULL;
3737    }
3738
3739    if (consumed)
3740	*consumed += done;
3741
3742#ifdef NEED_RETRY
3743    if (size > INT_MAX) {
3744	s += done;
3745	size -= done;
3746	goto retry;
3747    }
3748#endif
3749
3750    return (PyObject *)v;
3751}
3752
3753PyObject *PyUnicode_DecodeMBCS(const char *s,
3754				Py_ssize_t size,
3755				const char *errors)
3756{
3757    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3758}
3759
3760/*
3761 * Convert unicode into string object (MBCS).
3762 * Returns 0 if succeed, -1 otherwise.
3763 */
3764static int encode_mbcs(PyObject **repr,
3765			const Py_UNICODE *p, /* unicode */
3766			int size) /* size of unicode */
3767{
3768    int mbcssize = 0;
3769    Py_ssize_t n = 0;
3770
3771    assert(size >= 0);
3772
3773    /* First get the size of the result */
3774    if (size > 0) {
3775	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3776	if (mbcssize == 0) {
3777	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
3778	    return -1;
3779	}
3780    }
3781
3782    if (*repr == NULL) {
3783	/* Create string object */
3784	*repr = PyBytes_FromStringAndSize(NULL, mbcssize);
3785	if (*repr == NULL)
3786	    return -1;
3787    }
3788    else {
3789	/* Extend string object */
3790	n = PyBytes_Size(*repr);
3791	if (PyBytes_Resize(*repr, n + mbcssize) < 0)
3792	    return -1;
3793    }
3794
3795    /* Do the conversion */
3796    if (size > 0) {
3797	char *s = PyBytes_AS_STRING(*repr) + n;
3798	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3799	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
3800	    return -1;
3801	}
3802    }
3803
3804    return 0;
3805}
3806
3807PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
3808				Py_ssize_t size,
3809				const char *errors)
3810{
3811    PyObject *repr = NULL;
3812    int ret;
3813
3814#ifdef NEED_RETRY
3815 retry:
3816    if (size > INT_MAX)
3817	ret = encode_mbcs(&repr, p, INT_MAX);
3818    else
3819#endif
3820	ret = encode_mbcs(&repr, p, (int)size);
3821
3822    if (ret < 0) {
3823	Py_XDECREF(repr);
3824	return NULL;
3825    }
3826
3827#ifdef NEED_RETRY
3828    if (size > INT_MAX) {
3829	p += INT_MAX;
3830	size -= INT_MAX;
3831	goto retry;
3832    }
3833#endif
3834
3835    return repr;
3836}
3837
3838PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3839{
3840    if (!PyUnicode_Check(unicode)) {
3841        PyErr_BadArgument();
3842        return NULL;
3843    }
3844    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3845				PyUnicode_GET_SIZE(unicode),
3846				NULL);
3847}
3848
3849#undef NEED_RETRY
3850
3851#endif /* MS_WINDOWS */
3852
3853/* --- Character Mapping Codec -------------------------------------------- */
3854
/* Decode size bytes at s through a character mapping.

   mapping == NULL: the input is decoded as Latin-1.

   mapping is an exact unicode object: each input byte indexes the
   string directly; an index past its end or the value 0xfffe means the
   byte has no mapping.

   any other mapping: mapping[byte] is fetched with PyObject_GetItem and
   may be an integer in range(65536), None (or a LookupError, both
   meaning "undefined"), or a unicode string of zero, one or several
   characters.  Any other result raises TypeError.

   Undefined mappings are reported to the error handler selected by
   errors.  Returns a new unicode object, or NULL with an exception
   set. */
PyObject *PyUnicode_DecodeCharmap(const char *s,
				  Py_ssize_t size,
				  PyObject *mapping,
				  const char *errors)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *v;
    Py_UNICODE *p;
    /* spare output slots still available from earlier 1-n resizes */
    Py_ssize_t extrachars = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    Py_UNICODE *mapstring = NULL;
    Py_ssize_t maplen = 0;

    /* Default to Latin-1 */
    if (mapping == NULL)
	return PyUnicode_DecodeLatin1(s, size, errors);

    /* Start with one output slot per input byte */
    v = _PyUnicode_New(size);
    if (v == NULL)
	goto onError;
    if (size == 0)
	return (PyObject *)v;
    p = PyUnicode_AS_UNICODE(v);
    e = s + size;
    if (PyUnicode_CheckExact(mapping)) {
	/* Fast path: the mapping is a plain unicode string used as a
	   lookup table indexed by byte value */
	mapstring = PyUnicode_AS_UNICODE(mapping);
	maplen = PyUnicode_GET_SIZE(mapping);
	while (s < e) {
	    unsigned char ch = *s;
	    Py_UNICODE x = 0xfffe; /* illegal value */

	    if (ch < maplen)
		x = mapstring[ch];

	    if (x == 0xfffe) {
		/* undefined mapping */
		outpos = p-PyUnicode_AS_UNICODE(v);
		startinpos = s-starts;
		endinpos = startinpos+1;
		if (unicode_decode_call_errorhandler(
		     errors, &errorHandler,
		     "charmap", "character maps to <undefined>",
		     &starts, &e, &startinpos, &endinpos, &exc, &s,
		     (PyObject **)&v, &outpos, &p)) {
		    goto onError;
		}
		/* s and p were updated through the pointers passed above */
		continue;
	    }
	    *p++ = x;
	    ++s;
	}
    }
    else {
	/* Generic path: look every byte up in an arbitrary mapping object */
	while (s < e) {
	    unsigned char ch = *s;
	    PyObject *w, *x;

	    /* Get mapping (char ordinal -> integer, Unicode char or None) */
	    w = PyInt_FromLong((long)ch);
	    if (w == NULL)
		goto onError;
	    x = PyObject_GetItem(mapping, w);
	    Py_DECREF(w);
	    if (x == NULL) {
		if (PyErr_ExceptionMatches(PyExc_LookupError)) {
		    /* No mapping found means: mapping is undefined. */
		    PyErr_Clear();
		    x = Py_None;
		    Py_INCREF(x);
		} else
		    goto onError;
	    }

	    /* Apply mapping; x is an owned reference from here on and is
	       released on every path below */
	    if (PyInt_Check(x)) {
		long value = PyInt_AS_LONG(x);
		if (value < 0 || value > 65535) {
		    PyErr_SetString(PyExc_TypeError,
				    "character mapping must be in range(65536)");
		    Py_DECREF(x);
		    goto onError;
		}
		*p++ = (Py_UNICODE)value;
	    }
	    else if (x == Py_None) {
		/* undefined mapping */
		outpos = p-PyUnicode_AS_UNICODE(v);
		startinpos = s-starts;
		endinpos = startinpos+1;
		if (unicode_decode_call_errorhandler(
		     errors, &errorHandler,
		     "charmap", "character maps to <undefined>",
		     &starts, &e, &startinpos, &endinpos, &exc, &s,
		     (PyObject **)&v, &outpos, &p)) {
		    Py_DECREF(x);
		    goto onError;
		}
		Py_DECREF(x);
		continue;
	    }
	    else if (PyUnicode_Check(x)) {
		Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);

		if (targetsize == 1)
		    /* 1-1 mapping */
		    *p++ = *PyUnicode_AS_UNICODE(x);

		else if (targetsize > 1) {
		    /* 1-n mapping */
		    if (targetsize > extrachars) {
			/* resize first; grows by more than strictly needed
			   so that later 1-n mappings can reuse the slack */
			Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
			Py_ssize_t needed = (targetsize - extrachars) + \
				     (targetsize << 2);
			extrachars += needed;
			/* XXX overflow detection missing */
			if (_PyUnicode_Resize(&v,
					     PyUnicode_GET_SIZE(v) + needed) < 0) {
			    Py_DECREF(x);
			    goto onError;
			}
			/* the buffer may have moved: recompute p */
			p = PyUnicode_AS_UNICODE(v) + oldpos;
		    }
		    Py_UNICODE_COPY(p,
				    PyUnicode_AS_UNICODE(x),
				    targetsize);
		    p += targetsize;
		    extrachars -= targetsize;
		}
		/* 1-0 mapping: skip the character */
	    }
	    else {
		/* wrong return value */
		PyErr_SetString(PyExc_TypeError,
		      "character mapping must return integer, None or unicode");
		Py_DECREF(x);
		goto onError;
	    }
	    Py_DECREF(x);
	    ++s;
	}
    }
    /* Trim the result if fewer code units were written than allocated */
    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
	if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
	    goto onError;
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)v;

 onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
}
4015
/* Charmap encoding: the lookup table */

/* Three-level trie object as filled in by PyUnicode_BuildEncodingMap.
   The object is allocated with extra room after the struct, so level23
   is really 16*count2 + 128*count3 bytes long. */
struct encoding_map{
  PyObject_HEAD
  /* first level, indexed by (code point >> 11); the builder stores a
     second-level block number here and leaves 0xFF in unused entries */
  unsigned char level1[32];
  /* number of second-level and third-level blocks stored in level23 */
  int count2, count3;
  /* 16*count2 second-level bytes (initialised to 0xFF by the builder)
     followed by 128*count3 third-level bytes (initialised to 0);
     declared with one element, the real length is set at allocation */
  unsigned char level23[1];
};
4024
4025static PyObject*
4026encoding_map_size(PyObject *obj, PyObject* args)
4027{
4028    struct encoding_map *map = (struct encoding_map*)obj;
4029    return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4030                          128*map->count3);
4031}
4032
/* Method table for EncodingMap objects: a single no-argument "size"
   method, followed by the zeroed sentinel entry that ends the table. */
static PyMethodDef encoding_map_methods[] = {
	{"size", encoding_map_size, METH_NOARGS,
         PyDoc_STR("Return the size (in bytes) of this object") },
        { 0 }
};
4038
/* tp_dealloc for EncodingMap objects: release the object's memory with
   PyObject_FREE. */
static void
encoding_map_dealloc(PyObject* o)
{
	PyObject_FREE(o);
}
4044
/* Type object for EncodingMap instances.  Only the name, basic size,
   deallocator, default flags and method table are filled in; every
   other slot is left at 0. */
static PyTypeObject EncodingMapType = {
	PyVarObject_HEAD_INIT(NULL, 0)
        "EncodingMap",          /*tp_name*/
        sizeof(struct encoding_map),   /*tp_basicsize*/
        0,                      /*tp_itemsize*/
        /* methods */
        encoding_map_dealloc,   /*tp_dealloc*/
        0,                      /*tp_print*/
        0,                      /*tp_getattr*/
        0,                      /*tp_setattr*/
        0,                      /*tp_compare*/
        0,                      /*tp_repr*/
        0,                      /*tp_as_number*/
        0,                      /*tp_as_sequence*/
        0,                      /*tp_as_mapping*/
        0,                      /*tp_hash*/
        0,                      /*tp_call*/
        0,                      /*tp_str*/
        0,                      /*tp_getattro*/
        0,                      /*tp_setattro*/
        0,                      /*tp_as_buffer*/
        Py_TPFLAGS_DEFAULT,     /*tp_flags*/
        0,                      /*tp_doc*/
        0,                      /*tp_traverse*/
        0,                      /*tp_clear*/
        0,                      /*tp_richcompare*/
        0,                      /*tp_weaklistoffset*/
        0,                      /*tp_iter*/
        0,                      /*tp_iternext*/
        encoding_map_methods,   /*tp_methods*/
        0,                      /*tp_members*/
        0,                      /*tp_getset*/
        0,                      /*tp_base*/
        0,                      /*tp_dict*/
        0,                      /*tp_descr_get*/
        0,                      /*tp_descr_set*/
        0,                      /*tp_dictoffset*/
        0,                      /*tp_init*/
        0,                      /*tp_alloc*/
        0,                      /*tp_new*/
        0,                      /*tp_free*/
        0,                      /*tp_is_gc*/
};
4088
4089PyObject*
4090PyUnicode_BuildEncodingMap(PyObject* string)
4091{
4092    Py_UNICODE *decode;
4093    PyObject *result;
4094    struct encoding_map *mresult;
4095    int i;
4096    int need_dict = 0;
4097    unsigned char level1[32];
4098    unsigned char level2[512];
4099    unsigned char *mlevel1, *mlevel2, *mlevel3;
4100    int count2 = 0, count3 = 0;
4101
4102    if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4103        PyErr_BadArgument();
4104        return NULL;
4105    }
4106    decode = PyUnicode_AS_UNICODE(string);
4107    memset(level1, 0xFF, sizeof level1);
4108    memset(level2, 0xFF, sizeof level2);
4109
4110    /* If there isn't a one-to-one mapping of NULL to \0,
4111       or if there are non-BMP characters, we need to use
4112       a mapping dictionary. */
4113    if (decode[0] != 0)
4114        need_dict = 1;
4115    for (i = 1; i < 256; i++) {
4116        int l1, l2;
4117        if (decode[i] == 0
4118            #ifdef Py_UNICODE_WIDE
4119            || decode[i] > 0xFFFF
4120            #endif
4121        ) {
4122            need_dict = 1;
4123            break;
4124        }
4125        if (decode[i] == 0xFFFE)
4126            /* unmapped character */
4127            continue;
4128        l1 = decode[i] >> 11;
4129        l2 = decode[i] >> 7;
4130        if (level1[l1] == 0xFF)
4131            level1[l1] = count2++;
4132        if (level2[l2] == 0xFF)
4133            level2[l2] = count3++;
4134    }
4135
4136    if (count2 >= 0xFF || count3 >= 0xFF)
4137        need_dict = 1;
4138
4139    if (need_dict) {
4140        PyObject *result = PyDict_New();
4141        PyObject *key, *value;
4142        if (!result)
4143            return NULL;
4144        for (i = 0; i < 256; i++) {
4145            key = value = NULL;
4146            key = PyInt_FromLong(decode[i]);
4147            value = PyInt_FromLong(i);
4148            if (!key || !value)
4149                goto failed1;
4150            if (PyDict_SetItem(result, key, value) == -1)
4151                goto failed1;
4152            Py_DECREF(key);
4153            Py_DECREF(value);
4154        }
4155        return result;
4156      failed1:
4157        Py_XDECREF(key);
4158        Py_XDECREF(value);
4159        Py_DECREF(result);
4160        return NULL;
4161    }
4162
4163    /* Create a three-level trie */
4164    result = PyObject_MALLOC(sizeof(struct encoding_map) +
4165                             16*count2 + 128*count3 - 1);
4166    if (!result)
4167        return PyErr_NoMemory();
4168    PyObject_Init(result, &EncodingMapType);
4169    mresult = (struct encoding_map*)result;
4170    mresult->count2 = count2;
4171    mresult->count3 = count3;
4172    mlevel1 = mresult->level1;
4173    mlevel2 = mresult->level23;
4174    mlevel3 = mresult->level23 + 16*count2;
4175    memcpy(mlevel1, level1, 32);
4176    memset(mlevel2, 0xFF, 16*count2);
4177    memset(mlevel3, 0, 128*count3);
4178    count3 = 0;
4179    for (i = 1; i < 256; i++) {
4180        int o1, o2, o3, i2, i3;
4181        if (decode[i] == 0xFFFE)
4182            /* unmapped character */
4183            continue;
4184        o1 = decode[i]>>11;
4185        o2 = (decode[i]>>7) & 0xF;
4186        i2 = 16*mlevel1[o1] + o2;
4187        if (mlevel2[i2] == 0xFF)
4188            mlevel2[i2] = count3++;
4189        o3 = decode[i] & 0x7F;
4190        i3 = 128*mlevel2[i2] + o3;
4191        mlevel3[i3] = i;
4192    }
4193    return result;
4194}
4195
4196static int
4197encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4198{
4199    struct encoding_map *map = (struct encoding_map*)mapping;
4200    int l1 = c>>11;
4201    int l2 = (c>>7) & 0xF;
4202    int l3 = c & 0x7F;
4203    int i;
4204
4205#ifdef Py_UNICODE_WIDE
4206    if (c > 0xFFFF) {
4207	return -1;
4208    }
4209#endif
4210    if (c == 0)
4211        return 0;
4212    /* level 1*/
4213    i = map->level1[l1];
4214    if (i == 0xFF) {
4215        return -1;
4216    }
4217    /* level 2*/
4218    i = map->level23[16*i+l2];
4219    if (i == 0xFF) {
4220        return -1;
4221    }
4222    /* level 3 */
4223    i = map->level23[16*map->count2 + 128*i + l3];
4224    if (i == 0) {
4225        return -1;
4226    }
4227    return i;
4228}
4229
4230/* Lookup the character ch in the mapping. If the character
4231   can't be found, Py_None is returned (or NULL, if another
4232   error occurred). */
4233static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
4234{
4235    PyObject *w = PyInt_FromLong((long)c);
4236    PyObject *x;
4237
4238    if (w == NULL)
4239	 return NULL;
4240    x = PyObject_GetItem(mapping, w);
4241    Py_DECREF(w);
4242    if (x == NULL) {
4243	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4244	    /* No mapping found means: mapping is undefined. */
4245	    PyErr_Clear();
4246	    x = Py_None;
4247	    Py_INCREF(x);
4248	    return x;
4249	} else
4250	    return NULL;
4251    }
4252    else if (x == Py_None)
4253	return x;
4254    else if (PyInt_Check(x)) {
4255	long value = PyInt_AS_LONG(x);
4256	if (value < 0 || value > 255) {
4257	    PyErr_SetString(PyExc_TypeError,
4258			     "character mapping must be in range(256)");
4259	    Py_DECREF(x);
4260	    return NULL;
4261	}
4262	return x;
4263    }
4264    else if (PyString_Check(x))
4265	return x;
4266    else {
4267	/* wrong return value */
4268	PyErr_Format(PyExc_TypeError,
4269                "character mapping must return integer, None or str8, not %.400s",
4270                x->ob_type->tp_name);
4271	Py_DECREF(x);
4272	return NULL;
4273    }
4274}
4275
4276static int
4277charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4278{
4279	Py_ssize_t outsize = PyBytes_GET_SIZE(  outobj);
4280	/* exponentially overallocate to minimize reallocations */
4281	if (requiredsize < 2*outsize)
4282	    requiredsize = 2*outsize;
4283	if (PyBytes_Resize(outobj, requiredsize)) {
4284	    Py_DECREF(outobj);
4285	    return -1;
4286	}
4287	return 0;
4288}
4289
/* Outcome of encoding one character through a character map. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
4293/* lookup the character, put the result in the output string and adjust
4294   various state variables. Resize the output bytes object if not enough
4295   space is available. Return a new reference to the object that
4296   was put in the output buffer, or Py_None, if the mapping was undefined
4297   (in which case no character was written) or NULL, if a
4298   reallocation error occurred. The caller must decref the result */
4299static
4300charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
4301    PyObject *outobj, Py_ssize_t *outpos)
4302{
4303    PyObject *rep;
4304    char *outstart;
4305    Py_ssize_t outsize = PyBytes_GET_SIZE(outobj);
4306
4307    if (Py_Type(mapping) == &EncodingMapType) {
4308        int res = encoding_map_lookup(c, mapping);
4309	Py_ssize_t requiredsize = *outpos+1;
4310        if (res == -1)
4311            return enc_FAILED;
4312	if (outsize<requiredsize)
4313	    if (charmapencode_resize(outobj, outpos, requiredsize))
4314		return enc_EXCEPTION;
4315        outstart = PyBytes_AS_STRING(outobj);
4316	outstart[(*outpos)++] = (char)res;
4317	return enc_SUCCESS;
4318    }
4319
4320    rep = charmapencode_lookup(c, mapping);
4321    if (rep==NULL)
4322	return enc_EXCEPTION;
4323    else if (rep==Py_None) {
4324	Py_DECREF(rep);
4325	return enc_FAILED;
4326    } else {
4327	if (PyInt_Check(rep)) {
4328	    Py_ssize_t requiredsize = *outpos+1;
4329	    if (outsize<requiredsize)
4330		if (charmapencode_resize(outobj, outpos, requiredsize)) {
4331		    Py_DECREF(rep);
4332		    return enc_EXCEPTION;
4333		}
4334            outstart = PyBytes_AS_STRING(outobj);
4335	    outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4336	}
4337	else {
4338	    const char *repchars = PyString_AS_STRING(rep);
4339	    Py_ssize_t repsize = PyString_GET_SIZE(rep);
4340	    Py_ssize_t requiredsize = *outpos+repsize;
4341	    if (outsize<requiredsize)
4342		if (charmapencode_resize(outobj, outpos, requiredsize)) {
4343		    Py_DECREF(rep);
4344		    return enc_EXCEPTION;
4345		}
4346            outstart = PyBytes_AS_STRING(outobj);
4347	    memcpy(outstart + *outpos, repchars, repsize);
4348	    *outpos += repsize;
4349	}
4350    }
4351    Py_DECREF(rep);
4352    return enc_SUCCESS;
4353}
4354
4355/* handle an error in PyUnicode_EncodeCharmap
4356   Return 0 on success, -1 on error */
4357static
4358int charmap_encoding_error(
4359    const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
4360    PyObject **exceptionObject,
4361    int *known_errorHandler, PyObject **errorHandler, const char *errors,
4362    PyObject *res, Py_ssize_t *respos)
4363{
4364    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4365    Py_ssize_t repsize;
4366    Py_ssize_t newpos;
4367    Py_UNICODE *uni2;
4368    /* startpos for collecting unencodable chars */
4369    Py_ssize_t collstartpos = *inpos;
4370    Py_ssize_t collendpos = *inpos+1;
4371    Py_ssize_t collpos;
4372    char *encoding = "charmap";
4373    char *reason = "character maps to <undefined>";
4374    charmapencode_result x;
4375
4376    /* find all unencodable characters */
4377    while (collendpos < size) {
4378        PyObject *rep;
4379        if (Py_Type(mapping) == &EncodingMapType) {
4380	    int res = encoding_map_lookup(p[collendpos], mapping);
4381	    if (res != -1)
4382		break;
4383	    ++collendpos;
4384	    continue;
4385	}
4386
4387	rep = charmapencode_lookup(p[collendpos], mapping);
4388	if (rep==NULL)
4389	    return -1;
4390	else if (rep!=Py_None) {
4391	    Py_DECREF(rep);
4392	    break;
4393	}
4394	Py_DECREF(rep);
4395	++collendpos;
4396    }
4397    /* cache callback name lookup
4398     * (if not done yet, i.e. it's the first error) */
4399    if (*known_errorHandler==-1) {
4400	if ((errors==NULL) || (!strcmp(errors, "strict")))
4401	    *known_errorHandler = 1;
4402	else if (!strcmp(errors, "replace"))
4403	    *known_errorHandler = 2;
4404	else if (!strcmp(errors, "ignore"))
4405	    *known_errorHandler = 3;
4406	else if (!strcmp(errors, "xmlcharrefreplace"))
4407	    *known_errorHandler = 4;
4408	else
4409	    *known_errorHandler = 0;
4410    }
4411    switch (*known_errorHandler) {
4412	case 1: /* strict */
4413	    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4414	    return -1;
4415	case 2: /* replace */
4416	    for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4417		x = charmapencode_output('?', mapping, res, respos);
4418		if (x==enc_EXCEPTION) {
4419		    return -1;
4420		}
4421		else if (x==enc_FAILED) {
4422		    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4423		    return -1;
4424		}
4425	    }
4426	    /* fall through */
4427	case 3: /* ignore */
4428	    *inpos = collendpos;
4429	    break;
4430	case 4: /* xmlcharrefreplace */
4431	    /* generate replacement (temporarily (mis)uses p) */
4432	    for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4433		char buffer[2+29+1+1];
4434		char *cp;
4435		sprintf(buffer, "&#%d;", (int)p[collpos]);
4436		for (cp = buffer; *cp; ++cp) {
4437		    x = charmapencode_output(*cp, mapping, res, respos);
4438		    if (x==enc_EXCEPTION)
4439			return -1;
4440		    else if (x==enc_FAILED) {
4441			raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4442			return -1;
4443		    }
4444		}
4445	    }
4446	    *inpos = collendpos;
4447	    break;
4448	default:
4449	    repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
4450		encoding, reason, p, size, exceptionObject,
4451		collstartpos, collendpos, &newpos);
4452	    if (repunicode == NULL)
4453		return -1;
4454	    /* generate replacement  */
4455	    repsize = PyUnicode_GET_SIZE(repunicode);
4456	    for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4457		x = charmapencode_output(*uni2, mapping, res, respos);
4458		if (x==enc_EXCEPTION) {
4459		    return -1;
4460		}
4461		else if (x==enc_FAILED) {
4462		    Py_DECREF(repunicode);
4463		    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4464		    return -1;
4465		}
4466	    }
4467	    *inpos = newpos;
4468	    Py_DECREF(repunicode);
4469    }
4470    return 0;
4471}
4472
4473PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
4474				  Py_ssize_t size,
4475				  PyObject *mapping,
4476				  const char *errors)
4477{
4478    /* output object */
4479    PyObject *res = NULL;
4480    /* current input position */
4481    Py_ssize_t inpos = 0;
4482    /* current output position */
4483    Py_ssize_t respos = 0;
4484    PyObject *errorHandler = NULL;
4485    PyObject *exc = NULL;
4486    /* the following variable is used for caching string comparisons
4487     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4488     * 3=ignore, 4=xmlcharrefreplace */
4489    int known_errorHandler = -1;
4490
4491    /* Default to Latin-1 */
4492    if (mapping == NULL)
4493	return PyUnicode_EncodeLatin1(p, size, errors);
4494
4495    /* allocate enough for a simple encoding without
4496       replacements, if we need more, we'll resize */
4497    res = PyBytes_FromStringAndSize(NULL, size);
4498    if (res == NULL)
4499        goto onError;
4500    if (size == 0)
4501	return res;
4502
4503    while (inpos<size) {
4504	/* try to encode it */
4505	charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos);
4506	if (x==enc_EXCEPTION) /* error */
4507	    goto onError;
4508	if (x==enc_FAILED) { /* unencodable character */
4509	    if (charmap_encoding_error(p, size, &inpos, mapping,
4510		&exc,
4511		&known_errorHandler, &errorHandler, errors,
4512		res, &respos)) {
4513		goto onError;
4514	    }
4515	}
4516	else
4517	    /* done with this character => adjust input position */
4518	    ++inpos;
4519    }
4520
4521    /* Resize if we allocated to much */
4522    if (respos<PyBytes_GET_SIZE(res)) {
4523	if (PyBytes_Resize(res, respos))
4524	    goto onError;
4525    }
4526    Py_XDECREF(exc);
4527    Py_XDECREF(errorHandler);
4528    return res;
4529
4530    onError:
4531    Py_XDECREF(res);
4532    Py_XDECREF(exc);
4533    Py_XDECREF(errorHandler);
4534    return NULL;
4535}
4536
4537PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4538				    PyObject *mapping)
4539{
4540    if (!PyUnicode_Check(unicode) || mapping == NULL) {
4541	PyErr_BadArgument();
4542	return NULL;
4543    }
4544    return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4545				   PyUnicode_GET_SIZE(unicode),
4546				   mapping,
4547				   NULL);
4548}
4549
4550/* create or adjust a UnicodeTranslateError */
4551static void make_translate_exception(PyObject **exceptionObject,
4552    const Py_UNICODE *unicode, Py_ssize_t size,
4553    Py_ssize_t startpos, Py_ssize_t endpos,
4554    const char *reason)
4555{
4556    if (*exceptionObject == NULL) {
4557    	*exceptionObject = PyUnicodeTranslateError_Create(
4558	    unicode, size, startpos, endpos, reason);
4559    }
4560    else {
4561	if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4562	    goto onError;
4563	if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4564	    goto onError;
4565	if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4566	    goto onError;
4567	return;
4568	onError:
4569	Py_DECREF(*exceptionObject);
4570	*exceptionObject = NULL;
4571    }
4572}
4573
4574/* raises a UnicodeTranslateError */
4575static void raise_translate_exception(PyObject **exceptionObject,
4576    const Py_UNICODE *unicode, Py_ssize_t size,
4577    Py_ssize_t startpos, Py_ssize_t endpos,
4578    const char *reason)
4579{
4580    make_translate_exception(exceptionObject,
4581	unicode, size, startpos, endpos, reason);
4582    if (*exceptionObject != NULL)
4583	PyCodec_StrictErrors(*exceptionObject);
4584}
4585
4586/* error handling callback helper:
4587   build arguments, call the callback and check the arguments,
4588   put the result into newpos and return the replacement string, which
4589   has to be freed by the caller */
4590static PyObject *unicode_translate_call_errorhandler(const char *errors,
4591    PyObject **errorHandler,
4592    const char *reason,
4593    const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4594    Py_ssize_t startpos, Py_ssize_t endpos,
4595    Py_ssize_t *newpos)
4596{
4597    static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
4598
4599    Py_ssize_t i_newpos;
4600    PyObject *restuple;
4601    PyObject *resunicode;
4602
4603    if (*errorHandler == NULL) {
4604	*errorHandler = PyCodec_LookupError(errors);
4605        if (*errorHandler == NULL)
4606	    return NULL;
4607    }
4608
4609    make_translate_exception(exceptionObject,
4610	unicode, size, startpos, endpos, reason);
4611    if (*exceptionObject == NULL)
4612	return NULL;
4613
4614    restuple = PyObject_CallFunctionObjArgs(
4615	*errorHandler, *exceptionObject, NULL);
4616    if (restuple == NULL)
4617	return NULL;
4618    if (!PyTuple_Check(restuple)) {
4619	PyErr_Format(PyExc_TypeError, &argparse[4]);
4620	Py_DECREF(restuple);
4621	return NULL;
4622    }
4623    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
4624	&resunicode, &i_newpos)) {
4625	Py_DECREF(restuple);
4626	return NULL;
4627    }
4628    if (i_newpos<0)
4629	*newpos = size+i_newpos;
4630    else
4631        *newpos = i_newpos;
4632    if (*newpos<0 || *newpos>size) {
4633	PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4634	Py_DECREF(restuple);
4635	return NULL;
4636    }
4637    Py_INCREF(resunicode);
4638    Py_DECREF(restuple);
4639    return resunicode;
4640}
4641
4642/* Lookup the character ch in the mapping and put the result in result,
4643   which must be decrefed by the caller.
4644   Return 0 on success, -1 on error */
4645static
4646int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4647{
4648    PyObject *w = PyInt_FromLong((long)c);
4649    PyObject *x;
4650
4651    if (w == NULL)
4652	 return -1;
4653    x = PyObject_GetItem(mapping, w);
4654    Py_DECREF(w);
4655    if (x == NULL) {
4656	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4657	    /* No mapping found means: use 1:1 mapping. */
4658	    PyErr_Clear();
4659	    *result = NULL;
4660	    return 0;
4661	} else
4662	    return -1;
4663    }
4664    else if (x == Py_None) {
4665	*result = x;
4666	return 0;
4667    }
4668    else if (PyInt_Check(x)) {
4669	long value = PyInt_AS_LONG(x);
4670	long max = PyUnicode_GetMax();
4671	if (value < 0 || value > max) {
4672	    PyErr_Format(PyExc_TypeError,
4673			     "character mapping must be in range(0x%lx)", max+1);
4674	    Py_DECREF(x);
4675	    return -1;
4676	}
4677	*result = x;
4678	return 0;
4679    }
4680    else if (PyUnicode_Check(x)) {
4681	*result = x;
4682	return 0;
4683    }
4684    else {
4685	/* wrong return value */
4686	PyErr_SetString(PyExc_TypeError,
4687	      "character mapping must return integer, None or unicode");
4688	Py_DECREF(x);
4689	return -1;
4690    }
4691}
4692/* ensure that *outobj is at least requiredsize characters long,
4693if not reallocate and adjust various state variables.
4694Return 0 on success, -1 on error */
4695static
4696int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
4697    Py_ssize_t requiredsize)
4698{
4699    Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
4700    if (requiredsize > oldsize) {
4701	/* remember old output position */
4702	Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4703	/* exponentially overallocate to minimize reallocations */
4704	if (requiredsize < 2 * oldsize)
4705	    requiredsize = 2 * oldsize;
4706	if (_PyUnicode_Resize(outobj, requiredsize) < 0)
4707	    return -1;
4708	*outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
4709    }
4710    return 0;
4711}
4712/* lookup the character, put the result in the output string and adjust
4713   various state variables. Return a new reference to the object that
4714   was put in the output buffer in *result, or Py_None, if the mapping was
4715   undefined (in which case no character was written).
4716   The called must decref result.
4717   Return 0 on success, -1 on error. */
4718static
4719int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
4720    Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4721    PyObject **res)
4722{
4723    if (charmaptranslate_lookup(*curinp, mapping, res))
4724	return -1;
4725    if (*res==NULL) {
4726	/* not found => default to 1:1 mapping */
4727	*(*outp)++ = *curinp;
4728    }
4729    else if (*res==Py_None)
4730	;
4731    else if (PyInt_Check(*res)) {
4732	/* no overflow check, because we know that the space is enough */
4733	*(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4734    }
4735    else if (PyUnicode_Check(*res)) {
4736	Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4737	if (repsize==1) {
4738	    /* no overflow check, because we know that the space is enough */
4739	    *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4740	}
4741	else if (repsize!=0) {
4742	    /* more than one character */
4743	    Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4744		(insize - (curinp-startinp)) +
4745		repsize - 1;
4746	    if (charmaptranslate_makespace(outobj, outp, requiredsize))
4747		return -1;
4748	    memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4749	    *outp += repsize;
4750	}
4751    }
4752    else
4753	return -1;
4754    return 0;
4755}
4756
4757PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
4758				     Py_ssize_t size,
4759				     PyObject *mapping,
4760				     const char *errors)
4761{
4762    /* output object */
4763    PyObject *res = NULL;
4764    /* pointers to the beginning and end+1 of input */
4765    const Py_UNICODE *startp = p;
4766    const Py_UNICODE *endp = p + size;
4767    /* pointer into the output */
4768    Py_UNICODE *str;
4769    /* current output position */
4770    Py_ssize_t respos = 0;
4771    char *reason = "character maps to <undefined>";
4772    PyObject *errorHandler = NULL;
4773    PyObject *exc = NULL;
4774    /* the following variable is used for caching string comparisons
4775     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4776     * 3=ignore, 4=xmlcharrefreplace */
4777    int known_errorHandler = -1;
4778
4779    if (mapping == NULL) {
4780	PyErr_BadArgument();
4781	return NULL;
4782    }
4783
4784    /* allocate enough for a simple 1:1 translation without
4785       replacements, if we need more, we'll resize */
4786    res = PyUnicode_FromUnicode(NULL, size);
4787    if (res == NULL)
4788	goto onError;
4789    if (size == 0)
4790	return res;
4791    str = PyUnicode_AS_UNICODE(res);
4792
4793    while (p<endp) {
4794	/* try to encode it */
4795	PyObject *x = NULL;
4796	if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
4797	    Py_XDECREF(x);
4798	    goto onError;
4799	}
4800	Py_XDECREF(x);
4801	if (x!=Py_None) /* it worked => adjust input pointer */
4802	    ++p;
4803	else { /* untranslatable character */
4804	    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4805	    Py_ssize_t repsize;
4806	    Py_ssize_t newpos;
4807	    Py_UNICODE *uni2;
4808	    /* startpos for collecting untranslatable chars */
4809	    const Py_UNICODE *collstart = p;
4810	    const Py_UNICODE *collend = p+1;
4811	    const Py_UNICODE *coll;
4812
4813	    /* find all untranslatable characters */
4814	    while (collend < endp) {
4815		if (charmaptranslate_lookup(*collend, mapping, &x))
4816		    goto onError;
4817		Py_XDECREF(x);
4818		if (x!=Py_None)
4819		    break;
4820		++collend;
4821	    }
4822	    /* cache callback name lookup
4823	     * (if not done yet, i.e. it's the first error) */
4824	    if (known_errorHandler==-1) {
4825		if ((errors==NULL) || (!strcmp(errors, "strict")))
4826		    known_errorHandler = 1;
4827		else if (!strcmp(errors, "replace"))
4828		    known_errorHandler = 2;
4829		else if (!strcmp(errors, "ignore"))
4830		    known_errorHandler = 3;
4831		else if (!strcmp(errors, "xmlcharrefreplace"))
4832		    known_errorHandler = 4;
4833		else
4834		    known_errorHandler = 0;
4835	    }
4836	    switch (known_errorHandler) {
4837		case 1: /* strict */
4838		    raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4839		    goto onError;
4840		case 2: /* replace */
4841		    /* No need to check for space, this is a 1:1 replacement */
4842		    for (coll = collstart; coll<collend; ++coll)
4843			*str++ = '?';
4844		    /* fall through */
4845		case 3: /* ignore */
4846		    p = collend;
4847		    break;
4848		case 4: /* xmlcharrefreplace */
4849		    /* generate replacement (temporarily (mis)uses p) */
4850		    for (p = collstart; p < collend; ++p) {
4851			char buffer[2+29+1+1];
4852			char *cp;
4853			sprintf(buffer, "&#%d;", (int)*p);
4854			if (charmaptranslate_makespace(&res, &str,
4855			    (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4856			    goto onError;
4857			for (cp = buffer; *cp; ++cp)
4858			    *str++ = *cp;
4859		    }
4860		    p = collend;
4861		    break;
4862		default:
4863		    repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4864			reason, startp, size, &exc,
4865			collstart-startp, collend-startp, &newpos);
4866		    if (repunicode == NULL)
4867			goto onError;
4868		    /* generate replacement  */
4869		    repsize = PyUnicode_GET_SIZE(repunicode);
4870		    if (charmaptranslate_makespace(&res, &str,
4871			(str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4872			Py_DECREF(repunicode);
4873			goto onError;
4874		    }
4875		    for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4876			*str++ = *uni2;
4877		    p = startp + newpos;
4878		    Py_DECREF(repunicode);
4879	    }
4880	}
4881    }
4882    /* Resize if we allocated to much */
4883    respos = str-PyUnicode_AS_UNICODE(res);
4884    if (respos<PyUnicode_GET_SIZE(res)) {
4885	if (_PyUnicode_Resize(&res, respos) < 0)
4886	    goto onError;
4887    }
4888    Py_XDECREF(exc);
4889    Py_XDECREF(errorHandler);
4890    return res;
4891
4892    onError:
4893    Py_XDECREF(res);
4894    Py_XDECREF(exc);
4895    Py_XDECREF(errorHandler);
4896    return NULL;
4897}
4898
4899PyObject *PyUnicode_Translate(PyObject *str,
4900			      PyObject *mapping,
4901			      const char *errors)
4902{
4903    PyObject *result;
4904
4905    str = PyUnicode_FromObject(str);
4906    if (str == NULL)
4907	goto onError;
4908    result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4909					PyUnicode_GET_SIZE(str),
4910					mapping,
4911					errors);
4912    Py_DECREF(str);
4913    return result;
4914
4915 onError:
4916    Py_XDECREF(str);
4917    return NULL;
4918}
4919
4920/* --- Decimal Encoder ---------------------------------------------------- */
4921
4922int PyUnicode_EncodeDecimal(Py_UNICODE *s,
4923			    Py_ssize_t length,
4924			    char *output,
4925			    const char *errors)
4926{
4927    Py_UNICODE *p, *end;
4928    PyObject *errorHandler = NULL;
4929    PyObject *exc = NULL;
4930    const char *encoding = "decimal";
4931    const char *reason = "invalid decimal Unicode string";
4932    /* the following variable is used for caching string comparisons
4933     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4934    int known_errorHandler = -1;
4935
4936    if (output == NULL) {
4937	PyErr_BadArgument();
4938	return -1;
4939    }
4940
4941    p = s;
4942    end = s + length;
4943    while (p < end) {
4944	register Py_UNICODE ch = *p;
4945	int decimal;
4946	PyObject *repunicode;
4947	Py_ssize_t repsize;
4948	Py_ssize_t newpos;
4949	Py_UNICODE *uni2;
4950	Py_UNICODE *collstart;
4951	Py_UNICODE *collend;
4952
4953	if (Py_UNICODE_ISSPACE(ch)) {
4954	    *output++ = ' ';
4955	    ++p;
4956	    continue;
4957	}
4958	decimal = Py_UNICODE_TODECIMAL(ch);
4959	if (decimal >= 0) {
4960	    *output++ = '0' + decimal;
4961	    ++p;
4962	    continue;
4963	}
4964	if (0 < ch && ch < 256) {
4965	    *output++ = (char)ch;
4966	    ++p;
4967	    continue;
4968	}
4969	/* All other characters are considered unencodable */
4970	collstart = p;
4971	collend = p+1;
4972	while (collend < end) {
4973	    if ((0 < *collend && *collend < 256) ||
4974	        !Py_UNICODE_ISSPACE(*collend) ||
4975	        Py_UNICODE_TODECIMAL(*collend))
4976		break;
4977	}
4978	/* cache callback name lookup
4979	 * (if not done yet, i.e. it's the first error) */
4980	if (known_errorHandler==-1) {
4981	    if ((errors==NULL) || (!strcmp(errors, "strict")))
4982		known_errorHandler = 1;
4983	    else if (!strcmp(errors, "replace"))
4984		known_errorHandler = 2;
4985	    else if (!strcmp(errors, "ignore"))
4986		known_errorHandler = 3;
4987	    else if (!strcmp(errors, "xmlcharrefreplace"))
4988		known_errorHandler = 4;
4989	    else
4990		known_errorHandler = 0;
4991	}
4992	switch (known_errorHandler) {
4993	    case 1: /* strict */
4994		raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4995		goto onError;
4996	    case 2: /* replace */
4997		for (p = collstart; p < collend; ++p)
4998		    *output++ = '?';
4999		/* fall through */
5000	    case 3: /* ignore */
5001		p = collend;
5002		break;
5003	    case 4: /* xmlcharrefreplace */
5004		/* generate replacement (temporarily (mis)uses p) */
5005		for (p = collstart; p < collend; ++p)
5006		    output += sprintf(output, "&#%d;", (int)*p);
5007		p = collend;
5008		break;
5009	    default:
5010		repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5011		    encoding, reason, s, length, &exc,
5012		    collstart-s, collend-s, &newpos);
5013		if (repunicode == NULL)
5014		    goto onError;
5015		/* generate replacement  */
5016		repsize = PyUnicode_GET_SIZE(repunicode);
5017		for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5018		    Py_UNICODE ch = *uni2;
5019		    if (Py_UNICODE_ISSPACE(ch))
5020			*output++ = ' ';
5021		    else {
5022			decimal = Py_UNICODE_TODECIMAL(ch);
5023			if (decimal >= 0)
5024			    *output++ = '0' + decimal;
5025			else if (0 < ch && ch < 256)
5026			    *output++ = (char)ch;
5027			else {
5028			    Py_DECREF(repunicode);
5029			    raise_encode_exception(&exc, encoding,
5030				s, length, collstart-s, collend-s, reason);
5031			    goto onError;
5032			}
5033		    }
5034		}
5035		p = s + newpos;
5036		Py_DECREF(repunicode);
5037	}
5038    }
5039    /* 0-terminate the output string */
5040    *output++ = '\0';
5041    Py_XDECREF(exc);
5042    Py_XDECREF(errorHandler);
5043    return 0;
5044
5045 onError:
5046    Py_XDECREF(exc);
5047    Py_XDECREF(errorHandler);
5048    return -1;
5049}
5050
5051/* --- Helpers ------------------------------------------------------------ */
5052
5053#include "stringlib/unicodedefs.h"
5054
5055#include "stringlib/fastsearch.h"
5056
5057#include "stringlib/count.h"
5058#include "stringlib/find.h"
5059#include "stringlib/partition.h"
5060
/* Helper macro to fix up start/end slice values.

   Operates on the variables named start and end in the scope where it is
   expanded, using (obj)->length as the sequence length: a negative start is
   taken relative to the end and then clamped to 0; end is clamped to the
   length, a negative end is taken relative to the end and then clamped
   to 0.

   The body is wrapped in do { } while (0) so that the expansion is a single
   statement and stays correct under an unbraced if/else.  Use it as a
   statement, followed by a semicolon. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5073
5074Py_ssize_t PyUnicode_Count(PyObject *str,
5075                           PyObject *substr,
5076                           Py_ssize_t start,
5077                           Py_ssize_t end)
5078{
5079    Py_ssize_t result;
5080    PyUnicodeObject* str_obj;
5081    PyUnicodeObject* sub_obj;
5082
5083    str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5084    if (!str_obj)
5085	return -1;
5086    sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5087    if (!sub_obj) {
5088	Py_DECREF(str_obj);
5089	return -1;
5090    }
5091
5092    FIX_START_END(str_obj);
5093
5094    result = stringlib_count(
5095        str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5096        );
5097
5098    Py_DECREF(sub_obj);
5099    Py_DECREF(str_obj);
5100
5101    return result;
5102}
5103
5104Py_ssize_t PyUnicode_Find(PyObject *str,
5105                          PyObject *sub,
5106                          Py_ssize_t start,
5107                          Py_ssize_t end,
5108                          int direction)
5109{
5110    Py_ssize_t result;
5111
5112    str = PyUnicode_FromObject(str);
5113    if (!str)
5114	return -2;
5115    sub = PyUnicode_FromObject(sub);
5116    if (!sub) {
5117	Py_DECREF(str);
5118	return -2;
5119    }
5120
5121    if (direction > 0)
5122        result = stringlib_find_slice(
5123            PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5124            PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5125            start, end
5126            );
5127    else
5128        result = stringlib_rfind_slice(
5129            PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5130            PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5131            start, end
5132            );
5133
5134    Py_DECREF(str);
5135    Py_DECREF(sub);
5136
5137    return result;
5138}
5139
5140static
5141int tailmatch(PyUnicodeObject *self,
5142	      PyUnicodeObject *substring,
5143	      Py_ssize_t start,
5144	      Py_ssize_t end,
5145	      int direction)
5146{
5147    if (substring->length == 0)
5148        return 1;
5149
5150    FIX_START_END(self);
5151
5152    end -= substring->length;
5153    if (end < start)
5154	return 0;
5155
5156    if (direction > 0) {
5157	if (Py_UNICODE_MATCH(self, end, substring))
5158	    return 1;
5159    } else {
5160        if (Py_UNICODE_MATCH(self, start, substring))
5161	    return 1;
5162    }
5163
5164    return 0;
5165}
5166
5167Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
5168			PyObject *substr,
5169			Py_ssize_t start,
5170			Py_ssize_t end,
5171			int direction)
5172{
5173    Py_ssize_t result;
5174
5175    str = PyUnicode_FromObject(str);
5176    if (str == NULL)
5177	return -1;
5178    substr = PyUnicode_FromObject(substr);
5179    if (substr == NULL) {
5180	Py_DECREF(str);
5181	return -1;
5182    }
5183
5184    result = tailmatch((PyUnicodeObject *)str,
5185		       (PyUnicodeObject *)substr,
5186		       start, end, direction);
5187    Py_DECREF(str);
5188    Py_DECREF(substr);
5189    return result;
5190}
5191
5192/* Apply fixfct filter to the Unicode object self and return a
5193   reference to the modified object */
5194
5195static
5196PyObject *fixup(PyUnicodeObject *self,
5197		int (*fixfct)(PyUnicodeObject *s))
5198{
5199
5200    PyUnicodeObject *u;
5201
5202    u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5203    if (u == NULL)
5204	return NULL;
5205
5206    Py_UNICODE_COPY(u->str, self->str, self->length);
5207
5208    if (!fixfct(u) && PyUnicode_CheckExact(self)) {
5209	/* fixfct should return TRUE if it modified the buffer. If
5210	   FALSE, return a reference to the original buffer instead
5211	   (to save space, not time) */
5212	Py_INCREF(self);
5213	Py_DECREF(u);
5214	return (PyObject*) self;
5215    }
5216    return (PyObject*) u;
5217}
5218
5219static
5220int fixupper(PyUnicodeObject *self)
5221{
5222    Py_ssize_t len = self->length;
5223    Py_UNICODE *s = self->str;
5224    int status = 0;
5225
5226    while (len-- > 0) {
5227	register Py_UNICODE ch;
5228
5229	ch = Py_UNICODE_TOUPPER(*s);
5230	if (ch != *s) {
5231            status = 1;
5232	    *s = ch;
5233	}
5234        s++;
5235    }
5236
5237    return status;
5238}
5239
5240static
5241int fixlower(PyUnicodeObject *self)
5242{
5243    Py_ssize_t len = self->length;
5244    Py_UNICODE *s = self->str;
5245    int status = 0;
5246
5247    while (len-- > 0) {
5248	register Py_UNICODE ch;
5249
5250	ch = Py_UNICODE_TOLOWER(*s);
5251	if (ch != *s) {
5252            status = 1;
5253	    *s = ch;
5254	}
5255        s++;
5256    }
5257
5258    return status;
5259}
5260
5261static
5262int fixswapcase(PyUnicodeObject *self)
5263{
5264    Py_ssize_t len = self->length;
5265    Py_UNICODE *s = self->str;
5266    int status = 0;
5267
5268    while (len-- > 0) {
5269        if (Py_UNICODE_ISUPPER(*s)) {
5270            *s = Py_UNICODE_TOLOWER(*s);
5271            status = 1;
5272        } else if (Py_UNICODE_ISLOWER(*s)) {
5273            *s = Py_UNICODE_TOUPPER(*s);
5274            status = 1;
5275        }
5276        s++;
5277    }
5278
5279    return status;
5280}
5281
5282static
5283int fixcapitalize(PyUnicodeObject *self)
5284{
5285    Py_ssize_t len = self->length;
5286    Py_UNICODE *s = self->str;
5287    int status = 0;
5288
5289    if (len == 0)
5290	return 0;
5291    if (Py_UNICODE_ISLOWER(*s)) {
5292	*s = Py_UNICODE_TOUPPER(*s);
5293	status = 1;
5294    }
5295    s++;
5296    while (--len > 0) {
5297        if (Py_UNICODE_ISUPPER(*s)) {
5298            *s = Py_UNICODE_TOLOWER(*s);
5299            status = 1;
5300        }
5301        s++;
5302    }
5303    return status;
5304}
5305
5306static
5307int fixtitle(PyUnicodeObject *self)
5308{
5309    register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5310    register Py_UNICODE *e;
5311    int previous_is_cased;
5312
5313    /* Shortcut for single character strings */
5314    if (PyUnicode_GET_SIZE(self) == 1) {
5315	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5316	if (*p != ch) {
5317	    *p = ch;
5318	    return 1;
5319	}
5320	else
5321	    return 0;
5322    }
5323
5324    e = p + PyUnicode_GET_SIZE(self);
5325    previous_is_cased = 0;
5326    for (; p < e; p++) {
5327	register const Py_UNICODE ch = *p;
5328
5329	if (previous_is_cased)
5330	    *p = Py_UNICODE_TOLOWER(ch);
5331	else
5332	    *p = Py_UNICODE_TOTITLE(ch);
5333
5334	if (Py_UNICODE_ISLOWER(ch) ||
5335	    Py_UNICODE_ISUPPER(ch) ||
5336	    Py_UNICODE_ISTITLE(ch))
5337	    previous_is_cased = 1;
5338	else
5339	    previous_is_cased = 0;
5340    }
5341    return 1;
5342}
5343
5344PyObject *
5345PyUnicode_Join(PyObject *separator, PyObject *seq)
5346{
5347    PyObject *internal_separator = NULL;
5348    const Py_UNICODE blank = ' ';
5349    const Py_UNICODE *sep = &blank;
5350    Py_ssize_t seplen = 1;
5351    PyUnicodeObject *res = NULL; /* the result */
5352    Py_ssize_t res_alloc = 100;  /* # allocated bytes for string in res */
5353    Py_ssize_t res_used;         /* # used bytes */
5354    Py_UNICODE *res_p;       /* pointer to free byte in res's string area */
5355    PyObject *fseq;          /* PySequence_Fast(seq) */
5356    Py_ssize_t seqlen;              /* len(fseq) -- number of items in sequence */
5357    PyObject *item;
5358    Py_ssize_t i;
5359
5360    fseq = PySequence_Fast(seq, "");
5361    if (fseq == NULL) {
5362    	return NULL;
5363    }
5364
5365    /* Grrrr.  A codec may be invoked to convert str objects to
5366     * Unicode, and so it's possible to call back into Python code
5367     * during PyUnicode_FromObject(), and so it's possible for a sick
5368     * codec to change the size of fseq (if seq is a list).  Therefore
5369     * we have to keep refetching the size -- can't assume seqlen
5370     * is invariant.
5371     */
5372    seqlen = PySequence_Fast_GET_SIZE(fseq);
5373    /* If empty sequence, return u"". */
5374    if (seqlen == 0) {
5375    	res = _PyUnicode_New(0);  /* empty sequence; return u"" */
5376    	goto Done;
5377    }
5378    /* If singleton sequence with an exact Unicode, return that. */
5379    if (seqlen == 1) {
5380	item = PySequence_Fast_GET_ITEM(fseq, 0);
5381	if (PyUnicode_CheckExact(item)) {
5382	    Py_INCREF(item);
5383	    res = (PyUnicodeObject *)item;
5384	    goto Done;
5385	}
5386    }
5387
5388    /* At least two items to join, or one that isn't exact Unicode. */
5389    if (seqlen > 1) {
5390        /* Set up sep and seplen -- they're needed. */
5391    	if (separator == NULL) {
5392	    sep = &blank;
5393	    seplen = 1;
5394        }
5395    	else {
5396	    internal_separator = PyUnicode_FromObject(separator);
5397	    if (internal_separator == NULL)
5398	        goto onError;
5399	    sep = PyUnicode_AS_UNICODE(internal_separator);
5400	    seplen = PyUnicode_GET_SIZE(internal_separator);
5401	    /* In case PyUnicode_FromObject() mutated seq. */
5402	    seqlen = PySequence_Fast_GET_SIZE(fseq);
5403        }
5404    }
5405
5406    /* Get space. */
5407    res = _PyUnicode_New(res_alloc);
5408    if (res == NULL)
5409        goto onError;
5410    res_p = PyUnicode_AS_UNICODE(res);
5411    res_used = 0;
5412
5413    for (i = 0; i < seqlen; ++i) {
5414	Py_ssize_t itemlen;
5415	Py_ssize_t new_res_used;
5416
5417	item = PySequence_Fast_GET_ITEM(fseq, i);
5418	/* Convert item to Unicode. */
5419	if (!PyString_Check(item) && !PyUnicode_Check(item))
5420	{
5421		if (PyBytes_Check(item))
5422		{
5423			PyErr_Format(PyExc_TypeError,
5424                            "sequence item %d: join() will not operate on "
5425                            "bytes objects", i);
5426			goto onError;
5427		}
5428		item = PyObject_Unicode(item);
5429	}
5430	else
5431		item = PyUnicode_FromObject(item);
5432
5433	if (item == NULL)
5434	    goto onError;
5435	/* We own a reference to item from here on. */
5436
5437	/* In case PyUnicode_FromObject() mutated seq. */
5438	seqlen = PySequence_Fast_GET_SIZE(fseq);
5439
5440        /* Make sure we have enough space for the separator and the item. */
5441	itemlen = PyUnicode_GET_SIZE(item);
5442	new_res_used = res_used + itemlen;
5443	if (new_res_used < 0)
5444	    goto Overflow;
5445	if (i < seqlen - 1) {
5446	    new_res_used += seplen;
5447	    if (new_res_used < 0)
5448		goto Overflow;
5449	}
5450	if (new_res_used > res_alloc) {
5451	    /* double allocated size until it's big enough */
5452	    do {
5453	        res_alloc += res_alloc;
5454	        if (res_alloc <= 0)
5455	            goto Overflow;
5456	    } while (new_res_used > res_alloc);
5457	    if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5458		Py_DECREF(item);
5459		goto onError;
5460	    }
5461            res_p = PyUnicode_AS_UNICODE(res) + res_used;
5462	}
5463
5464	/* Copy item, and maybe the separator. */
5465	Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5466	res_p += itemlen;
5467	if (i < seqlen - 1) {
5468	    Py_UNICODE_COPY(res_p, sep, seplen);
5469	    res_p += seplen;
5470	}
5471	Py_DECREF(item);
5472	res_used = new_res_used;
5473    }
5474
5475    /* Shrink res to match the used area; this probably can't fail,
5476     * but it's cheap to check.
5477     */
5478    if (_PyUnicode_Resize(&res, res_used) < 0)
5479	goto onError;
5480
5481 Done:
5482    Py_XDECREF(internal_separator);
5483    Py_DECREF(fseq);
5484    return (PyObject *)res;
5485
5486 Overflow:
5487    PyErr_SetString(PyExc_OverflowError,
5488                    "join() result is too long for a Python string");
5489    Py_DECREF(item);
5490    /* fall through */
5491
5492 onError:
5493    Py_XDECREF(internal_separator);
5494    Py_DECREF(fseq);
5495    Py_XDECREF(res);
5496    return NULL;
5497}
5498
5499static
5500PyUnicodeObject *pad(PyUnicodeObject *self,
5501		     Py_ssize_t left,
5502		     Py_ssize_t right,
5503		     Py_UNICODE fill)
5504{
5505    PyUnicodeObject *u;
5506
5507    if (left < 0)
5508        left = 0;
5509    if (right < 0)
5510        right = 0;
5511
5512    if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
5513        Py_INCREF(self);
5514        return self;
5515    }
5516
5517    u = _PyUnicode_New(left + self->length + right);
5518    if (u) {
5519        if (left)
5520            Py_UNICODE_FILL(u->str, fill, left);
5521        Py_UNICODE_COPY(u->str + left, self->str, self->length);
5522        if (right)
5523            Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5524    }
5525
5526    return u;
5527}
5528
/* Append the slice data[left:right] to the caller's `list` as a new
   Unicode object.

   Relies on the enclosing function providing a `PyObject *str` scratch
   variable, a `list` variable and an `onError` label, which is jumped to
   when either the allocation or the append fails.  The temporary
   reference held in `str` is always released.

   Wrapped in do { ... } while (0) so that it expands to exactly one
   statement; use it as `SPLIT_APPEND(data, left, right);`. */
#define SPLIT_APPEND(data, left, right)					\
    do {								\
	str = PyUnicode_FromUnicode((data) + (left), (right) - (left));	\
	if (!str)							\
	    goto onError;						\
	if (PyList_Append(list, str)) {					\
	    Py_DECREF(str);						\
	    goto onError;						\
	}								\
	Py_DECREF(str);							\
    } while (0)
5539
5540static
5541PyObject *split_whitespace(PyUnicodeObject *self,
5542			   PyObject *list,
5543			   Py_ssize_t maxcount)
5544{
5545    register Py_ssize_t i;
5546    register Py_ssize_t j;
5547    Py_ssize_t len = self->length;
5548    PyObject *str;
5549
5550    for (i = j = 0; i < len; ) {
5551	/* find a token */
5552	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5553	    i++;
5554	j = i;
5555	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5556	    i++;
5557	if (j < i) {
5558	    if (maxcount-- <= 0)
5559		break;
5560	    SPLIT_APPEND(self->str, j, i);
5561	    while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5562		i++;
5563	    j = i;
5564	}
5565    }
5566    if (j < len) {
5567	SPLIT_APPEND(self->str, j, len);
5568    }
5569    return list;
5570
5571 onError:
5572    Py_DECREF(list);
5573    return NULL;
5574}
5575
5576PyObject *PyUnicode_Splitlines(PyObject *string,
5577			       int keepends)
5578{
5579    register Py_ssize_t i;
5580    register Py_ssize_t j;
5581    Py_ssize_t len;
5582    PyObject *list;
5583    PyObject *str;
5584    Py_UNICODE *data;
5585
5586    string = PyUnicode_FromObject(string);
5587    if (string == NULL)
5588	return NULL;
5589    data = PyUnicode_AS_UNICODE(string);
5590    len = PyUnicode_GET_SIZE(string);
5591
5592    list = PyList_New(0);
5593    if (!list)
5594        goto onError;
5595
5596    for (i = j = 0; i < len; ) {
5597	Py_ssize_t eol;
5598
5599	/* Find a line and append it */
5600	while (i < len && !BLOOM_LINEBREAK(data[i]))
5601	    i++;
5602
5603	/* Skip the line break reading CRLF as one line break */
5604	eol = i;
5605	if (i < len) {
5606	    if (data[i] == '\r' && i + 1 < len &&
5607		data[i+1] == '\n')
5608		i += 2;
5609	    else
5610		i++;
5611	    if (keepends)
5612		eol = i;
5613	}
5614	SPLIT_APPEND(data, j, eol);
5615	j = i;
5616    }
5617    if (j < len) {
5618	SPLIT_APPEND(data, j, len);
5619    }
5620
5621    Py_DECREF(string);
5622    return list;
5623
5624 onError:
5625    Py_XDECREF(list);
5626    Py_DECREF(string);
5627    return NULL;
5628}
5629
5630static
5631PyObject *split_char(PyUnicodeObject *self,
5632		     PyObject *list,
5633		     Py_UNICODE ch,
5634		     Py_ssize_t maxcount)
5635{
5636    register Py_ssize_t i;
5637    register Py_ssize_t j;
5638    Py_ssize_t len = self->length;
5639    PyObject *str;
5640
5641    for (i = j = 0; i < len; ) {
5642	if (self->str[i] == ch) {
5643	    if (maxcount-- <= 0)
5644		break;
5645	    SPLIT_APPEND(self->str, j, i);
5646	    i = j = i + 1;
5647	} else
5648	    i++;
5649    }
5650    if (j <= len) {
5651	SPLIT_APPEND(self->str, j, len);
5652    }
5653    return list;
5654
5655 onError:
5656    Py_DECREF(list);
5657    return NULL;
5658}
5659
5660static
5661PyObject *split_substring(PyUnicodeObject *self,
5662			  PyObject *list,
5663			  PyUnicodeObject *substring,
5664			  Py_ssize_t maxcount)
5665{
5666    register Py_ssize_t i;
5667    register Py_ssize_t j;
5668    Py_ssize_t len = self->length;
5669    Py_ssize_t sublen = substring->length;
5670    PyObject *str;
5671
5672    for (i = j = 0; i <= len - sublen; ) {
5673	if (Py_UNICODE_MATCH(self, i, substring)) {
5674	    if (maxcount-- <= 0)
5675		break;
5676	    SPLIT_APPEND(self->str, j, i);
5677	    i = j = i + sublen;
5678	} else
5679	    i++;
5680    }
5681    if (j <= len) {
5682	SPLIT_APPEND(self->str, j, len);
5683    }
5684    return list;
5685
5686 onError:
5687    Py_DECREF(list);
5688    return NULL;
5689}
5690
5691static
5692PyObject *rsplit_whitespace(PyUnicodeObject *self,
5693			    PyObject *list,
5694			    Py_ssize_t maxcount)
5695{
5696    register Py_ssize_t i;
5697    register Py_ssize_t j;
5698    Py_ssize_t len = self->length;
5699    PyObject *str;
5700
5701    for (i = j = len - 1; i >= 0; ) {
5702	/* find a token */
5703	while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5704	    i--;
5705	j = i;
5706	while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5707	    i--;
5708	if (j > i) {
5709	    if (maxcount-- <= 0)
5710		break;
5711	    SPLIT_APPEND(self->str, i + 1, j + 1);
5712	    while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5713		i--;
5714	    j = i;
5715	}
5716    }
5717    if (j >= 0) {
5718	SPLIT_APPEND(self->str, 0, j + 1);
5719    }
5720    if (PyList_Reverse(list) < 0)
5721        goto onError;
5722    return list;
5723
5724 onError:
5725    Py_DECREF(list);
5726    return NULL;
5727}
5728
5729static
5730PyObject *rsplit_char(PyUnicodeObject *self,
5731		      PyObject *list,
5732		      Py_UNICODE ch,
5733		      Py_ssize_t maxcount)
5734{
5735    register Py_ssize_t i;
5736    register Py_ssize_t j;
5737    Py_ssize_t len = self->length;
5738    PyObject *str;
5739
5740    for (i = j = len - 1; i >= 0; ) {
5741	if (self->str[i] == ch) {
5742	    if (maxcount-- <= 0)
5743		break;
5744	    SPLIT_APPEND(self->str, i + 1, j + 1);
5745	    j = i = i - 1;
5746	} else
5747	    i--;
5748    }
5749    if (j >= -1) {
5750	SPLIT_APPEND(self->str, 0, j + 1);
5751    }
5752    if (PyList_Reverse(list) < 0)
5753        goto onError;
5754    return list;
5755
5756 onError:
5757    Py_DECREF(list);
5758    return NULL;
5759}
5760
5761static
5762PyObject *rsplit_substring(PyUnicodeObject *self,
5763			   PyObject *list,
5764			   PyUnicodeObject *substring,
5765			   Py_ssize_t maxcount)
5766{
5767    register Py_ssize_t i;
5768    register Py_ssize_t j;
5769    Py_ssize_t len = self->length;
5770    Py_ssize_t sublen = substring->length;
5771    PyObject *str;
5772
5773    for (i = len - sublen, j = len; i >= 0; ) {
5774	if (Py_UNICODE_MATCH(self, i, substring)) {
5775	    if (maxcount-- <= 0)
5776		break;
5777	    SPLIT_APPEND(self->str, i + sublen, j);
5778	    j = i;
5779	    i -= sublen;
5780	} else
5781	    i--;
5782    }
5783    if (j >= 0) {
5784	SPLIT_APPEND(self->str, 0, j);
5785    }
5786    if (PyList_Reverse(list) < 0)
5787        goto onError;
5788    return list;
5789
5790 onError:
5791    Py_DECREF(list);
5792    return NULL;
5793}
5794
5795#undef SPLIT_APPEND
5796
5797static
5798PyObject *split(PyUnicodeObject *self,
5799		PyUnicodeObject *substring,
5800		Py_ssize_t maxcount)
5801{
5802    PyObject *list;
5803
5804    if (maxcount < 0)
5805        maxcount = PY_SSIZE_T_MAX;
5806
5807    list = PyList_New(0);
5808    if (!list)
5809        return NULL;
5810
5811    if (substring == NULL)
5812	return split_whitespace(self,list,maxcount);
5813
5814    else if (substring->length == 1)
5815	return split_char(self,list,substring->str[0],maxcount);
5816
5817    else if (substring->length == 0) {
5818	Py_DECREF(list);
5819	PyErr_SetString(PyExc_ValueError, "empty separator");
5820	return NULL;
5821    }
5822    else
5823	return split_substring(self,list,substring,maxcount);
5824}
5825
5826static
5827PyObject *rsplit(PyUnicodeObject *self,
5828		 PyUnicodeObject *substring,
5829		 Py_ssize_t maxcount)
5830{
5831    PyObject *list;
5832
5833    if (maxcount < 0)
5834        maxcount = PY_SSIZE_T_MAX;
5835
5836    list = PyList_New(0);
5837    if (!list)
5838        return NULL;
5839
5840    if (substring == NULL)
5841	return rsplit_whitespace(self,list,maxcount);
5842
5843    else if (substring->length == 1)
5844	return rsplit_char(self,list,substring->str[0],maxcount);
5845
5846    else if (substring->length == 0) {
5847	Py_DECREF(list);
5848	PyErr_SetString(PyExc_ValueError, "empty separator");
5849	return NULL;
5850    }
5851    else
5852	return rsplit_substring(self,list,substring,maxcount);
5853}
5854
5855static
5856PyObject *replace(PyUnicodeObject *self,
5857		  PyUnicodeObject *str1,
5858		  PyUnicodeObject *str2,
5859		  Py_ssize_t maxcount)
5860{
5861    PyUnicodeObject *u;
5862
5863    if (maxcount < 0)
5864	maxcount = PY_SSIZE_T_MAX;
5865
5866    if (str1->length == str2->length) {
5867        /* same length */
5868        Py_ssize_t i;
5869        if (str1->length == 1) {
5870            /* replace characters */
5871            Py_UNICODE u1, u2;
5872            if (!findchar(self->str, self->length, str1->str[0]))
5873                goto nothing;
5874            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5875            if (!u)
5876                return NULL;
5877            Py_UNICODE_COPY(u->str, self->str, self->length);
5878            u1 = str1->str[0];
5879            u2 = str2->str[0];
5880            for (i = 0; i < u->length; i++)
5881                if (u->str[i] == u1) {
5882                    if (--maxcount < 0)
5883                        break;
5884                    u->str[i] = u2;
5885                }
5886        } else {
5887            i = fastsearch(
5888                self->str, self->length, str1->str, str1->length, FAST_SEARCH
5889                );
5890            if (i < 0)
5891                goto nothing;
5892            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5893            if (!u)
5894                return NULL;
5895            Py_UNICODE_COPY(u->str, self->str, self->length);
5896            while (i <= self->length - str1->length)
5897                if (Py_UNICODE_MATCH(self, i, str1)) {
5898                    if (--maxcount < 0)
5899                        break;
5900                    Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5901                    i += str1->length;
5902                } else
5903                    i++;
5904        }
5905    } else {
5906
5907        Py_ssize_t n, i, j, e;
5908        Py_ssize_t product, new_size, delta;
5909        Py_UNICODE *p;
5910
5911        /* replace strings */
5912        n = stringlib_count(self->str, self->length, str1->str, str1->length);
5913        if (n > maxcount)
5914            n = maxcount;
5915        if (n == 0)
5916            goto nothing;
5917        /* new_size = self->length + n * (str2->length - str1->length)); */
5918        delta = (str2->length - str1->length);
5919        if (delta == 0) {
5920            new_size = self->length;
5921        } else {
5922            product = n * (str2->length - str1->length);
5923            if ((product / (str2->length - str1->length)) != n) {
5924                PyErr_SetString(PyExc_OverflowError,
5925                                "replace string is too long");
5926                return NULL;
5927            }
5928            new_size = self->length + product;
5929            if (new_size < 0) {
5930                PyErr_SetString(PyExc_OverflowError,
5931                                "replace string is too long");
5932                return NULL;
5933            }
5934        }
5935        u = _PyUnicode_New(new_size);
5936        if (!u)
5937            return NULL;
5938        i = 0;
5939        p = u->str;
5940        e = self->length - str1->length;
5941        if (str1->length > 0) {
5942            while (n-- > 0) {
5943                /* look for next match */
5944                j = i;
5945                while (j <= e) {
5946                    if (Py_UNICODE_MATCH(self, j, str1))
5947                        break;
5948                    j++;
5949                }
5950		if (j > i) {
5951                    if (j > e)
5952                        break;
5953                    /* copy unchanged part [i:j] */
5954                    Py_UNICODE_COPY(p, self->str+i, j-i);
5955                    p += j - i;
5956                }
5957                /* copy substitution string */
5958                if (str2->length > 0) {
5959                    Py_UNICODE_COPY(p, str2->str, str2->length);
5960                    p += str2->length;
5961                }
5962                i = j + str1->length;
5963            }
5964            if (i < self->length)
5965                /* copy tail [i:] */
5966                Py_UNICODE_COPY(p, self->str+i, self->length-i);
5967        } else {
5968            /* interleave */
5969            while (n > 0) {
5970                Py_UNICODE_COPY(p, str2->str, str2->length);
5971                p += str2->length;
5972                if (--n <= 0)
5973                    break;
5974                *p++ = self->str[i++];
5975            }
5976            Py_UNICODE_COPY(p, self->str+i, self->length-i);
5977        }
5978    }
5979    return (PyObject *) u;
5980
5981nothing:
5982    /* nothing to replace; return original string (when possible) */
5983    if (PyUnicode_CheckExact(self)) {
5984        Py_INCREF(self);
5985        return (PyObject *) self;
5986    }
5987    return PyUnicode_FromUnicode(self->str, self->length);
5988}
5989
5990/* --- Unicode Object Methods --------------------------------------------- */
5991
5992PyDoc_STRVAR(title__doc__,
5993"S.title() -> unicode\n\
5994\n\
5995Return a titlecased version of S, i.e. words start with title case\n\
5996characters, all remaining cased characters have lower case.");
5997
5998static PyObject*
5999unicode_title(PyUnicodeObject *self)
6000{
6001    return fixup(self, fixtitle);
6002}
6003
6004PyDoc_STRVAR(capitalize__doc__,
6005"S.capitalize() -> unicode\n\
6006\n\
6007Return a capitalized version of S, i.e. make the first character\n\
6008have upper case.");
6009
6010static PyObject*
6011unicode_capitalize(PyUnicodeObject *self)
6012{
6013    return fixup(self, fixcapitalize);
6014}
6015
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled code: split self on whitespace, capitalize every word with
   fixup()/fixcapitalize and join the words again with the default
   separator of PyUnicode_Join().  Returns NULL on error. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *word = PyList_GET_ITEM(words, idx);
        PyObject *capitalized = fixup((PyUnicodeObject *)word,
                                      fixcapitalize);

        if (capitalized == NULL)
            goto done;
        /* Swap the list slot over to the capitalized word. */
        Py_DECREF(word);
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return joined;
}
#endif
6053
6054/* Argument converter.  Coerces to a single unicode character */
6055
6056static int
6057convert_uc(PyObject *obj, void *addr)
6058{
6059	Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6060	PyObject *uniobj;
6061	Py_UNICODE *unistr;
6062
6063	uniobj = PyUnicode_FromObject(obj);
6064	if (uniobj == NULL) {
6065		PyErr_SetString(PyExc_TypeError,
6066			"The fill character cannot be converted to Unicode");
6067		return 0;
6068	}
6069	if (PyUnicode_GET_SIZE(uniobj) != 1) {
6070		PyErr_SetString(PyExc_TypeError,
6071			"The fill character must be exactly one character long");
6072		Py_DECREF(uniobj);
6073		return 0;
6074	}
6075	unistr = PyUnicode_AS_UNICODE(uniobj);
6076	*fillcharloc = unistr[0];
6077	Py_DECREF(uniobj);
6078	return 1;
6079}
6080
6081PyDoc_STRVAR(center__doc__,
6082"S.center(width[, fillchar]) -> unicode\n\
6083\n\
6084Return S centered in a Unicode string of length width. Padding is\n\
6085done using the specified fill character (default is a space)");
6086
6087static PyObject *
6088unicode_center(PyUnicodeObject *self, PyObject *args)
6089{
6090    Py_ssize_t marg, left;
6091    Py_ssize_t width;
6092    Py_UNICODE fillchar = ' ';
6093
6094    if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
6095        return NULL;
6096
6097    if (self->length >= width && PyUnicode_CheckExact(self)) {
6098        Py_INCREF(self);
6099        return (PyObject*) self;
6100    }
6101
6102    marg = width - self->length;
6103    left = marg / 2 + (marg & width & 1);
6104
6105    return (PyObject*) pad(self, left, marg - left, fillchar);
6106}
6107
6108#if 0
6109
6110/* This code should go into some future Unicode collation support
6111   module. The basic comparison should compare ordinals on a naive
6112   basis (this is what Java does and thus JPython too). */
6113
6114/* speedy UTF-16 code point order comparison */
6115/* gleaned from: */
6116/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6117
6118static short utf16Fixup[32] =
6119{
6120    0, 0, 0, 0, 0, 0, 0, 0,
6121    0, 0, 0, 0, 0, 0, 0, 0,
6122    0, 0, 0, 0, 0, 0, 0, 0,
6123    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
6124};
6125
6126static int
6127unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6128{
6129    Py_ssize_t len1, len2;
6130
6131    Py_UNICODE *s1 = str1->str;
6132    Py_UNICODE *s2 = str2->str;
6133
6134    len1 = str1->length;
6135    len2 = str2->length;
6136
6137    while (len1 > 0 && len2 > 0) {
6138        Py_UNICODE c1, c2;
6139
6140        c1 = *s1++;
6141        c2 = *s2++;
6142
6143	if (c1 > (1<<11) * 26)
6144	    c1 += utf16Fixup[c1>>11];
6145	if (c2 > (1<<11) * 26)
6146            c2 += utf16Fixup[c2>>11];
6147        /* now c1 and c2 are in UTF-32-compatible order */
6148
6149        if (c1 != c2)
6150            return (c1 < c2) ? -1 : 1;
6151
6152        len1--; len2--;
6153    }
6154
6155    return (len1 < len2) ? -1 : (len1 != len2);
6156}
6157
6158#else
6159
6160static int
6161unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6162{
6163    register Py_ssize_t len1, len2;
6164
6165    Py_UNICODE *s1 = str1->str;
6166    Py_UNICODE *s2 = str2->str;
6167
6168    len1 = str1->length;
6169    len2 = str2->length;
6170
6171    while (len1 > 0 && len2 > 0) {
6172        Py_UNICODE c1, c2;
6173
6174        c1 = *s1++;
6175        c2 = *s2++;
6176
6177        if (c1 != c2)
6178            return (c1 < c2) ? -1 : 1;
6179
6180        len1--; len2--;
6181    }
6182
6183    return (len1 < len2) ? -1 : (len1 != len2);
6184}
6185
6186#endif
6187
6188int PyUnicode_Compare(PyObject *left,
6189		      PyObject *right)
6190{
6191    if (PyUnicode_Check(left) && PyUnicode_Check(right))
6192        return unicode_compare((PyUnicodeObject *)left,
6193                               (PyUnicodeObject *)right);
6194    if ((PyString_Check(left) && PyUnicode_Check(right)) ||
6195        (PyUnicode_Check(left) && PyString_Check(right))) {
6196        if (PyUnicode_Check(left))
6197            left = _PyUnicode_AsDefaultEncodedString(left, NULL);
6198        if (PyUnicode_Check(right))
6199            right = _PyUnicode_AsDefaultEncodedString(right, NULL);
6200        assert(PyString_Check(left));
6201        assert(PyString_Check(right));
6202        return PyObject_Compare(left, right);
6203    }
6204    PyErr_Format(PyExc_TypeError,
6205                 "Can't compare %.100s and %.100s",
6206                 left->ob_type->tp_name,
6207                 right->ob_type->tp_name);
6208    return -1;
6209}
6210
6211int
6212PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6213{
6214    int i;
6215    Py_UNICODE *id;
6216    assert(PyUnicode_Check(uni));
6217    id = PyUnicode_AS_UNICODE(uni);
6218    /* Compare Unicode string and source character set string */
6219    for (i = 0; id[i] && str[i]; i++)
6220	if (id[i] != str[i])
6221	    return ((int)id[i] < (int)str[i]) ? -1 : 1;
6222    if (id[i])
6223	return 1; /* uni is longer */
6224    if (str[i])
6225	return -1; /* str is longer */
6226    return 0;
6227}
6228
6229PyObject *PyUnicode_RichCompare(PyObject *left,
6230                                PyObject *right,
6231                                int op)
6232{
6233    int result;
6234
6235    result = PyUnicode_Compare(left, right);
6236    if (result == -1 && PyErr_Occurred())
6237        goto onError;
6238
6239    /* Convert the return value to a Boolean */
6240    switch (op) {
6241    case Py_EQ:
6242        result = (result == 0);
6243        break;
6244    case Py_NE:
6245        result = (result != 0);
6246        break;
6247    case Py_LE:
6248        result = (result <= 0);
6249        break;
6250    case Py_GE:
6251        result = (result >= 0);
6252        break;
6253    case Py_LT:
6254        result = (result == -1);
6255        break;
6256    case Py_GT:
6257        result = (result == 1);
6258        break;
6259    }
6260    return PyBool_FromLong(result);
6261
6262 onError:
6263
6264    /* Standard case
6265
6266       Type errors mean that PyUnicode_FromObject() could not convert
6267       one of the arguments (usually the right hand side) to Unicode,
6268       ie. we can't handle the comparison request. However, it is
6269       possible that the other object knows a comparison method, which
6270       is why we return Py_NotImplemented to give the other object a
6271       chance.
6272
6273    */
6274    if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6275        PyErr_Clear();
6276        Py_INCREF(Py_NotImplemented);
6277        return Py_NotImplemented;
6278    }
6279    if (op != Py_EQ && op != Py_NE)
6280        return NULL;
6281
6282    /* Equality comparison.
6283
6284       This is a special case: we silence any PyExc_UnicodeDecodeError
6285       and instead turn it into a PyErr_UnicodeWarning.
6286
6287    */
6288    if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6289        return NULL;
6290    PyErr_Clear();
6291    if (PyErr_WarnEx(PyExc_UnicodeWarning,
6292                     (op == Py_EQ) ?
6293                     "Unicode equal comparison "
6294                     "failed to convert both arguments to Unicode - "
6295                     "interpreting them as being unequal"
6296                     :
6297                     "Unicode unequal comparison "
6298                     "failed to convert both arguments to Unicode - "
6299                     "interpreting them as being unequal",
6300                     1) < 0)
6301        return NULL;
6302    result = (op == Py_NE);
6303    return PyBool_FromLong(result);
6304}
6305
6306int PyUnicode_Contains(PyObject *container,
6307		       PyObject *element)
6308{
6309    PyObject *str, *sub;
6310    int result;
6311
6312    /* Coerce the two arguments */
6313    sub = PyUnicode_FromObject(element);
6314    if (!sub) {
6315	PyErr_Format(PyExc_TypeError,
6316	    "'in <string>' requires string as left operand, not %s",
6317	    element->ob_type->tp_name);
6318        return -1;
6319    }
6320
6321    str = PyUnicode_FromObject(container);
6322    if (!str) {
6323        Py_DECREF(sub);
6324        return -1;
6325    }
6326
6327    result = stringlib_contains_obj(str, sub);
6328
6329    Py_DECREF(str);
6330    Py_DECREF(sub);
6331
6332    return result;
6333}
6334
6335/* Concat to string or Unicode object giving a new Unicode object. */
6336
6337PyObject *PyUnicode_Concat(PyObject *left,
6338			   PyObject *right)
6339{
6340    PyUnicodeObject *u = NULL, *v = NULL, *w;
6341
6342    if (PyBytes_Check(left) || PyBytes_Check(right))
6343        return PyBytes_Concat(left, right);
6344
6345    /* Coerce the two arguments */
6346    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6347    if (u == NULL)
6348	goto onError;
6349    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6350    if (v == NULL)
6351	goto onError;
6352
6353    /* Shortcuts */
6354    if (v == unicode_empty) {
6355	Py_DECREF(v);
6356	return (PyObject *)u;
6357    }
6358    if (u == unicode_empty) {
6359	Py_DECREF(u);
6360	return (PyObject *)v;
6361    }
6362
6363    /* Concat the two Unicode strings */
6364    w = _PyUnicode_New(u->length + v->length);
6365    if (w == NULL)
6366	goto onError;
6367    Py_UNICODE_COPY(w->str, u->str, u->length);
6368    Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6369
6370    Py_DECREF(u);
6371    Py_DECREF(v);
6372    return (PyObject *)w;
6373
6374onError:
6375    Py_XDECREF(u);
6376    Py_XDECREF(v);
6377    return NULL;
6378}
6379
6380void
6381PyUnicode_Append(PyObject **pleft, PyObject *right)
6382{
6383	PyObject *new;
6384	if (*pleft == NULL)
6385		return;
6386	if (right == NULL || !PyUnicode_Check(*pleft)) {
6387		Py_DECREF(*pleft);
6388		*pleft = NULL;
6389		return;
6390	}
6391	new = PyUnicode_Concat(*pleft, right);
6392	Py_DECREF(*pleft);
6393	*pleft = new;
6394}
6395
6396void
6397PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6398{
6399	PyUnicode_Append(pleft, right);
6400	Py_XDECREF(right);
6401}
6402
6403PyDoc_STRVAR(count__doc__,
6404"S.count(sub[, start[, end]]) -> int\n\
6405\n\
6406Return the number of non-overlapping occurrences of substring sub in\n\
6407Unicode string S[start:end].  Optional arguments start and end are\n\
6408interpreted as in slice notation.");
6409
6410static PyObject *
6411unicode_count(PyUnicodeObject *self, PyObject *args)
6412{
6413    PyUnicodeObject *substring;
6414    Py_ssize_t start = 0;
6415    Py_ssize_t end = PY_SSIZE_T_MAX;
6416    PyObject *result;
6417
6418    if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6419		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6420        return NULL;
6421
6422    substring = (PyUnicodeObject *)PyUnicode_FromObject(
6423        (PyObject *)substring);
6424    if (substring == NULL)
6425	return NULL;
6426
6427    FIX_START_END(self);
6428
6429    result = PyInt_FromSsize_t(
6430        stringlib_count(self->str + start, end - start,
6431                        substring->str, substring->length)
6432        );
6433
6434    Py_DECREF(substring);
6435
6436    return result;
6437}
6438
6439PyDoc_STRVAR(encode__doc__,
6440"S.encode([encoding[,errors]]) -> string or unicode\n\
6441\n\
6442Encodes S using the codec registered for encoding. encoding defaults\n\
6443to the default encoding. errors may be given to set a different error\n\
6444handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6445a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6446'xmlcharrefreplace' as well as any other name registered with\n\
6447codecs.register_error that can handle UnicodeEncodeErrors.");
6448
6449static PyObject *
6450unicode_encode(PyUnicodeObject *self, PyObject *args)
6451{
6452    char *encoding = NULL;
6453    char *errors = NULL;
6454    PyObject *v;
6455
6456    if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6457        return NULL;
6458    v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
6459    if (v == NULL)
6460        goto onError;
6461    if (!PyBytes_Check(v)) {
6462        PyErr_Format(PyExc_TypeError,
6463                     "encoder did not return a bytes object "
6464                     "(type=%.400s)",
6465                     Py_Type(v)->tp_name);
6466        Py_DECREF(v);
6467        return NULL;
6468    }
6469    return v;
6470
6471 onError:
6472    return NULL;
6473}
6474
6475PyDoc_STRVAR(expandtabs__doc__,
6476"S.expandtabs([tabsize]) -> unicode\n\
6477\n\
6478Return a copy of S where all tab characters are expanded using spaces.\n\
6479If tabsize is not given, a tab size of 8 characters is assumed.");
6480
6481static PyObject*
6482unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6483{
6484    Py_UNICODE *e;
6485    Py_UNICODE *p;
6486    Py_UNICODE *q;
6487    Py_ssize_t i, j, old_j;
6488    PyUnicodeObject *u;
6489    int tabsize = 8;
6490
6491    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6492	return NULL;
6493
6494    /* First pass: determine size of output string */
6495    i = j = old_j = 0;
6496    e = self->str + self->length;
6497    for (p = self->str; p < e; p++)
6498        if (*p == '\t') {
6499	    if (tabsize > 0) {
6500		j += tabsize - (j % tabsize);
6501		if (old_j > j) {
6502		    PyErr_SetString(PyExc_OverflowError,
6503				    "new string is too long");
6504		    return NULL;
6505		}
6506		old_j = j;
6507	    }
6508	}
6509        else {
6510            j++;
6511            if (*p == '\n' || *p == '\r') {
6512                i += j;
6513                old_j = j = 0;
6514                if (i < 0) {
6515                    PyErr_SetString(PyExc_OverflowError,
6516                                    "new string is too long");
6517                    return NULL;
6518                }
6519            }
6520        }
6521
6522    if ((i + j) < 0) {
6523        PyErr_SetString(PyExc_OverflowError, "new string is too long");
6524        return NULL;
6525    }
6526
6527    /* Second pass: create output string and fill it */
6528    u = _PyUnicode_New(i + j);
6529    if (!u)
6530        return NULL;
6531
6532    j = 0;
6533    q = u->str;
6534
6535    for (p = self->str; p < e; p++)
6536        if (*p == '\t') {
6537	    if (tabsize > 0) {
6538		i = tabsize - (j % tabsize);
6539		j += i;
6540		while (i--)
6541		    *q++ = ' ';
6542	    }
6543	}
6544	else {
6545            j++;
6546	    *q++ = *p;
6547            if (*p == '\n' || *p == '\r')
6548                j = 0;
6549        }
6550
6551    return (PyObject*) u;
6552}
6553
6554PyDoc_STRVAR(find__doc__,
6555"S.find(sub [,start [,end]]) -> int\n\
6556\n\
6557Return the lowest index in S where substring sub is found,\n\
6558such that sub is contained within s[start:end].  Optional\n\
6559arguments start and end are interpreted as in slice notation.\n\
6560\n\
6561Return -1 on failure.");
6562
6563static PyObject *
6564unicode_find(PyUnicodeObject *self, PyObject *args)
6565{
6566    PyObject *substring;
6567    Py_ssize_t start = 0;
6568    Py_ssize_t end = PY_SSIZE_T_MAX;
6569    Py_ssize_t result;
6570
6571    if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
6572		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6573        return NULL;
6574    substring = PyUnicode_FromObject(substring);
6575    if (!substring)
6576	return NULL;
6577
6578    result = stringlib_find_slice(
6579        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6580        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6581        start, end
6582        );
6583
6584    Py_DECREF(substring);
6585
6586    return PyInt_FromSsize_t(result);
6587}
6588
6589static PyObject *
6590unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
6591{
6592    if (index < 0 || index >= self->length) {
6593        PyErr_SetString(PyExc_IndexError, "string index out of range");
6594        return NULL;
6595    }
6596
6597    return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6598}
6599
6600/* Believe it or not, this produces the same value for ASCII strings
6601   as string_hash(). */
6602static long
6603unicode_hash(PyUnicodeObject *self)
6604{
6605    Py_ssize_t len;
6606    Py_UNICODE *p;
6607    long x;
6608
6609    if (self->hash != -1)
6610        return self->hash;
6611    len = Py_Size(self);
6612    p = self->str;
6613    x = *p << 7;
6614    while (--len >= 0)
6615        x = (1000003*x) ^ *p++;
6616    x ^= Py_Size(self);
6617    if (x == -1)
6618        x = -2;
6619    self->hash = x;
6620    return x;
6621}
6622
6623PyDoc_STRVAR(index__doc__,
6624"S.index(sub [,start [,end]]) -> int\n\
6625\n\
6626Like S.find() but raise ValueError when the substring is not found.");
6627
6628static PyObject *
6629unicode_index(PyUnicodeObject *self, PyObject *args)
6630{
6631    Py_ssize_t result;
6632    PyObject *substring;
6633    Py_ssize_t start = 0;
6634    Py_ssize_t end = PY_SSIZE_T_MAX;
6635
6636    if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
6637		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6638        return NULL;
6639    substring = PyUnicode_FromObject(substring);
6640    if (!substring)
6641	return NULL;
6642
6643    result = stringlib_find_slice(
6644        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6645        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6646        start, end
6647        );
6648
6649    Py_DECREF(substring);
6650
6651    if (result < 0) {
6652        PyErr_SetString(PyExc_ValueError, "substring not found");
6653        return NULL;
6654    }
6655
6656    return PyInt_FromSsize_t(result);
6657}
6658
6659PyDoc_STRVAR(islower__doc__,
6660"S.islower() -> bool\n\
6661\n\
6662Return True if all cased characters in S are lowercase and there is\n\
6663at least one cased character in S, False otherwise.");
6664
6665static PyObject*
6666unicode_islower(PyUnicodeObject *self)
6667{
6668    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6669    register const Py_UNICODE *e;
6670    int cased;
6671
6672    /* Shortcut for single character strings */
6673    if (PyUnicode_GET_SIZE(self) == 1)
6674	return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
6675
6676    /* Special case for empty strings */
6677    if (PyUnicode_GET_SIZE(self) == 0)
6678	return PyBool_FromLong(0);
6679
6680    e = p + PyUnicode_GET_SIZE(self);
6681    cased = 0;
6682    for (; p < e; p++) {
6683	register const Py_UNICODE ch = *p;
6684
6685	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6686	    return PyBool_FromLong(0);
6687	else if (!cased && Py_UNICODE_ISLOWER(ch))
6688	    cased = 1;
6689    }
6690    return PyBool_FromLong(cased);
6691}
6692
6693PyDoc_STRVAR(isupper__doc__,
6694"S.isupper() -> bool\n\
6695\n\
6696Return True if all cased characters in S are uppercase and there is\n\
6697at least one cased character in S, False otherwise.");
6698
6699static PyObject*
6700unicode_isupper(PyUnicodeObject *self)
6701{
6702    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6703    register const Py_UNICODE *e;
6704    int cased;
6705
6706    /* Shortcut for single character strings */
6707    if (PyUnicode_GET_SIZE(self) == 1)
6708	return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
6709
6710    /* Special case for empty strings */
6711    if (PyUnicode_GET_SIZE(self) == 0)
6712	return PyBool_FromLong(0);
6713
6714    e = p + PyUnicode_GET_SIZE(self);
6715    cased = 0;
6716    for (; p < e; p++) {
6717	register const Py_UNICODE ch = *p;
6718
6719	if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6720	    return PyBool_FromLong(0);
6721	else if (!cased && Py_UNICODE_ISUPPER(ch))
6722	    cased = 1;
6723    }
6724    return PyBool_FromLong(cased);
6725}
6726
6727PyDoc_STRVAR(istitle__doc__,
6728"S.istitle() -> bool\n\
6729\n\
6730Return True if S is a titlecased string and there is at least one\n\
6731character in S, i.e. upper- and titlecase characters may only\n\
6732follow uncased characters and lowercase characters only cased ones.\n\
6733Return False otherwise.");
6734
6735static PyObject*
6736unicode_istitle(PyUnicodeObject *self)
6737{
6738    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6739    register const Py_UNICODE *e;
6740    int cased, previous_is_cased;
6741
6742    /* Shortcut for single character strings */
6743    if (PyUnicode_GET_SIZE(self) == 1)
6744	return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6745			       (Py_UNICODE_ISUPPER(*p) != 0));
6746
6747    /* Special case for empty strings */
6748    if (PyUnicode_GET_SIZE(self) == 0)
6749	return PyBool_FromLong(0);
6750
6751    e = p + PyUnicode_GET_SIZE(self);
6752    cased = 0;
6753    previous_is_cased = 0;
6754    for (; p < e; p++) {
6755	register const Py_UNICODE ch = *p;
6756
6757	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6758	    if (previous_is_cased)
6759		return PyBool_FromLong(0);
6760	    previous_is_cased = 1;
6761	    cased = 1;
6762	}
6763	else if (Py_UNICODE_ISLOWER(ch)) {
6764	    if (!previous_is_cased)
6765		return PyBool_FromLong(0);
6766	    previous_is_cased = 1;
6767	    cased = 1;
6768	}
6769	else
6770	    previous_is_cased = 0;
6771    }
6772    return PyBool_FromLong(cased);
6773}
6774
6775PyDoc_STRVAR(isspace__doc__,
6776"S.isspace() -> bool\n\
6777\n\
6778Return True if all characters in S are whitespace\n\
6779and there is at least one character in S, False otherwise.");
6780
6781static PyObject*
6782unicode_isspace(PyUnicodeObject *self)
6783{
6784    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6785    register const Py_UNICODE *e;
6786
6787    /* Shortcut for single character strings */
6788    if (PyUnicode_GET_SIZE(self) == 1 &&
6789	Py_UNICODE_ISSPACE(*p))
6790	return PyBool_FromLong(1);
6791
6792    /* Special case for empty strings */
6793    if (PyUnicode_GET_SIZE(self) == 0)
6794	return PyBool_FromLong(0);
6795
6796    e = p + PyUnicode_GET_SIZE(self);
6797    for (; p < e; p++) {
6798	if (!Py_UNICODE_ISSPACE(*p))
6799	    return PyBool_FromLong(0);
6800    }
6801    return PyBool_FromLong(1);
6802}
6803
6804PyDoc_STRVAR(isalpha__doc__,
6805"S.isalpha() -> bool\n\
6806\n\
6807Return True if all characters in S are alphabetic\n\
6808and there is at least one character in S, False otherwise.");
6809
6810static PyObject*
6811unicode_isalpha(PyUnicodeObject *self)
6812{
6813    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6814    register const Py_UNICODE *e;
6815
6816    /* Shortcut for single character strings */
6817    if (PyUnicode_GET_SIZE(self) == 1 &&
6818	Py_UNICODE_ISALPHA(*p))
6819	return PyBool_FromLong(1);
6820
6821    /* Special case for empty strings */
6822    if (PyUnicode_GET_SIZE(self) == 0)
6823	return PyBool_FromLong(0);
6824
6825    e = p + PyUnicode_GET_SIZE(self);
6826    for (; p < e; p++) {
6827	if (!Py_UNICODE_ISALPHA(*p))
6828	    return PyBool_FromLong(0);
6829    }
6830    return PyBool_FromLong(1);
6831}
6832
6833PyDoc_STRVAR(isalnum__doc__,
6834"S.isalnum() -> bool\n\
6835\n\
6836Return True if all characters in S are alphanumeric\n\
6837and there is at least one character in S, False otherwise.");
6838
6839static PyObject*
6840unicode_isalnum(PyUnicodeObject *self)
6841{
6842    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6843    register const Py_UNICODE *e;
6844
6845    /* Shortcut for single character strings */
6846    if (PyUnicode_GET_SIZE(self) == 1 &&
6847	Py_UNICODE_ISALNUM(*p))
6848	return PyBool_FromLong(1);
6849
6850    /* Special case for empty strings */
6851    if (PyUnicode_GET_SIZE(self) == 0)
6852	return PyBool_FromLong(0);
6853
6854    e = p + PyUnicode_GET_SIZE(self);
6855    for (; p < e; p++) {
6856	if (!Py_UNICODE_ISALNUM(*p))
6857	    return PyBool_FromLong(0);
6858    }
6859    return PyBool_FromLong(1);
6860}
6861
6862PyDoc_STRVAR(isdecimal__doc__,
6863"S.isdecimal() -> bool\n\
6864\n\
6865Return True if there are only decimal characters in S,\n\
6866False otherwise.");
6867
6868static PyObject*
6869unicode_isdecimal(PyUnicodeObject *self)
6870{
6871    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6872    register const Py_UNICODE *e;
6873
6874    /* Shortcut for single character strings */
6875    if (PyUnicode_GET_SIZE(self) == 1 &&
6876	Py_UNICODE_ISDECIMAL(*p))
6877	return PyBool_FromLong(1);
6878
6879    /* Special case for empty strings */
6880    if (PyUnicode_GET_SIZE(self) == 0)
6881	return PyBool_FromLong(0);
6882
6883    e = p + PyUnicode_GET_SIZE(self);
6884    for (; p < e; p++) {
6885	if (!Py_UNICODE_ISDECIMAL(*p))
6886	    return PyBool_FromLong(0);
6887    }
6888    return PyBool_FromLong(1);
6889}
6890
6891PyDoc_STRVAR(isdigit__doc__,
6892"S.isdigit() -> bool\n\
6893\n\
6894Return True if all characters in S are digits\n\
6895and there is at least one character in S, False otherwise.");
6896
6897static PyObject*
6898unicode_isdigit(PyUnicodeObject *self)
6899{
6900    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6901    register const Py_UNICODE *e;
6902
6903    /* Shortcut for single character strings */
6904    if (PyUnicode_GET_SIZE(self) == 1 &&
6905	Py_UNICODE_ISDIGIT(*p))
6906	return PyBool_FromLong(1);
6907
6908    /* Special case for empty strings */
6909    if (PyUnicode_GET_SIZE(self) == 0)
6910	return PyBool_FromLong(0);
6911
6912    e = p + PyUnicode_GET_SIZE(self);
6913    for (; p < e; p++) {
6914	if (!Py_UNICODE_ISDIGIT(*p))
6915	    return PyBool_FromLong(0);
6916    }
6917    return PyBool_FromLong(1);
6918}
6919
6920PyDoc_STRVAR(isnumeric__doc__,
6921"S.isnumeric() -> bool\n\
6922\n\
6923Return True if there are only numeric characters in S,\n\
6924False otherwise.");
6925
6926static PyObject*
6927unicode_isnumeric(PyUnicodeObject *self)
6928{
6929    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6930    register const Py_UNICODE *e;
6931
6932    /* Shortcut for single character strings */
6933    if (PyUnicode_GET_SIZE(self) == 1 &&
6934	Py_UNICODE_ISNUMERIC(*p))
6935	return PyBool_FromLong(1);
6936
6937    /* Special case for empty strings */
6938    if (PyUnicode_GET_SIZE(self) == 0)
6939	return PyBool_FromLong(0);
6940
6941    e = p + PyUnicode_GET_SIZE(self);
6942    for (; p < e; p++) {
6943	if (!Py_UNICODE_ISNUMERIC(*p))
6944	    return PyBool_FromLong(0);
6945    }
6946    return PyBool_FromLong(1);
6947}
6948
6949int
6950PyUnicode_IsIdentifier(PyObject *self)
6951{
6952    register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
6953    register const Py_UNICODE *e;
6954
6955    /* Special case for empty strings */
6956    if (PyUnicode_GET_SIZE(self) == 0)
6957	return 0;
6958
6959    /* PEP 3131 says that the first character must be in
6960       XID_Start and subsequent characters in XID_Continue,
6961       and for the ASCII range, the 2.x rules apply (i.e
6962       start with letters and underscore, continue with
6963       letters, digits, underscore). However, given the current
6964       definition of XID_Start and XID_Continue, it is sufficient
6965       to check just for these, except that _ must be allowed
6966       as starting an identifier.  */
6967    if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
6968        return 0;
6969
6970    e = p + PyUnicode_GET_SIZE(self);
6971    for (p++; p < e; p++) {
6972	if (!_PyUnicode_IsXidContinue(*p))
6973	    return 0;
6974    }
6975    return 1;
6976}
6977
6978PyDoc_STRVAR(isidentifier__doc__,
6979"S.isidentifier() -> bool\n\
6980\n\
6981Return True if S is a valid identifier according\n\
6982to the language definition.");
6983
6984static PyObject*
6985unicode_isidentifier(PyObject *self)
6986{
6987    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
6988}
6989
6990PyDoc_STRVAR(join__doc__,
6991"S.join(sequence) -> unicode\n\
6992\n\
6993Return a string which is the concatenation of the strings in the\n\
6994sequence.  The separator between elements is S.");
6995
6996static PyObject*
6997unicode_join(PyObject *self, PyObject *data)
6998{
6999    return PyUnicode_Join(self, data);
7000}
7001
7002static Py_ssize_t
7003unicode_length(PyUnicodeObject *self)
7004{
7005    return self->length;
7006}
7007
7008PyDoc_STRVAR(ljust__doc__,
7009"S.ljust(width[, fillchar]) -> int\n\
7010\n\
7011Return S left justified in a Unicode string of length width. Padding is\n\
7012done using the specified fill character (default is a space).");
7013
7014static PyObject *
7015unicode_ljust(PyUnicodeObject *self, PyObject *args)
7016{
7017    Py_ssize_t width;
7018    Py_UNICODE fillchar = ' ';
7019
7020    if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
7021        return NULL;
7022
7023    if (self->length >= width && PyUnicode_CheckExact(self)) {
7024        Py_INCREF(self);
7025        return (PyObject*) self;
7026    }
7027
7028    return (PyObject*) pad(self, 0, width - self->length, fillchar);
7029}
7030
7031PyDoc_STRVAR(lower__doc__,
7032"S.lower() -> unicode\n\
7033\n\
7034Return a copy of the string S converted to lowercase.");
7035
7036static PyObject*
7037unicode_lower(PyUnicodeObject *self)
7038{
7039    return fixup(self, fixlower);
7040}
7041
/* Which end(s) of the string a strip operation works on.  The values
   double as indexes into stripformat[] below. */
enum {
    LEFTSTRIP = 0,
    RIGHTSTRIP = 1,
    BOTHSTRIP = 2
};

/* PyArg_ParseTuple format strings, indexed by the strip type above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip type: the format string minus its three
   leading characters ("|O:"). */
#define STRIPNAME(i) (stripformat[i]+3)
7050
7051/* externally visible for str.strip(unicode) */
7052PyObject *
7053_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7054{
7055	Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7056	Py_ssize_t len = PyUnicode_GET_SIZE(self);
7057	Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7058	Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7059	Py_ssize_t i, j;
7060
7061        BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7062
7063	i = 0;
7064	if (striptype != RIGHTSTRIP) {
7065            while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7066                i++;
7067            }
7068	}
7069
7070	j = len;
7071	if (striptype != LEFTSTRIP) {
7072            do {
7073                j--;
7074            } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7075            j++;
7076	}
7077
7078	if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7079            Py_INCREF(self);
7080            return (PyObject*)self;
7081	}
7082	else
7083            return PyUnicode_FromUnicode(s+i, j-i);
7084}
7085
7086
7087static PyObject *
7088do_strip(PyUnicodeObject *self, int striptype)
7089{
7090	Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7091	Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
7092
7093	i = 0;
7094	if (striptype != RIGHTSTRIP) {
7095		while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7096			i++;
7097		}
7098	}
7099
7100	j = len;
7101	if (striptype != LEFTSTRIP) {
7102		do {
7103			j--;
7104		} while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7105		j++;
7106	}
7107
7108	if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7109		Py_INCREF(self);
7110		return (PyObject*)self;
7111	}
7112	else
7113		return PyUnicode_FromUnicode(s+i, j-i);
7114}
7115
7116
7117static PyObject *
7118do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7119{
7120	PyObject *sep = NULL;
7121
7122	if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7123		return NULL;
7124
7125	if (sep != NULL && sep != Py_None) {
7126		if (PyUnicode_Check(sep))
7127			return _PyUnicode_XStrip(self, striptype, sep);
7128		else if (PyString_Check(sep)) {
7129			PyObject *res;
7130			sep = PyUnicode_FromObject(sep);
7131			if (sep==NULL)
7132				return NULL;
7133			res = _PyUnicode_XStrip(self, striptype, sep);
7134			Py_DECREF(sep);
7135			return res;
7136		}
7137		else {
7138			PyErr_Format(PyExc_TypeError,
7139				     "%s arg must be None, unicode or str",
7140				     STRIPNAME(striptype));
7141			return NULL;
7142		}
7143	}
7144
7145	return do_strip(self, striptype);
7146}
7147
7148
7149PyDoc_STRVAR(strip__doc__,
7150"S.strip([chars]) -> unicode\n\
7151\n\
7152Return a copy of the string S with leading and trailing\n\
7153whitespace removed.\n\
7154If chars is given and not None, remove characters in chars instead.\n\
7155If chars is a str, it will be converted to unicode before stripping");
7156
7157static PyObject *
7158unicode_strip(PyUnicodeObject *self, PyObject *args)
7159{
7160	if (PyTuple_GET_SIZE(args) == 0)
7161		return do_strip(self, BOTHSTRIP); /* Common case */
7162	else
7163		return do_argstrip(self, BOTHSTRIP, args);
7164}
7165
7166
7167PyDoc_STRVAR(lstrip__doc__,
7168"S.lstrip([chars]) -> unicode\n\
7169\n\
7170Return a copy of the string S with leading whitespace removed.\n\
7171If chars is given and not None, remove characters in chars instead.\n\
7172If chars is a str, it will be converted to unicode before stripping");
7173
7174static PyObject *
7175unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7176{
7177	if (PyTuple_GET_SIZE(args) == 0)
7178		return do_strip(self, LEFTSTRIP); /* Common case */
7179	else
7180		return do_argstrip(self, LEFTSTRIP, args);
7181}
7182
7183
7184PyDoc_STRVAR(rstrip__doc__,
7185"S.rstrip([chars]) -> unicode\n\
7186\n\
7187Return a copy of the string S with trailing whitespace removed.\n\
7188If chars is given and not None, remove characters in chars instead.\n\
7189If chars is a str, it will be converted to unicode before stripping");
7190
7191static PyObject *
7192unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7193{
7194	if (PyTuple_GET_SIZE(args) == 0)
7195		return do_strip(self, RIGHTSTRIP); /* Common case */
7196	else
7197		return do_argstrip(self, RIGHTSTRIP, args);
7198}
7199
7200
7201static PyObject*
7202unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
7203{
7204    PyUnicodeObject *u;
7205    Py_UNICODE *p;
7206    Py_ssize_t nchars;
7207    size_t nbytes;
7208
7209    if (len < 0)
7210        len = 0;
7211
7212    if (len == 1 && PyUnicode_CheckExact(str)) {
7213        /* no repeat, return original string */
7214        Py_INCREF(str);
7215        return (PyObject*) str;
7216    }
7217
7218    /* ensure # of chars needed doesn't overflow int and # of bytes
7219     * needed doesn't overflow size_t
7220     */
7221    nchars = len * str->length;
7222    if (len && nchars / len != str->length) {
7223        PyErr_SetString(PyExc_OverflowError,
7224                        "repeated string is too long");
7225        return NULL;
7226    }
7227    nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7228    if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7229        PyErr_SetString(PyExc_OverflowError,
7230                        "repeated string is too long");
7231        return NULL;
7232    }
7233    u = _PyUnicode_New(nchars);
7234    if (!u)
7235        return NULL;
7236
7237    p = u->str;
7238
7239    if (str->length == 1 && len > 0) {
7240        Py_UNICODE_FILL(p, str->str[0], len);
7241    } else {
7242	Py_ssize_t done = 0; /* number of characters copied this far */
7243	if (done < nchars) {
7244            Py_UNICODE_COPY(p, str->str, str->length);
7245            done = str->length;
7246	}
7247	while (done < nchars) {
7248            int n = (done <= nchars-done) ? done : nchars-done;
7249            Py_UNICODE_COPY(p+done, p, n);
7250            done += n;
7251	}
7252    }
7253
7254    return (PyObject*) u;
7255}
7256
7257PyObject *PyUnicode_Replace(PyObject *obj,
7258			    PyObject *subobj,
7259			    PyObject *replobj,
7260			    Py_ssize_t maxcount)
7261{
7262    PyObject *self;
7263    PyObject *str1;
7264    PyObject *str2;
7265    PyObject *result;
7266
7267    self = PyUnicode_FromObject(obj);
7268    if (self == NULL)
7269	return NULL;
7270    str1 = PyUnicode_FromObject(subobj);
7271    if (str1 == NULL) {
7272	Py_DECREF(self);
7273	return NULL;
7274    }
7275    str2 = PyUnicode_FromObject(replobj);
7276    if (str2 == NULL) {
7277	Py_DECREF(self);
7278	Py_DECREF(str1);
7279	return NULL;
7280    }
7281    result = replace((PyUnicodeObject *)self,
7282		     (PyUnicodeObject *)str1,
7283		     (PyUnicodeObject *)str2,
7284		     maxcount);
7285    Py_DECREF(self);
7286    Py_DECREF(str1);
7287    Py_DECREF(str2);
7288    return result;
7289}
7290
7291PyDoc_STRVAR(replace__doc__,
7292"S.replace (old, new[, maxsplit]) -> unicode\n\
7293\n\
7294Return a copy of S with all occurrences of substring\n\
7295old replaced by new.  If the optional argument maxsplit is\n\
7296given, only the first maxsplit occurrences are replaced.");
7297
7298static PyObject*
7299unicode_replace(PyUnicodeObject *self, PyObject *args)
7300{
7301    PyUnicodeObject *str1;
7302    PyUnicodeObject *str2;
7303    Py_ssize_t maxcount = -1;
7304    PyObject *result;
7305
7306    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
7307        return NULL;
7308    str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7309    if (str1 == NULL)
7310	return NULL;
7311    str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
7312    if (str2 == NULL) {
7313	Py_DECREF(str1);
7314	return NULL;
7315    }
7316
7317    result = replace(self, str1, str2, maxcount);
7318
7319    Py_DECREF(str1);
7320    Py_DECREF(str2);
7321    return result;
7322}
7323
7324static
7325PyObject *unicode_repr(PyObject *unicode)
7326{
7327    PyObject *repr;
7328    Py_UNICODE *p;
7329    Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7330    Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7331
7332    /* XXX(nnorwitz): rather than over-allocating, it would be
7333       better to choose a different scheme.  Perhaps scan the
7334       first N-chars of the string and allocate based on that size.
7335    */
7336    /* Initial allocation is based on the longest-possible unichr
7337       escape.
7338
7339       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7340       unichr, so in this case it's the longest unichr escape. In
7341       narrow (UTF-16) builds this is five chars per source unichr
7342       since there are two unichrs in the surrogate pair, so in narrow
7343       (UTF-16) builds it's not the longest unichr escape.
7344
7345       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7346       so in the narrow (UTF-16) build case it's the longest unichr
7347       escape.
7348    */
7349
7350    repr = PyUnicode_FromUnicode(NULL,
7351        2 /* quotes */
7352#ifdef Py_UNICODE_WIDE
7353        + 10*size
7354#else
7355        + 6*size
7356#endif
7357        + 1);
7358    if (repr == NULL)
7359        return NULL;
7360
7361    p = PyUnicode_AS_UNICODE(repr);
7362
7363    /* Add quote */
7364    *p++ = (findchar(s, size, '\'') &&
7365            !findchar(s, size, '"')) ? '"' : '\'';
7366    while (size-- > 0) {
7367        Py_UNICODE ch = *s++;
7368
7369        /* Escape quotes and backslashes */
7370        if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
7371            *p++ = '\\';
7372            *p++ = ch;
7373            continue;
7374        }
7375
7376#ifdef Py_UNICODE_WIDE
7377        /* Map 21-bit characters to '\U00xxxxxx' */
7378        else if (ch >= 0x10000) {
7379            *p++ = '\\';
7380            *p++ = 'U';
7381            *p++ = hexdigits[(ch >> 28) & 0x0000000F];
7382            *p++ = hexdigits[(ch >> 24) & 0x0000000F];
7383            *p++ = hexdigits[(ch >> 20) & 0x0000000F];
7384            *p++ = hexdigits[(ch >> 16) & 0x0000000F];
7385            *p++ = hexdigits[(ch >> 12) & 0x0000000F];
7386            *p++ = hexdigits[(ch >> 8) & 0x0000000F];
7387            *p++ = hexdigits[(ch >> 4) & 0x0000000F];
7388            *p++ = hexdigits[ch & 0x0000000F];
7389	    continue;
7390        }
7391#else
7392	/* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
7393	else if (ch >= 0xD800 && ch < 0xDC00) {
7394	    Py_UNICODE ch2;
7395	    Py_UCS4 ucs;
7396
7397	    ch2 = *s++;
7398	    size--;
7399	    if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7400		ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7401		*p++ = '\\';
7402		*p++ = 'U';
7403		*p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7404		*p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7405		*p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7406		*p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7407		*p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7408		*p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7409		*p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7410		*p++ = hexdigits[ucs & 0x0000000F];
7411		continue;
7412	    }
7413	    /* Fall through: isolated surrogates are copied as-is */
7414	    s--;
7415	    size++;
7416	}
7417#endif
7418
7419        /* Map 16-bit characters to '\uxxxx' */
7420        if (ch >= 256) {
7421            *p++ = '\\';
7422            *p++ = 'u';
7423            *p++ = hexdigits[(ch >> 12) & 0x000F];
7424            *p++ = hexdigits[(ch >> 8) & 0x000F];
7425            *p++ = hexdigits[(ch >> 4) & 0x000F];
7426            *p++ = hexdigits[ch & 0x000F];
7427        }
7428
7429        /* Map special whitespace to '\t', \n', '\r' */
7430        else if (ch == '\t') {
7431            *p++ = '\\';
7432            *p++ = 't';
7433        }
7434        else if (ch == '\n') {
7435            *p++ = '\\';
7436            *p++ = 'n';
7437        }
7438        else if (ch == '\r') {
7439            *p++ = '\\';
7440            *p++ = 'r';
7441        }
7442
7443        /* Map non-printable US ASCII to '\xhh' */
7444        else if (ch < ' ' || ch >= 0x7F) {
7445            *p++ = '\\';
7446            *p++ = 'x';
7447            *p++ = hexdigits[(ch >> 4) & 0x000F];
7448            *p++ = hexdigits[ch & 0x000F];
7449        }
7450
7451        /* Copy everything else as-is */
7452        else
7453            *p++ = (char) ch;
7454    }
7455    /* Add quote */
7456    *p++ = PyUnicode_AS_UNICODE(repr)[0];
7457
7458    *p = '\0';
7459    _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
7460    return repr;
7461}
7462
7463PyDoc_STRVAR(rfind__doc__,
7464"S.rfind(sub [,start [,end]]) -> int\n\
7465\n\
7466Return the highest index in S where substring sub is found,\n\
7467such that sub is contained within s[start:end].  Optional\n\
7468arguments start and end are interpreted as in slice notation.\n\
7469\n\
7470Return -1 on failure.");
7471
7472static PyObject *
7473unicode_rfind(PyUnicodeObject *self, PyObject *args)
7474{
7475    PyObject *substring;
7476    Py_ssize_t start = 0;
7477    Py_ssize_t end = PY_SSIZE_T_MAX;
7478    Py_ssize_t result;
7479
7480    if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
7481		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7482        return NULL;
7483    substring = PyUnicode_FromObject(substring);
7484    if (!substring)
7485	return NULL;
7486
7487    result = stringlib_rfind_slice(
7488        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7489        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7490        start, end
7491        );
7492
7493    Py_DECREF(substring);
7494
7495    return PyInt_FromSsize_t(result);
7496}
7497
7498PyDoc_STRVAR(rindex__doc__,
7499"S.rindex(sub [,start [,end]]) -> int\n\
7500\n\
7501Like S.rfind() but raise ValueError when the substring is not found.");
7502
7503static PyObject *
7504unicode_rindex(PyUnicodeObject *self, PyObject *args)
7505{
7506    PyObject *substring;
7507    Py_ssize_t start = 0;
7508    Py_ssize_t end = PY_SSIZE_T_MAX;
7509    Py_ssize_t result;
7510
7511    if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
7512		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7513        return NULL;
7514    substring = PyUnicode_FromObject(substring);
7515    if (!substring)
7516	return NULL;
7517
7518    result = stringlib_rfind_slice(
7519        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7520        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7521        start, end
7522        );
7523
7524    Py_DECREF(substring);
7525
7526    if (result < 0) {
7527        PyErr_SetString(PyExc_ValueError, "substring not found");
7528        return NULL;
7529    }
7530    return PyInt_FromSsize_t(result);
7531}
7532
7533PyDoc_STRVAR(rjust__doc__,
7534"S.rjust(width[, fillchar]) -> unicode\n\
7535\n\
7536Return S right justified in a Unicode string of length width. Padding is\n\
7537done using the specified fill character (default is a space).");
7538
7539static PyObject *
7540unicode_rjust(PyUnicodeObject *self, PyObject *args)
7541{
7542    Py_ssize_t width;
7543    Py_UNICODE fillchar = ' ';
7544
7545    if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
7546        return NULL;
7547
7548    if (self->length >= width && PyUnicode_CheckExact(self)) {
7549        Py_INCREF(self);
7550        return (PyObject*) self;
7551    }
7552
7553    return (PyObject*) pad(self, width - self->length, 0, fillchar);
7554}
7555
7556PyObject *PyUnicode_Split(PyObject *s,
7557			  PyObject *sep,
7558			  Py_ssize_t maxsplit)
7559{
7560    PyObject *result;
7561
7562    s = PyUnicode_FromObject(s);
7563    if (s == NULL)
7564	return NULL;
7565    if (sep != NULL) {
7566	sep = PyUnicode_FromObject(sep);
7567	if (sep == NULL) {
7568	    Py_DECREF(s);
7569	    return NULL;
7570	}
7571    }
7572
7573    result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7574
7575    Py_DECREF(s);
7576    Py_XDECREF(sep);
7577    return result;
7578}
7579
7580PyDoc_STRVAR(split__doc__,
7581"S.split([sep [,maxsplit]]) -> list of strings\n\
7582\n\
7583Return a list of the words in S, using sep as the\n\
7584delimiter string.  If maxsplit is given, at most maxsplit\n\
7585splits are done. If sep is not specified or is None,\n\
7586any whitespace string is a separator.");
7587
7588static PyObject*
7589unicode_split(PyUnicodeObject *self, PyObject *args)
7590{
7591    PyObject *substring = Py_None;
7592    Py_ssize_t maxcount = -1;
7593
7594    if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
7595        return NULL;
7596
7597    if (substring == Py_None)
7598	return split(self, NULL, maxcount);
7599    else if (PyUnicode_Check(substring))
7600	return split(self, (PyUnicodeObject *)substring, maxcount);
7601    else
7602	return PyUnicode_Split((PyObject *)self, substring, maxcount);
7603}
7604
7605PyObject *
7606PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7607{
7608    PyObject* str_obj;
7609    PyObject* sep_obj;
7610    PyObject* out;
7611
7612    str_obj = PyUnicode_FromObject(str_in);
7613    if (!str_obj)
7614	return NULL;
7615    sep_obj = PyUnicode_FromObject(sep_in);
7616    if (!sep_obj) {
7617        Py_DECREF(str_obj);
7618        return NULL;
7619    }
7620
7621    out = stringlib_partition(
7622        str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7623        sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7624        );
7625
7626    Py_DECREF(sep_obj);
7627    Py_DECREF(str_obj);
7628
7629    return out;
7630}
7631
7632
7633PyObject *
7634PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7635{
7636    PyObject* str_obj;
7637    PyObject* sep_obj;
7638    PyObject* out;
7639
7640    str_obj = PyUnicode_FromObject(str_in);
7641    if (!str_obj)
7642	return NULL;
7643    sep_obj = PyUnicode_FromObject(sep_in);
7644    if (!sep_obj) {
7645        Py_DECREF(str_obj);
7646        return NULL;
7647    }
7648
7649    out = stringlib_rpartition(
7650        str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7651        sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7652        );
7653
7654    Py_DECREF(sep_obj);
7655    Py_DECREF(str_obj);
7656
7657    return out;
7658}
7659
7660PyDoc_STRVAR(partition__doc__,
7661"S.partition(sep) -> (head, sep, tail)\n\
7662\n\
7663Searches for the separator sep in S, and returns the part before it,\n\
7664the separator itself, and the part after it.  If the separator is not\n\
7665found, returns S and two empty strings.");
7666
7667static PyObject*
7668unicode_partition(PyUnicodeObject *self, PyObject *separator)
7669{
7670    return PyUnicode_Partition((PyObject *)self, separator);
7671}
7672
7673PyDoc_STRVAR(rpartition__doc__,
7674"S.rpartition(sep) -> (tail, sep, head)\n\
7675\n\
7676Searches for the separator sep in S, starting at the end of S, and returns\n\
7677the part before it, the separator itself, and the part after it.  If the\n\
7678separator is not found, returns two empty strings and S.");
7679
7680static PyObject*
7681unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7682{
7683    return PyUnicode_RPartition((PyObject *)self, separator);
7684}
7685
7686PyObject *PyUnicode_RSplit(PyObject *s,
7687			   PyObject *sep,
7688			   Py_ssize_t maxsplit)
7689{
7690    PyObject *result;
7691
7692    s = PyUnicode_FromObject(s);
7693    if (s == NULL)
7694	return NULL;
7695    if (sep != NULL) {
7696	sep = PyUnicode_FromObject(sep);
7697	if (sep == NULL) {
7698	    Py_DECREF(s);
7699	    return NULL;
7700	}
7701    }
7702
7703    result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7704
7705    Py_DECREF(s);
7706    Py_XDECREF(sep);
7707    return result;
7708}
7709
7710PyDoc_STRVAR(rsplit__doc__,
7711"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7712\n\
7713Return a list of the words in S, using sep as the\n\
7714delimiter string, starting at the end of the string and\n\
7715working to the front.  If maxsplit is given, at most maxsplit\n\
7716splits are done. If sep is not specified, any whitespace string\n\
7717is a separator.");
7718
7719static PyObject*
7720unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7721{
7722    PyObject *substring = Py_None;
7723    Py_ssize_t maxcount = -1;
7724
7725    if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
7726        return NULL;
7727
7728    if (substring == Py_None)
7729	return rsplit(self, NULL, maxcount);
7730    else if (PyUnicode_Check(substring))
7731	return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7732    else
7733	return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7734}
7735
7736PyDoc_STRVAR(splitlines__doc__,
7737"S.splitlines([keepends]]) -> list of strings\n\
7738\n\
7739Return a list of the lines in S, breaking at line boundaries.\n\
7740Line breaks are not included in the resulting list unless keepends\n\
7741is given and true.");
7742
7743static PyObject*
7744unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7745{
7746    int keepends = 0;
7747
7748    if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
7749        return NULL;
7750
7751    return PyUnicode_Splitlines((PyObject *)self, keepends);
7752}
7753
7754static
7755PyObject *unicode_str(PyObject *self)
7756{
7757    if (PyUnicode_CheckExact(self)) {
7758        Py_INCREF(self);
7759        return self;
7760    } else
7761        /* Subtype -- return genuine unicode string with the same value. */
7762        return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7763                                     PyUnicode_GET_SIZE(self));
7764}
7765
7766PyDoc_STRVAR(swapcase__doc__,
7767"S.swapcase() -> unicode\n\
7768\n\
7769Return a copy of S with uppercase characters converted to lowercase\n\
7770and vice versa.");
7771
7772static PyObject*
7773unicode_swapcase(PyUnicodeObject *self)
7774{
7775    return fixup(self, fixswapcase);
7776}
7777
7778PyDoc_STRVAR(translate__doc__,
7779"S.translate(table) -> unicode\n\
7780\n\
7781Return a copy of the string S, where all characters have been mapped\n\
7782through the given translation table, which must be a mapping of\n\
7783Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7784Unmapped characters are left untouched. Characters mapped to None\n\
7785are deleted.");
7786
7787static PyObject*
7788unicode_translate(PyUnicodeObject *self, PyObject *table)
7789{
7790    return PyUnicode_TranslateCharmap(self->str,
7791				      self->length,
7792				      table,
7793				      "ignore");
7794}
7795
7796PyDoc_STRVAR(upper__doc__,
7797"S.upper() -> unicode\n\
7798\n\
7799Return a copy of S converted to uppercase.");
7800
7801static PyObject*
7802unicode_upper(PyUnicodeObject *self)
7803{
7804    return fixup(self, fixupper);
7805}
7806
7807PyDoc_STRVAR(zfill__doc__,
7808"S.zfill(width) -> unicode\n\
7809\n\
7810Pad a numeric string x with zeros on the left, to fill a field\n\
7811of the specified width. The string x is never truncated.");
7812
7813static PyObject *
7814unicode_zfill(PyUnicodeObject *self, PyObject *args)
7815{
7816    Py_ssize_t fill;
7817    PyUnicodeObject *u;
7818
7819    Py_ssize_t width;
7820    if (!PyArg_ParseTuple(args, "n:zfill", &width))
7821        return NULL;
7822
7823    if (self->length >= width) {
7824        if (PyUnicode_CheckExact(self)) {
7825            Py_INCREF(self);
7826            return (PyObject*) self;
7827        }
7828        else
7829            return PyUnicode_FromUnicode(
7830                PyUnicode_AS_UNICODE(self),
7831                PyUnicode_GET_SIZE(self)
7832            );
7833    }
7834
7835    fill = width - self->length;
7836
7837    u = pad(self, fill, 0, '0');
7838
7839    if (u == NULL)
7840        return NULL;
7841
7842    if (u->str[fill] == '+' || u->str[fill] == '-') {
7843        /* move sign to beginning of string */
7844        u->str[0] = u->str[fill];
7845        u->str[fill] = '0';
7846    }
7847
7848    return (PyObject*) u;
7849}
7850
#if 0
/* Compiled out.  Debugging helper that would return the value of
   unicode_freelist_size as an int object; its method-table entry in
   unicode_methods is disabled with "#if 0" as well. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyInt_FromLong(unicode_freelist_size);
}
#endif
7858
7859PyDoc_STRVAR(startswith__doc__,
7860"S.startswith(prefix[, start[, end]]) -> bool\n\
7861\n\
7862Return True if S starts with the specified prefix, False otherwise.\n\
7863With optional start, test S beginning at that position.\n\
7864With optional end, stop comparing S at that position.\n\
7865prefix can also be a tuple of strings to try.");
7866
7867static PyObject *
7868unicode_startswith(PyUnicodeObject *self,
7869		   PyObject *args)
7870{
7871    PyObject *subobj;
7872    PyUnicodeObject *substring;
7873    Py_ssize_t start = 0;
7874    Py_ssize_t end = PY_SSIZE_T_MAX;
7875    int result;
7876
7877    if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
7878		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7879	return NULL;
7880    if (PyTuple_Check(subobj)) {
7881        Py_ssize_t i;
7882        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7883            substring = (PyUnicodeObject *)PyUnicode_FromObject(
7884                            PyTuple_GET_ITEM(subobj, i));
7885            if (substring == NULL)
7886                return NULL;
7887            result = tailmatch(self, substring, start, end, -1);
7888            Py_DECREF(substring);
7889            if (result) {
7890                Py_RETURN_TRUE;
7891            }
7892        }
7893        /* nothing matched */
7894        Py_RETURN_FALSE;
7895    }
7896    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7897    if (substring == NULL)
7898         return NULL;
7899    result = tailmatch(self, substring, start, end, -1);
7900    Py_DECREF(substring);
7901    return PyBool_FromLong(result);
7902}
7903
7904
7905PyDoc_STRVAR(endswith__doc__,
7906"S.endswith(suffix[, start[, end]]) -> bool\n\
7907\n\
7908Return True if S ends with the specified suffix, False otherwise.\n\
7909With optional start, test S beginning at that position.\n\
7910With optional end, stop comparing S at that position.\n\
7911suffix can also be a tuple of strings to try.");
7912
7913static PyObject *
7914unicode_endswith(PyUnicodeObject *self,
7915		 PyObject *args)
7916{
7917    PyObject *subobj;
7918    PyUnicodeObject *substring;
7919    Py_ssize_t start = 0;
7920    Py_ssize_t end = PY_SSIZE_T_MAX;
7921    int result;
7922
7923    if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7924        _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7925	return NULL;
7926    if (PyTuple_Check(subobj)) {
7927        Py_ssize_t i;
7928        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7929            substring = (PyUnicodeObject *)PyUnicode_FromObject(
7930                            PyTuple_GET_ITEM(subobj, i));
7931            if (substring == NULL)
7932            return NULL;
7933            result = tailmatch(self, substring, start, end, +1);
7934            Py_DECREF(substring);
7935            if (result) {
7936                Py_RETURN_TRUE;
7937            }
7938        }
7939        Py_RETURN_FALSE;
7940    }
7941    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7942    if (substring == NULL)
7943    return NULL;
7944
7945    result = tailmatch(self, substring, start, end, +1);
7946    Py_DECREF(substring);
7947    return PyBool_FromLong(result);
7948}
7949
7950#include "stringlib/string_format.h"
7951
/* Docstring for the "format" entry of unicode_methods.  Only the
   signature line is provided; the description is empty. */
PyDoc_STRVAR(format__doc__,
"S.format(*args, **kwargs) -> unicode\n\
\n\
");

/* Docstring for the "__format__" entry of unicode_methods.  Only the
   signature line is provided; the description is empty. */
PyDoc_STRVAR(p_format__doc__,
"S.__format__(format_spec) -> unicode\n\
\n\
");
7961
7962static PyObject *
7963unicode_getnewargs(PyUnicodeObject *v)
7964{
7965	return Py_BuildValue("(u#)", v->str, v->length);
7966}
7967
7968
/* Method table for the unicode type.  Each entry gives the method
   name, the C function implementing it, the calling-convention flags
   and (where one exists) the docstring.  The table is terminated by
   the {NULL, NULL} sentinel entry. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"__format__", (PyCFunction) unicode_unicode__format__, METH_VARARGS, p_format__doc__},
    /* The two _formatter_* entries supply no docstring. */
    {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
    {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
#endif

    /* No docstring supplied for __getnewargs__. */
    {"__getnewargs__",	(PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
8029
8030static PyObject *
8031unicode_mod(PyObject *v, PyObject *w)
8032{
8033       if (!PyUnicode_Check(v)) {
8034               Py_INCREF(Py_NotImplemented);
8035               return Py_NotImplemented;
8036       }
8037       return PyUnicode_Format(v, w);
8038}
8039
/* Number protocol table for the unicode type.  Only nb_remainder is
   filled in (with unicode_mod); the slots listed before it are 0 and
   the ones after it are left to default initialization. */
static PyNumberMethods unicode_as_number = {
	0,				/*nb_add*/
	0,				/*nb_subtract*/
	0,				/*nb_multiply*/
	unicode_mod,			/*nb_remainder*/
};
8046
/* Sequence protocol table for the unicode type.  Length, concatenation,
   repetition, item access and containment are provided; the slice and
   assignment slots are 0. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length, 		/* sq_length */
    PyUnicode_Concat,		 	/* sq_concat */
    (ssizeargfunc) unicode_repeat, 	/* sq_repeat */
    (ssizeargfunc) unicode_getitem, 	/* sq_item */
    0,				 	/* sq_slice */
    0, 					/* sq_ass_item */
    0, 					/* sq_ass_slice */
    PyUnicode_Contains, 		/* sq_contains */
};
8057
8058static PyObject*
8059unicode_subscript(PyUnicodeObject* self, PyObject* item)
8060{
8061    if (PyIndex_Check(item)) {
8062        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
8063        if (i == -1 && PyErr_Occurred())
8064            return NULL;
8065        if (i < 0)
8066            i += PyUnicode_GET_SIZE(self);
8067        return unicode_getitem(self, i);
8068    } else if (PySlice_Check(item)) {
8069        Py_ssize_t start, stop, step, slicelength, cur, i;
8070        Py_UNICODE* source_buf;
8071        Py_UNICODE* result_buf;
8072        PyObject* result;
8073
8074        if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
8075				 &start, &stop, &step, &slicelength) < 0) {
8076            return NULL;
8077        }
8078
8079        if (slicelength <= 0) {
8080            return PyUnicode_FromUnicode(NULL, 0);
8081        } else if (start == 0 && step == 1 && slicelength == self->length &&
8082                   PyUnicode_CheckExact(self)) {
8083            Py_INCREF(self);
8084            return (PyObject *)self;
8085        } else if (step == 1) {
8086            return PyUnicode_FromUnicode(self->str + start, slicelength);
8087        } else {
8088            source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
8089            result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
8090                                                    sizeof(Py_UNICODE));
8091
8092	    if (result_buf == NULL)
8093		    return PyErr_NoMemory();
8094
8095            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8096                result_buf[i] = source_buf[cur];
8097            }
8098
8099            result = PyUnicode_FromUnicode(result_buf, slicelength);
8100            PyMem_FREE(result_buf);
8101            return result;
8102        }
8103    } else {
8104        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8105        return NULL;
8106    }
8107}
8108
/* Mapping protocol table for the unicode type: length and subscript
   (index or slice, see unicode_subscript) are provided; there is no
   subscript assignment. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,		/* mp_length */
    (binaryfunc)unicode_subscript,	/* mp_subscript */
    (objobjargproc)0,			/* mp_ass_subscript */
};
8114
8115
8116/* Helpers for PyUnicode_Format() */
8117
8118static PyObject *
8119getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
8120{
8121    Py_ssize_t argidx = *p_argidx;
8122    if (argidx < arglen) {
8123	(*p_argidx)++;
8124	if (arglen < 0)
8125	    return args;
8126	else
8127	    return PyTuple_GetItem(args, argidx);
8128    }
8129    PyErr_SetString(PyExc_TypeError,
8130		    "not enough arguments for format string");
8131    return NULL;
8132}
8133
/* Bit flags for the "flags" argument of the formatting helpers below.
   Within this part of the file only F_ALT is tested: formatfloat() and
   formatint() use it to select the '#' alternate form.  The other bits
   are presumably set and consumed by PyUnicode_Format() -- confirm
   there. */
#define F_LJUST (1<<0)
#define F_SIGN	(1<<1)
#define F_BLANK (1<<2)
#define F_ALT	(1<<3)
#define F_ZERO	(1<<4)
8139
8140static Py_ssize_t
8141strtounicode(Py_UNICODE *buffer, const char *charbuffer)
8142{
8143    register Py_ssize_t i;
8144    Py_ssize_t len = strlen(charbuffer);
8145    for (i = len - 1; i >= 0; i--)
8146	buffer[i] = (Py_UNICODE) charbuffer[i];
8147
8148    return len;
8149}
8150
8151static int
8152doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8153{
8154    Py_ssize_t result;
8155
8156    PyOS_ascii_formatd((char *)buffer, len, format, x);
8157    result = strtounicode(buffer, (char *)buffer);
8158    return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8159}
8160
8161static int
8162longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8163{
8164    Py_ssize_t result;
8165
8166    PyOS_snprintf((char *)buffer, len, format, x);
8167    result = strtounicode(buffer, (char *)buffer);
8168    return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8169}
8170
8171/* XXX To save some code duplication, formatfloat/long/int could have been
8172   shared with stringobject.c, converting from 8-bit to Unicode after the
8173   formatting is done. */
8174
8175static int
8176formatfloat(Py_UNICODE *buf,
8177	    size_t buflen,
8178	    int flags,
8179	    int prec,
8180	    int type,
8181	    PyObject *v)
8182{
8183    /* fmt = '%#.' + `prec` + `type`
8184       worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
8185    char fmt[20];
8186    double x;
8187
8188    x = PyFloat_AsDouble(v);
8189    if (x == -1.0 && PyErr_Occurred())
8190	return -1;
8191    if (prec < 0)
8192	prec = 6;
8193    if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8194	type = 'g';
8195    /* Worst case length calc to ensure no buffer overrun:
8196
8197       'g' formats:
8198	 fmt = %#.<prec>g
8199	 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8200	    for any double rep.)
8201	 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8202
8203       'f' formats:
8204	 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8205	 len = 1 + 50 + 1 + prec = 52 + prec
8206
8207       If prec=0 the effective precision is 1 (the leading digit is
8208       always given), therefore increase the length by one.
8209
8210    */
8211    if (((type == 'g' || type == 'G') &&
8212          buflen <= (size_t)10 + (size_t)prec) ||
8213	(type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
8214	PyErr_SetString(PyExc_OverflowError,
8215			"formatted float is too long (precision too large?)");
8216	return -1;
8217    }
8218    PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8219		  (flags&F_ALT) ? "#" : "",
8220		  prec, type);
8221    return doubletounicode(buf, buflen, fmt, x);
8222}
8223
8224static PyObject*
8225formatlong(PyObject *val, int flags, int prec, int type)
8226{
8227	char *buf;
8228	int len;
8229	PyObject *str; /* temporary string object. */
8230	PyObject *result;
8231
8232	str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8233	if (!str)
8234		return NULL;
8235	result = PyUnicode_FromStringAndSize(buf, len);
8236	Py_DECREF(str);
8237	return result;
8238}
8239
8240static int
8241formatint(Py_UNICODE *buf,
8242	  size_t buflen,
8243	  int flags,
8244	  int prec,
8245	  int type,
8246	  PyObject *v)
8247{
8248    /* fmt = '%#.' + `prec` + 'l' + `type`
8249     * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8250     *                     + 1 + 1
8251     *                   = 24
8252     */
8253    char fmt[64]; /* plenty big enough! */
8254    char *sign;
8255    long x;
8256
8257    x = PyInt_AsLong(v);
8258    if (x == -1 && PyErr_Occurred())
8259        return -1;
8260    if (x < 0 && type == 'u') {
8261        type = 'd';
8262    }
8263    if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8264        sign = "-";
8265    else
8266        sign = "";
8267    if (prec < 0)
8268        prec = 1;
8269
8270    /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8271     * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
8272     */
8273    if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
8274        PyErr_SetString(PyExc_OverflowError,
8275    	        "formatted integer is too long (precision too large?)");
8276        return -1;
8277    }
8278
8279    if ((flags & F_ALT) &&
8280        (type == 'x' || type == 'X' || type == 'o')) {
8281        /* When converting under %#o, %#x or %#X, there are a number
8282         * of issues that cause pain:
8283	 * - for %#o, we want a different base marker than C
8284         * - when 0 is being converted, the C standard leaves off
8285         *   the '0x' or '0X', which is inconsistent with other
8286         *   %#x/%#X conversions and inconsistent with Python's
8287         *   hex() function
8288         * - there are platforms that violate the standard and
8289         *   convert 0 with the '0x' or '0X'
8290         *   (Metrowerks, Compaq Tru64)
8291         * - there are platforms that give '0x' when converting
8292         *   under %#X, but convert 0 in accordance with the
8293         *   standard (OS/2 EMX)
8294         *
8295         * We can achieve the desired consistency by inserting our
8296         * own '0x' or '0X' prefix, and substituting %x/%X in place
8297         * of %#x/%#X.
8298         *
8299         * Note that this is the same approach as used in
8300         * formatint() in stringobject.c
8301         */
8302        PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8303                      sign, type, prec, type);
8304    }
8305    else {
8306        PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8307                      sign, (flags&F_ALT) ? "#" : "",
8308                      prec, type);
8309    }
8310    if (sign[0])
8311        return longtounicode(buf, buflen, fmt, -x);
8312    else
8313        return longtounicode(buf, buflen, fmt, x);
8314}
8315
8316static int
8317formatchar(Py_UNICODE *buf,
8318           size_t buflen,
8319           PyObject *v)
8320{
8321    /* presume that the buffer is at least 2 characters long */
8322    if (PyUnicode_Check(v)) {
8323	if (PyUnicode_GET_SIZE(v) != 1)
8324	    goto onError;
8325	buf[0] = PyUnicode_AS_UNICODE(v)[0];
8326    }
8327
8328    else if (PyString_Check(v)) {
8329	if (PyString_GET_SIZE(v) != 1)
8330	    goto onError;
8331	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8332    }
8333
8334    else {
8335	/* Integer input truncated to a character */
8336        long x;
8337	x = PyInt_AsLong(v);
8338	if (x == -1 && PyErr_Occurred())
8339	    goto onError;
8340#ifdef Py_UNICODE_WIDE
8341	if (x < 0 || x > 0x10ffff) {
8342	    PyErr_SetString(PyExc_OverflowError,
8343			    "%c arg not in range(0x110000) "
8344			    "(wide Python build)");
8345	    return -1;
8346	}
8347#else
8348	if (x < 0 || x > 0xffff) {
8349	    PyErr_SetString(PyExc_OverflowError,
8350			    "%c arg not in range(0x10000) "
8351			    "(narrow Python build)");
8352	    return -1;
8353	}
8354#endif
8355	buf[0] = (Py_UNICODE) x;
8356    }
8357    buf[1] = '\0';
8358    return 1;
8359
8360 onError:
8361    PyErr_SetString(PyExc_TypeError,
8362		    "%c requires int or char");
8363    return -1;
8364}
8365
8366/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8367
8368   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
8369   chars are formatted. XXX This is a magic number. Each formatting
8370   routine does bounds checking to ensure no overflow, but a better
8371   solution may be to malloc a buffer of appropriate size for each
8372   format. For now, the current solution is sufficient.
8373*/
8374#define FORMATBUFLEN (size_t)120
8375
8376PyObject *PyUnicode_Format(PyObject *format,
8377			   PyObject *args)
8378{
8379    Py_UNICODE *fmt, *res;
8380    Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
8381    int args_owned = 0;
8382    PyUnicodeObject *result = NULL;
8383    PyObject *dict = NULL;
8384    PyObject *uformat;
8385
8386    if (format == NULL || args == NULL) {
8387	PyErr_BadInternalCall();
8388	return NULL;
8389    }
8390    uformat = PyUnicode_FromObject(format);
8391    if (uformat == NULL)
8392	return NULL;
8393    fmt = PyUnicode_AS_UNICODE(uformat);
8394    fmtcnt = PyUnicode_GET_SIZE(uformat);
8395
8396    reslen = rescnt = fmtcnt + 100;
8397    result = _PyUnicode_New(reslen);
8398    if (result == NULL)
8399	goto onError;
8400    res = PyUnicode_AS_UNICODE(result);
8401
8402    if (PyTuple_Check(args)) {
8403	arglen = PyTuple_Size(args);
8404	argidx = 0;
8405    }
8406    else {
8407	arglen = -1;
8408	argidx = -2;
8409    }
8410    if (Py_Type(args)->tp_as_mapping && !PyTuple_Check(args) &&
8411        !PyObject_TypeCheck(args, &PyBaseString_Type))
8412	dict = args;
8413
8414    while (--fmtcnt >= 0) {
8415	if (*fmt != '%') {
8416	    if (--rescnt < 0) {
8417		rescnt = fmtcnt + 100;
8418		reslen += rescnt;
8419		if (_PyUnicode_Resize(&result, reslen) < 0)
8420		    goto onError;
8421		res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8422		--rescnt;
8423	    }
8424	    *res++ = *fmt++;
8425	}
8426	else {
8427	    /* Got a format specifier */
8428	    int flags = 0;
8429	    Py_ssize_t width = -1;
8430	    int prec = -1;
8431	    Py_UNICODE c = '\0';
8432	    Py_UNICODE fill;
8433	    PyObject *v = NULL;
8434	    PyObject *temp = NULL;
8435	    Py_UNICODE *pbuf;
8436	    Py_UNICODE sign;
8437	    Py_ssize_t len;
8438	    Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
8439
8440	    fmt++;
8441	    if (*fmt == '(') {
8442		Py_UNICODE *keystart;
8443		Py_ssize_t keylen;
8444		PyObject *key;
8445		int pcount = 1;
8446
8447		if (dict == NULL) {
8448		    PyErr_SetString(PyExc_TypeError,
8449				    "format requires a mapping");
8450		    goto onError;
8451		}
8452		++fmt;
8453		--fmtcnt;
8454		keystart = fmt;
8455		/* Skip over balanced parentheses */
8456		while (pcount > 0 && --fmtcnt >= 0) {
8457		    if (*fmt == ')')
8458			--pcount;
8459		    else if (*fmt == '(')
8460			++pcount;
8461		    fmt++;
8462		}
8463		keylen = fmt - keystart - 1;
8464		if (fmtcnt < 0 || pcount > 0) {
8465		    PyErr_SetString(PyExc_ValueError,
8466				    "incomplete format key");
8467		    goto onError;
8468		}
8469#if 0
8470		/* keys are converted to strings using UTF-8 and
8471		   then looked up since Python uses strings to hold
8472		   variables names etc. in its namespaces and we
8473		   wouldn't want to break common idioms. */
8474		key = PyUnicode_EncodeUTF8(keystart,
8475					   keylen,
8476					   NULL);
8477#else
8478		key = PyUnicode_FromUnicode(keystart, keylen);
8479#endif
8480		if (key == NULL)
8481		    goto onError;
8482		if (args_owned) {
8483		    Py_DECREF(args);
8484		    args_owned = 0;
8485		}
8486		args = PyObject_GetItem(dict, key);
8487		Py_DECREF(key);
8488		if (args == NULL) {
8489		    goto onError;
8490		}
8491		args_owned = 1;
8492		arglen = -1;
8493		argidx = -2;
8494	    }
8495	    while (--fmtcnt >= 0) {
8496		switch (c = *fmt++) {
8497		case '-': flags |= F_LJUST; continue;
8498		case '+': flags |= F_SIGN; continue;
8499		case ' ': flags |= F_BLANK; continue;
8500		case '#': flags |= F_ALT; continue;
8501		case '0': flags |= F_ZERO; continue;
8502		}
8503		break;
8504	    }
8505	    if (c == '*') {
8506		v = getnextarg(args, arglen, &argidx);
8507		if (v == NULL)
8508		    goto onError;
8509		if (!PyInt_Check(v)) {
8510		    PyErr_SetString(PyExc_TypeError,
8511				    "* wants int");
8512		    goto onError;
8513		}
8514		width = PyInt_AsLong(v);
8515		if (width == -1 && PyErr_Occurred())
8516			goto onError;
8517		if (width < 0) {
8518		    flags |= F_LJUST;
8519		    width = -width;
8520		}
8521		if (--fmtcnt >= 0)
8522		    c = *fmt++;
8523	    }
8524	    else if (c >= '0' && c <= '9') {
8525		width = c - '0';
8526		while (--fmtcnt >= 0) {
8527		    c = *fmt++;
8528		    if (c < '0' || c > '9')
8529			break;
8530		    if ((width*10) / 10 != width) {
8531			PyErr_SetString(PyExc_ValueError,
8532					"width too big");
8533			goto onError;
8534		    }
8535		    width = width*10 + (c - '0');
8536		}
8537	    }
8538	    if (c == '.') {
8539		prec = 0;
8540		if (--fmtcnt >= 0)
8541		    c = *fmt++;
8542		if (c == '*') {
8543		    v = getnextarg(args, arglen, &argidx);
8544		    if (v == NULL)
8545			goto onError;
8546		    if (!PyInt_Check(v)) {
8547			PyErr_SetString(PyExc_TypeError,
8548					"* wants int");
8549			goto onError;
8550		    }
8551		    prec = PyInt_AsLong(v);
8552		    if (prec == -1 && PyErr_Occurred())
8553			goto onError;
8554		    if (prec < 0)
8555			prec = 0;
8556		    if (--fmtcnt >= 0)
8557			c = *fmt++;
8558		}
8559		else if (c >= '0' && c <= '9') {
8560		    prec = c - '0';
8561		    while (--fmtcnt >= 0) {
8562			c = Py_CHARMASK(*fmt++);
8563			if (c < '0' || c > '9')
8564			    break;
8565			if ((prec*10) / 10 != prec) {
8566			    PyErr_SetString(PyExc_ValueError,
8567					    "prec too big");
8568			    goto onError;
8569			}
8570			prec = prec*10 + (c - '0');
8571		    }
8572		}
8573	    } /* prec */
8574	    if (fmtcnt >= 0) {
8575		if (c == 'h' || c == 'l' || c == 'L') {
8576		    if (--fmtcnt >= 0)
8577			c = *fmt++;
8578		}
8579	    }
8580	    if (fmtcnt < 0) {
8581		PyErr_SetString(PyExc_ValueError,
8582				"incomplete format");
8583		goto onError;
8584	    }
8585	    if (c != '%') {
8586		v = getnextarg(args, arglen, &argidx);
8587		if (v == NULL)
8588		    goto onError;
8589	    }
8590	    sign = 0;
8591	    fill = ' ';
8592	    switch (c) {
8593
8594	    case '%':
8595		pbuf = formatbuf;
8596		/* presume that buffer length is at least 1 */
8597		pbuf[0] = '%';
8598		len = 1;
8599		break;
8600
8601	    case 's':
8602	    case 'r':
8603		if (PyUnicode_Check(v) && c == 's') {
8604		    temp = v;
8605		    Py_INCREF(temp);
8606		}
8607		else {
8608		    PyObject *unicode;
8609		    if (c == 's')
8610			temp = PyObject_Unicode(v);
8611		    else
8612			temp = PyObject_Repr(v);
8613		    if (temp == NULL)
8614			goto onError;
8615                    if (PyUnicode_Check(temp))
8616                        /* nothing to do */;
8617                    else if (PyString_Check(temp)) {
8618                        /* convert to string to Unicode */
8619		        unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8620						   PyString_GET_SIZE(temp),
8621						   NULL,
8622						   "strict");
8623		        Py_DECREF(temp);
8624		        temp = unicode;
8625		        if (temp == NULL)
8626			    goto onError;
8627		    }
8628		    else {
8629			Py_DECREF(temp);
8630			PyErr_SetString(PyExc_TypeError,
8631					"%s argument has non-string str()");
8632			goto onError;
8633		    }
8634		}
8635		pbuf = PyUnicode_AS_UNICODE(temp);
8636		len = PyUnicode_GET_SIZE(temp);
8637		if (prec >= 0 && len > prec)
8638		    len = prec;
8639		break;
8640
8641	    case 'i':
8642	    case 'd':
8643	    case 'u':
8644	    case 'o':
8645	    case 'x':
8646	    case 'X':
8647		if (c == 'i')
8648		    c = 'd';
8649		if (PyLong_Check(v)) {
8650		    temp = formatlong(v, flags, prec, c);
8651		    if (!temp)
8652			goto onError;
8653		    pbuf = PyUnicode_AS_UNICODE(temp);
8654		    len = PyUnicode_GET_SIZE(temp);
8655		    sign = 1;
8656		}
8657		else {
8658		    pbuf = formatbuf;
8659		    len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8660				    flags, prec, c, v);
8661		    if (len < 0)
8662			goto onError;
8663		    sign = 1;
8664		}
8665		if (flags & F_ZERO)
8666		    fill = '0';
8667		break;
8668
8669	    case 'e':
8670	    case 'E':
8671	    case 'f':
8672	    case 'F':
8673	    case 'g':
8674	    case 'G':
8675		if (c == 'F')
8676			c = 'f';
8677		pbuf = formatbuf;
8678		len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8679			flags, prec, c, v);
8680		if (len < 0)
8681		    goto onError;
8682		sign = 1;
8683		if (flags & F_ZERO)
8684		    fill = '0';
8685		break;
8686
8687	    case 'c':
8688		pbuf = formatbuf;
8689		len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8690		if (len < 0)
8691		    goto onError;
8692		break;
8693
8694	    default:
8695		PyErr_Format(PyExc_ValueError,
8696			     "unsupported format character '%c' (0x%x) "
8697			     "at index %zd",
8698			     (31<=c && c<=126) ? (char)c : '?',
8699                             (int)c,
8700			     (Py_ssize_t)(fmt - 1 -
8701					  PyUnicode_AS_UNICODE(uformat)));
8702		goto onError;
8703	    }
8704	    if (sign) {
8705		if (*pbuf == '-' || *pbuf == '+') {
8706		    sign = *pbuf++;
8707		    len--;
8708		}
8709		else if (flags & F_SIGN)
8710		    sign = '+';
8711		else if (flags & F_BLANK)
8712		    sign = ' ';
8713		else
8714		    sign = 0;
8715	    }
8716	    if (width < len)
8717		width = len;
8718	    if (rescnt - (sign != 0) < width) {
8719		reslen -= rescnt;
8720		rescnt = width + fmtcnt + 100;
8721		reslen += rescnt;
8722		if (reslen < 0) {
8723		    Py_XDECREF(temp);
8724		    PyErr_NoMemory();
8725		    goto onError;
8726		}
8727		if (_PyUnicode_Resize(&result, reslen) < 0) {
8728		    Py_XDECREF(temp);
8729		    goto onError;
8730		}
8731		res = PyUnicode_AS_UNICODE(result)
8732		    + reslen - rescnt;
8733	    }
8734	    if (sign) {
8735		if (fill != ' ')
8736		    *res++ = sign;
8737		rescnt--;
8738		if (width > len)
8739		    width--;
8740	    }
8741	    if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
8742		assert(pbuf[0] == '0');
8743		assert(pbuf[1] == c);
8744		if (fill != ' ') {
8745		    *res++ = *pbuf++;
8746		    *res++ = *pbuf++;
8747		}
8748		rescnt -= 2;
8749		width -= 2;
8750		if (width < 0)
8751		    width = 0;
8752		len -= 2;
8753	    }
8754	    if (width > len && !(flags & F_LJUST)) {
8755		do {
8756		    --rescnt;
8757		    *res++ = fill;
8758		} while (--width > len);
8759	    }
8760	    if (fill == ' ') {
8761		if (sign)
8762		    *res++ = sign;
8763		if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
8764		    assert(pbuf[0] == '0');
8765		    assert(pbuf[1] == c);
8766		    *res++ = *pbuf++;
8767		    *res++ = *pbuf++;
8768		}
8769	    }
8770	    Py_UNICODE_COPY(res, pbuf, len);
8771	    res += len;
8772	    rescnt -= len;
8773	    while (--width >= len) {
8774		--rescnt;
8775		*res++ = ' ';
8776	    }
8777	    if (dict && (argidx < arglen) && c != '%') {
8778		PyErr_SetString(PyExc_TypeError,
8779				"not all arguments converted during string formatting");
8780                Py_XDECREF(temp);
8781		goto onError;
8782	    }
8783	    Py_XDECREF(temp);
8784	} /* '%' */
8785    } /* until end */
8786    if (argidx < arglen && !dict) {
8787	PyErr_SetString(PyExc_TypeError,
8788			"not all arguments converted during string formatting");
8789	goto onError;
8790    }
8791
8792    if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8793	goto onError;
8794    if (args_owned) {
8795	Py_DECREF(args);
8796    }
8797    Py_DECREF(uformat);
8798    return (PyObject *)result;
8799
8800 onError:
8801    Py_XDECREF(result);
8802    Py_DECREF(uformat);
8803    if (args_owned) {
8804	Py_DECREF(args);
8805    }
8806    return NULL;
8807}
8808
8809static PyObject *
8810unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8811
8812static PyObject *
8813unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8814{
8815        PyObject *x = NULL;
8816	static char *kwlist[] = {"object", "encoding", "errors", 0};
8817	char *encoding = NULL;
8818	char *errors = NULL;
8819
8820	if (type != &PyUnicode_Type)
8821		return unicode_subtype_new(type, args, kwds);
8822	if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8823					  kwlist, &x, &encoding, &errors))
8824	    return NULL;
8825	if (x == NULL)
8826		return (PyObject *)_PyUnicode_New(0);
8827	if (encoding == NULL && errors == NULL)
8828	    return PyObject_Unicode(x);
8829	else
8830	return PyUnicode_FromEncodedObject(x, encoding, errors);
8831}
8832
8833static PyObject *
8834unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8835{
8836	PyUnicodeObject *tmp, *pnew;
8837	Py_ssize_t n;
8838
8839	assert(PyType_IsSubtype(type, &PyUnicode_Type));
8840	tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8841	if (tmp == NULL)
8842		return NULL;
8843	assert(PyUnicode_Check(tmp));
8844	pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8845	if (pnew == NULL) {
8846		Py_DECREF(tmp);
8847		return NULL;
8848	}
8849	pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8850	if (pnew->str == NULL) {
8851		_Py_ForgetReference((PyObject *)pnew);
8852		PyObject_Del(pnew);
8853		Py_DECREF(tmp);
8854		return PyErr_NoMemory();
8855	}
8856	Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8857	pnew->length = n;
8858	pnew->hash = tmp->hash;
8859	Py_DECREF(tmp);
8860	return (PyObject *)pnew;
8861}
8862
8863PyDoc_STRVAR(unicode_doc,
8864"str(string [, encoding[, errors]]) -> object\n\
8865\n\
8866Create a new string object from the given encoded string.\n\
8867encoding defaults to the current default string encoding.\n\
8868errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
8869
8870static PyObject *unicode_iter(PyObject *seq);
8871
8872PyTypeObject PyUnicode_Type = {
8873    PyVarObject_HEAD_INIT(&PyType_Type, 0)
8874    "str", 				/* tp_name */
8875    sizeof(PyUnicodeObject), 		/* tp_size */
8876    0, 					/* tp_itemsize */
8877    /* Slots */
8878    (destructor)unicode_dealloc, 	/* tp_dealloc */
8879    0, 					/* tp_print */
8880    0,				 	/* tp_getattr */
8881    0, 					/* tp_setattr */
8882    0, 					/* tp_compare */
8883    unicode_repr, 			/* tp_repr */
8884    &unicode_as_number, 		/* tp_as_number */
8885    &unicode_as_sequence, 		/* tp_as_sequence */
8886    &unicode_as_mapping, 		/* tp_as_mapping */
8887    (hashfunc) unicode_hash, 		/* tp_hash*/
8888    0, 					/* tp_call*/
8889    (reprfunc) unicode_str,	 	/* tp_str */
8890    PyObject_GenericGetAttr, 		/* tp_getattro */
8891    0,			 		/* tp_setattro */
8892    0, 					/* tp_as_buffer */
8893    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
8894        Py_TPFLAGS_UNICODE_SUBCLASS,	/* tp_flags */
8895    unicode_doc,			/* tp_doc */
8896    0,					/* tp_traverse */
8897    0,					/* tp_clear */
8898    PyUnicode_RichCompare,		/* tp_richcompare */
8899    0,					/* tp_weaklistoffset */
8900    unicode_iter,			/* tp_iter */
8901    0,					/* tp_iternext */
8902    unicode_methods,			/* tp_methods */
8903    0,					/* tp_members */
8904    0,					/* tp_getset */
8905    &PyBaseString_Type,			/* tp_base */
8906    0,					/* tp_dict */
8907    0,					/* tp_descr_get */
8908    0,					/* tp_descr_set */
8909    0,					/* tp_dictoffset */
8910    0,					/* tp_init */
8911    0,					/* tp_alloc */
8912    unicode_new,			/* tp_new */
8913    PyObject_Del,      		/* tp_free */
8914};
8915
8916/* Initialize the Unicode implementation */
8917
8918void _PyUnicode_Init(void)
8919{
8920    int i;
8921
8922    /* XXX - move this array to unicodectype.c ? */
8923    Py_UNICODE linebreak[] = {
8924        0x000A, /* LINE FEED */
8925        0x000D, /* CARRIAGE RETURN */
8926        0x001C, /* FILE SEPARATOR */
8927        0x001D, /* GROUP SEPARATOR */
8928        0x001E, /* RECORD SEPARATOR */
8929        0x0085, /* NEXT LINE */
8930        0x2028, /* LINE SEPARATOR */
8931        0x2029, /* PARAGRAPH SEPARATOR */
8932    };
8933
8934    /* Init the implementation */
8935    unicode_freelist = NULL;
8936    unicode_freelist_size = 0;
8937    unicode_empty = _PyUnicode_New(0);
8938    if (!unicode_empty)
8939	return;
8940
8941    for (i = 0; i < 256; i++)
8942	unicode_latin1[i] = NULL;
8943    if (PyType_Ready(&PyUnicode_Type) < 0)
8944	Py_FatalError("Can't initialize 'unicode'");
8945
8946    /* initialize the linebreak bloom filter */
8947    bloom_linebreak = make_bloom_mask(
8948        linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8949        );
8950
8951    PyType_Ready(&EncodingMapType);
8952}
8953
8954/* Finalize the Unicode implementation */
8955
8956void
8957_PyUnicode_Fini(void)
8958{
8959    PyUnicodeObject *u;
8960    int i;
8961
8962    Py_XDECREF(unicode_empty);
8963    unicode_empty = NULL;
8964
8965    for (i = 0; i < 256; i++) {
8966	if (unicode_latin1[i]) {
8967	    Py_DECREF(unicode_latin1[i]);
8968	    unicode_latin1[i] = NULL;
8969	}
8970    }
8971
8972    for (u = unicode_freelist; u != NULL;) {
8973	PyUnicodeObject *v = u;
8974	u = *(PyUnicodeObject **)u;
8975	if (v->str)
8976	    PyMem_DEL(v->str);
8977	Py_XDECREF(v->defenc);
8978	PyObject_Del(v);
8979    }
8980    unicode_freelist = NULL;
8981    unicode_freelist_size = 0;
8982}
8983
8984void
8985PyUnicode_InternInPlace(PyObject **p)
8986{
8987	register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
8988	PyObject *t;
8989	if (s == NULL || !PyUnicode_Check(s))
8990		Py_FatalError(
8991		    "PyUnicode_InternInPlace: unicode strings only please!");
8992	/* If it's a subclass, we don't really know what putting
8993	   it in the interned dict might do. */
8994	if (!PyUnicode_CheckExact(s))
8995		return;
8996	if (PyUnicode_CHECK_INTERNED(s))
8997		return;
8998	if (interned == NULL) {
8999		interned = PyDict_New();
9000		if (interned == NULL) {
9001			PyErr_Clear(); /* Don't leave an exception */
9002			return;
9003		}
9004	}
9005	/* It might be that the GetItem call fails even
9006	   though the key is present in the dictionary,
9007	   namely when this happens during a stack overflow. */
9008	Py_ALLOW_RECURSION
9009	t = PyDict_GetItem(interned, (PyObject *)s);
9010	Py_END_ALLOW_RECURSION
9011
9012	if (t) {
9013		Py_INCREF(t);
9014		Py_DECREF(*p);
9015		*p = t;
9016		return;
9017	}
9018
9019	PyThreadState_GET()->recursion_critical = 1;
9020	if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9021		PyErr_Clear();
9022		PyThreadState_GET()->recursion_critical = 0;
9023		return;
9024	}
9025	PyThreadState_GET()->recursion_critical = 0;
9026	/* The two references in interned are not counted by refcnt.
9027	   The deallocator will take care of this */
9028	Py_Refcnt(s) -= 2;
9029	PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
9030}
9031
9032void
9033PyUnicode_InternImmortal(PyObject **p)
9034{
9035	PyUnicode_InternInPlace(p);
9036	if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9037		PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9038		Py_INCREF(*p);
9039	}
9040}
9041
9042PyObject *
9043PyUnicode_InternFromString(const char *cp)
9044{
9045	PyObject *s = PyUnicode_FromString(cp);
9046	if (s == NULL)
9047		return NULL;
9048	PyUnicode_InternInPlace(&s);
9049	return s;
9050}
9051
9052void _Py_ReleaseInternedUnicodeStrings(void)
9053{
9054	PyObject *keys;
9055	PyUnicodeObject *s;
9056	Py_ssize_t i, n;
9057	Py_ssize_t immortal_size = 0, mortal_size = 0;
9058
9059	if (interned == NULL || !PyDict_Check(interned))
9060		return;
9061	keys = PyDict_Keys(interned);
9062	if (keys == NULL || !PyList_Check(keys)) {
9063		PyErr_Clear();
9064		return;
9065	}
9066
9067	/* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9068	   detector, interned unicode strings are not forcibly deallocated;
9069	   rather, we give them their stolen references back, and then clear
9070	   and DECREF the interned dict. */
9071
9072	n = PyList_GET_SIZE(keys);
9073	fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9074		n);
9075	for (i = 0; i < n; i++) {
9076		s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9077		switch (s->state) {
9078		case SSTATE_NOT_INTERNED:
9079			/* XXX Shouldn't happen */
9080			break;
9081		case SSTATE_INTERNED_IMMORTAL:
9082			Py_Refcnt(s) += 1;
9083			immortal_size += s->length;
9084			break;
9085		case SSTATE_INTERNED_MORTAL:
9086			Py_Refcnt(s) += 2;
9087			mortal_size += s->length;
9088			break;
9089		default:
9090			Py_FatalError("Inconsistent interned string state.");
9091		}
9092		s->state = SSTATE_NOT_INTERNED;
9093	}
9094	fprintf(stderr, "total size of all interned strings: "
9095			"%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9096			"mortal/immortal\n", mortal_size, immortal_size);
9097	Py_DECREF(keys);
9098	PyDict_Clear(interned);
9099	Py_DECREF(interned);
9100	interned = NULL;
9101}
9102
9103
9104/********************* Unicode Iterator **************************/
9105
9106typedef struct {
9107	PyObject_HEAD
9108	Py_ssize_t it_index;
9109	PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
9110} unicodeiterobject;
9111
9112static void
9113unicodeiter_dealloc(unicodeiterobject *it)
9114{
9115	_PyObject_GC_UNTRACK(it);
9116	Py_XDECREF(it->it_seq);
9117	PyObject_GC_Del(it);
9118}
9119
9120static int
9121unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9122{
9123	Py_VISIT(it->it_seq);
9124	return 0;
9125}
9126
9127static PyObject *
9128unicodeiter_next(unicodeiterobject *it)
9129{
9130	PyUnicodeObject *seq;
9131	PyObject *item;
9132
9133	assert(it != NULL);
9134	seq = it->it_seq;
9135	if (seq == NULL)
9136		return NULL;
9137	assert(PyUnicode_Check(seq));
9138
9139	if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9140		item = PyUnicode_FromUnicode(
9141                    PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
9142		if (item != NULL)
9143			++it->it_index;
9144		return item;
9145	}
9146
9147	Py_DECREF(seq);
9148	it->it_seq = NULL;
9149	return NULL;
9150}
9151
9152static PyObject *
9153unicodeiter_len(unicodeiterobject *it)
9154{
9155	Py_ssize_t len = 0;
9156	if (it->it_seq)
9157		len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9158	return PyInt_FromSsize_t(len);
9159}
9160
9161PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9162
9163static PyMethodDef unicodeiter_methods[] = {
9164	{"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
9165         length_hint_doc},
9166 	{NULL,		NULL}		/* sentinel */
9167};
9168
9169PyTypeObject PyUnicodeIter_Type = {
9170	PyVarObject_HEAD_INIT(&PyType_Type, 0)
9171	"unicodeiterator",			/* tp_name */
9172	sizeof(unicodeiterobject),		/* tp_basicsize */
9173	0,					/* tp_itemsize */
9174	/* methods */
9175	(destructor)unicodeiter_dealloc,	/* tp_dealloc */
9176	0,					/* tp_print */
9177	0,					/* tp_getattr */
9178	0,					/* tp_setattr */
9179	0,					/* tp_compare */
9180	0,					/* tp_repr */
9181	0,					/* tp_as_number */
9182	0,					/* tp_as_sequence */
9183	0,					/* tp_as_mapping */
9184	0,					/* tp_hash */
9185	0,					/* tp_call */
9186	0,					/* tp_str */
9187	PyObject_GenericGetAttr,		/* tp_getattro */
9188	0,					/* tp_setattro */
9189	0,					/* tp_as_buffer */
9190	Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9191	0,					/* tp_doc */
9192	(traverseproc)unicodeiter_traverse,	/* tp_traverse */
9193	0,					/* tp_clear */
9194	0,					/* tp_richcompare */
9195	0,					/* tp_weaklistoffset */
9196	PyObject_SelfIter,			/* tp_iter */
9197	(iternextfunc)unicodeiter_next,		/* tp_iternext */
9198	unicodeiter_methods,			/* tp_methods */
9199	0,
9200};
9201
9202static PyObject *
9203unicode_iter(PyObject *seq)
9204{
9205	unicodeiterobject *it;
9206
9207	if (!PyUnicode_Check(seq)) {
9208		PyErr_BadInternalCall();
9209		return NULL;
9210	}
9211	it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9212	if (it == NULL)
9213		return NULL;
9214	it->it_index = 0;
9215	Py_INCREF(seq);
9216	it->it_seq = (PyUnicodeObject *)seq;
9217	_PyObject_GC_TRACK(it);
9218	return (PyObject *)it;
9219}
9220
9221size_t
9222Py_UNICODE_strlen(const Py_UNICODE *u)
9223{
9224    int res = 0;
9225    while(*u++)
9226        res++;
9227    return res;
9228}
9229
9230Py_UNICODE*
9231Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9232{
9233    Py_UNICODE *u = s1;
9234    while ((*u++ = *s2++));
9235    return s1;
9236}
9237
9238Py_UNICODE*
9239Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9240{
9241    Py_UNICODE *u = s1;
9242    while ((*u++ = *s2++))
9243        if (n-- == 0)
9244            break;
9245    return s1;
9246}
9247
9248int
9249Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9250{
9251    while (*s1 && *s2 && *s1 == *s2)
9252        s1++, s2++;
9253    if (*s1 && *s2)
9254        return (*s1 < *s2) ? -1 : +1;
9255    if (*s1)
9256        return 1;
9257    if (*s2)
9258        return -1;
9259    return 0;
9260}
9261
9262Py_UNICODE*
9263Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9264{
9265    const Py_UNICODE *p;
9266    for (p = s; *p; p++)
9267        if (*p == c)
9268            return (Py_UNICODE*)p;
9269    return NULL;
9270}
9271
9272
9273#ifdef __cplusplus
9274}
9275#endif
9276
9277
9278/*
9279Local variables:
9280c-basic-offset: 4
9281indent-tabs-mode: nil
9282End:
9283*/
9284