unicodeobject.c revision 72b710a59617ebe6dd1c41613d2c7eb81702efd9
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5Unicode Integration Proposal (see file Misc/unicode.txt).
6
7Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
10Copyright (c) Corporation for National Research Initiatives.
11
12--------------------------------------------------------------------
13The original string type implementation is:
14
15    Copyright (c) 1999 by Secret Labs AB
16    Copyright (c) 1999 by Fredrik Lundh
17
18By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
41
42#define PY_SSIZE_T_CLEAN
43#include "Python.h"
44#include "bytes_methods.h"
45
46#include "unicodeobject.h"
47#include "ucnhash.h"
48
49#include "formatter_unicode.h"
50
51#ifdef MS_WINDOWS
52#include <windows.h>
53#endif
54
/* Upper bound on the number of Unicode objects parked on the free list */

#define PyUnicode_MAXFREELIST 1024

/* Threshold for the free list "keep alive" optimization.

   An object put on the free list keeps its character buffer allocated
   as long as its length is below this limit; longer buffers are
   released right away.  This saves malloc() calls for small Unicode
   objects.

   The worst case is PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   A limit of 0 effectively switches the feature off.

   Note: This is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order selection; little endian is assumed unless the build
   defines WORDS_BIGENDIAN */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
85
86/* --- Globals ------------------------------------------------------------
87
88   The globals are initialized by the _PyUnicode_Init() API and should
89   not be used before calling that API.
90
91*/
92
93
94#ifdef __cplusplus
95extern "C" {
96#endif
97
98/* This dictionary holds all interned unicode strings.  Note that references
99   to strings in this dictionary are *not* counted in the string's ob_refcnt.
100   When the interned string reaches a refcnt of 0 the string deallocation
101   function will delete the reference from this dictionary.
102
103   Another way to look at this is that to say that the actual reference
104   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
105*/
106static PyObject *interned;
107
108/* Free list for Unicode objects */
109static PyUnicodeObject *free_list;
110static int numfree;
111
112/* The empty Unicode object is shared to improve performance. */
113static PyUnicodeObject *unicode_empty;
114
115/* Single character Unicode strings in the Latin-1 range are being
116   shared as well. */
117static PyUnicodeObject *unicode_latin1[256];
118
119/* Default encoding to use and assume when NULL is passed as encoding
120   parameter; it is fixed to "utf-8".  Always use the
121   PyUnicode_GetDefaultEncoding() API to access this global.
122
123   Don't forget to alter Py_FileSystemDefaultEncoding if you change the
124   hard coded default!
125*/
126static const char unicode_default_encoding[] = "utf-8";
127
/* Lookup table for fast detection of the most frequent whitespace
   characters: entry [c] is 1 when the ASCII code c is whitespace.

   Whitespace entries:
     0x09 HORIZONTAL TABULATION   0x0A LINE FEED
     0x0B VERTICAL TABULATION     0x0C FORM FEED
     0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR          0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR        0x1F UNIT SEPARATOR
     0x20 SPACE
*/
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
158
/* Lookup table for fast detection of ASCII line breaks: entry [c] is 1
   when the ASCII code c is a line break.  The table is only ever read,
   so it is declared const.

   Line break entries:
     0x0A LINE FEED         0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR    0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR
*/
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
184
185
186Py_UNICODE
187PyUnicode_GetMax(void)
188{
189#ifdef Py_UNICODE_WIDE
190	return 0x10FFFF;
191#else
192	/* This is actually an illegal character, so it should
193	   not be passed to unichr. */
194	return 0xFFFF;
195#endif
196}
197
198/* --- Bloom Filters ----------------------------------------------------- */
199
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, a single bitmask is used, with the 5 least
   significant bits of each Unicode character serving as the bit index. */
203
204/* the linebreak mask is set up by Unicode_Init below */
205
/* Integer type holding a bloom bitmask (one bit per 5-bit bucket). */
#define BLOOM_MASK unsigned long

/* Mask covering the line break characters. */
static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bucket of `ch` (its 5 least significant bits) is set
   in `mask`.  The shift is done on an unsigned long so that bucket 31
   does not shift into the sign bit of an int, and `mask` is
   parenthesized so that compound mask expressions evaluate as a unit. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Line break test: table lookup for ASCII, otherwise the bloom mask is
   used as a cheap pre-filter in front of Py_UNICODE_ISLINEBREAK(). */
#define BLOOM_LINEBREAK(ch) \
    ((ch) < 128U ? ascii_linebreak[(ch)] : \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
215
216Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
217{
218    /* calculate simple bloom-style bitmask for a given unicode string */
219
220    long mask;
221    Py_ssize_t i;
222
223    mask = 0;
224    for (i = 0; i < len; i++)
225        mask |= (1 << (ptr[i] & 0x1F));
226
227    return mask;
228}
229
230Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
231{
232    Py_ssize_t i;
233
234    for (i = 0; i < setlen; i++)
235        if (set[i] == chr)
236            return 1;
237
238    return 0;
239}
240
/* Membership test of `chr` in `set`: the bloom `mask` rejects most
   non-members cheaply before the linear search runs.  The expansion is
   fully parenthesized so it stays a single operand when combined with
   `!`, `||` or other operators at the call site. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
243
244/* --- Unicode Object ----------------------------------------------------- */
245
246static
247int unicode_resize(register PyUnicodeObject *unicode,
248                      Py_ssize_t length)
249{
250    void *oldstr;
251
252    /* Shortcut if there's nothing much to do. */
253    if (unicode->length == length)
254	goto reset;
255
256    /* Resizing shared object (unicode_empty or single character
257       objects) in-place is not allowed. Use PyUnicode_Resize()
258       instead ! */
259
260    if (unicode == unicode_empty ||
261	(unicode->length == 1 &&
262	 unicode->str[0] < 256U &&
263	 unicode_latin1[unicode->str[0]] == unicode)) {
264        PyErr_SetString(PyExc_SystemError,
265                        "can't resize shared unicode objects");
266        return -1;
267    }
268
269    /* We allocate one more byte to make sure the string is Ux0000 terminated.
270       The overallocation is also used by fastsearch, which assumes that it's
271       safe to look at str[length] (without making any assumptions about what
272       it contains). */
273
274    oldstr = unicode->str;
275    unicode->str = PyObject_REALLOC(unicode->str,
276				    sizeof(Py_UNICODE) * (length + 1));
277    if (!unicode->str) {
278	unicode->str = (Py_UNICODE *)oldstr;
279        PyErr_NoMemory();
280        return -1;
281    }
282    unicode->str[length] = 0;
283    unicode->length = length;
284
285 reset:
286    /* Reset the object caches */
287    if (unicode->defenc) {
288        Py_DECREF(unicode->defenc);
289        unicode->defenc = NULL;
290    }
291    unicode->hash = -1;
292
293    return 0;
294}
295
296/* We allocate one more byte to make sure the string is
297   Ux0000 terminated; some code (e.g. new_identifier)
298   relies on that.
299
300   XXX This allocator could further be enhanced by assuring that the
301       free list never reduces its size below 1.
302
303*/
304
305static
306PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
307{
308    register PyUnicodeObject *unicode;
309
310    /* Optimization for empty strings */
311    if (length == 0 && unicode_empty != NULL) {
312        Py_INCREF(unicode_empty);
313        return unicode_empty;
314    }
315
316    /* Unicode freelist & memory allocation */
317    if (free_list) {
318        unicode = free_list;
319        free_list = *(PyUnicodeObject **)unicode;
320        numfree--;
321	if (unicode->str) {
322	    /* Keep-Alive optimization: we only upsize the buffer,
323	       never downsize it. */
324	    if ((unicode->length < length) &&
325                unicode_resize(unicode, length) < 0) {
326		PyObject_DEL(unicode->str);
327		goto onError;
328	    }
329	}
330        else {
331	    size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
332	    unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
333        }
334        PyObject_INIT(unicode, &PyUnicode_Type);
335    }
336    else {
337	size_t new_size;
338        unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
339        if (unicode == NULL)
340            return NULL;
341	new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
342	unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
343    }
344
345    if (!unicode->str) {
346	PyErr_NoMemory();
347	goto onError;
348    }
349    /* Initialize the first element to guard against cases where
350     * the caller fails before initializing str -- unicode_resize()
351     * reads str[0], and the Keep-Alive optimization can keep memory
352     * allocated for str alive across a call to unicode_dealloc(unicode).
353     * We don't want unicode_resize to read uninitialized memory in
354     * that case.
355     */
356    unicode->str[0] = 0;
357    unicode->str[length] = 0;
358    unicode->length = length;
359    unicode->hash = -1;
360    unicode->state = 0;
361    unicode->defenc = NULL;
362    return unicode;
363
364 onError:
365    _Py_ForgetReference((PyObject *)unicode);
366    PyObject_Del(unicode);
367    return NULL;
368}
369
370static
371void unicode_dealloc(register PyUnicodeObject *unicode)
372{
373    switch (PyUnicode_CHECK_INTERNED(unicode)) {
374        case SSTATE_NOT_INTERNED:
375            break;
376
377        case SSTATE_INTERNED_MORTAL:
378            /* revive dead object temporarily for DelItem */
379            Py_REFCNT(unicode) = 3;
380            if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
381                Py_FatalError(
382                    "deletion of interned unicode string failed");
383            break;
384
385        case SSTATE_INTERNED_IMMORTAL:
386            Py_FatalError("Immortal interned unicode string died.");
387
388        default:
389            Py_FatalError("Inconsistent interned unicode string state.");
390    }
391
392    if (PyUnicode_CheckExact(unicode) &&
393	numfree < PyUnicode_MAXFREELIST) {
394        /* Keep-Alive optimization */
395	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
396	    PyObject_DEL(unicode->str);
397	    unicode->str = NULL;
398	    unicode->length = 0;
399	}
400	if (unicode->defenc) {
401	    Py_DECREF(unicode->defenc);
402	    unicode->defenc = NULL;
403	}
404	/* Add to free list */
405        *(PyUnicodeObject **)unicode = free_list;
406        free_list = unicode;
407        numfree++;
408    }
409    else {
410	PyObject_DEL(unicode->str);
411	Py_XDECREF(unicode->defenc);
412	Py_TYPE(unicode)->tp_free((PyObject *)unicode);
413    }
414}
415
416int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
417{
418    register PyUnicodeObject *v;
419
420    /* Argument checks */
421    if (unicode == NULL) {
422	PyErr_BadInternalCall();
423	return -1;
424    }
425    v = (PyUnicodeObject *)*unicode;
426    if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
427	PyErr_BadInternalCall();
428	return -1;
429    }
430
431    /* Resizing unicode_empty and single character objects is not
432       possible since these are being shared. We simply return a fresh
433       copy with the same Unicode content. */
434    if (v->length != length &&
435	(v == unicode_empty || v->length == 1)) {
436	PyUnicodeObject *w = _PyUnicode_New(length);
437	if (w == NULL)
438	    return -1;
439	Py_UNICODE_COPY(w->str, v->str,
440			length < v->length ? length : v->length);
441	Py_DECREF(*unicode);
442	*unicode = (PyObject *)w;
443	return 0;
444    }
445
446    /* Note that we don't have to modify *unicode for unshared Unicode
447       objects, since we can modify them in-place. */
448    return unicode_resize(v, length);
449}
450
451/* Internal API for use in unicodeobject.c only ! */
452#define _PyUnicode_Resize(unicodevar, length) \
453        PyUnicode_Resize(((PyObject **)(unicodevar)), length)
454
455PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
456				Py_ssize_t size)
457{
458    PyUnicodeObject *unicode;
459
460    /* If the Unicode data is known at construction time, we can apply
461       some optimizations which share commonly used objects. */
462    if (u != NULL) {
463
464	/* Optimization for empty strings */
465	if (size == 0 && unicode_empty != NULL) {
466	    Py_INCREF(unicode_empty);
467	    return (PyObject *)unicode_empty;
468	}
469
470	/* Single character Unicode objects in the Latin-1 range are
471	   shared when using this constructor */
472	if (size == 1 && *u < 256) {
473	    unicode = unicode_latin1[*u];
474	    if (!unicode) {
475		unicode = _PyUnicode_New(1);
476		if (!unicode)
477		    return NULL;
478		unicode->str[0] = *u;
479		unicode_latin1[*u] = unicode;
480	    }
481	    Py_INCREF(unicode);
482	    return (PyObject *)unicode;
483	}
484    }
485
486    unicode = _PyUnicode_New(size);
487    if (!unicode)
488        return NULL;
489
490    /* Copy the Unicode data into the new object */
491    if (u != NULL)
492	Py_UNICODE_COPY(unicode->str, u, size);
493
494    return (PyObject *)unicode;
495}
496
497PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
498{
499    PyUnicodeObject *unicode;
500
501	if (size < 0) {
502		PyErr_SetString(PyExc_SystemError,
503		    "Negative size passed to PyUnicode_FromStringAndSize");
504		return NULL;
505	}
506
507    /* If the Unicode data is known at construction time, we can apply
508       some optimizations which share commonly used objects.
509       Also, this means the input must be UTF-8, so fall back to the
510       UTF-8 decoder at the end. */
511    if (u != NULL) {
512
513	/* Optimization for empty strings */
514	if (size == 0 && unicode_empty != NULL) {
515	    Py_INCREF(unicode_empty);
516	    return (PyObject *)unicode_empty;
517	}
518
519	/* Single characters are shared when using this constructor.
520           Restrict to ASCII, since the input must be UTF-8. */
521	if (size == 1 && Py_CHARMASK(*u) < 128) {
522	    unicode = unicode_latin1[Py_CHARMASK(*u)];
523	    if (!unicode) {
524		unicode = _PyUnicode_New(1);
525		if (!unicode)
526		    return NULL;
527		unicode->str[0] = Py_CHARMASK(*u);
528		unicode_latin1[Py_CHARMASK(*u)] = unicode;
529	    }
530	    Py_INCREF(unicode);
531	    return (PyObject *)unicode;
532	}
533
534        return PyUnicode_DecodeUTF8(u, size, NULL);
535    }
536
537    unicode = _PyUnicode_New(size);
538    if (!unicode)
539        return NULL;
540
541    return (PyObject *)unicode;
542}
543
544PyObject *PyUnicode_FromString(const char *u)
545{
546    size_t size = strlen(u);
547    if (size > PY_SSIZE_T_MAX) {
548        PyErr_SetString(PyExc_OverflowError, "input too long");
549        return NULL;
550    }
551
552    return PyUnicode_FromStringAndSize(u, size);
553}
554
555#ifdef HAVE_WCHAR_H
556
557PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
558				 Py_ssize_t size)
559{
560    PyUnicodeObject *unicode;
561
562    if (w == NULL) {
563        if (size == 0)
564            return PyUnicode_FromStringAndSize(NULL, 0);
565	PyErr_BadInternalCall();
566	return NULL;
567    }
568
569    if (size == -1) {
570        size = wcslen(w);
571    }
572
573    unicode = _PyUnicode_New(size);
574    if (!unicode)
575        return NULL;
576
577    /* Copy the wchar_t data into the new object */
578#ifdef HAVE_USABLE_WCHAR_T
579    memcpy(unicode->str, w, size * sizeof(wchar_t));
580#else
581    {
582	register Py_UNICODE *u;
583	register Py_ssize_t i;
584	u = PyUnicode_AS_UNICODE(unicode);
585	for (i = size; i > 0; i--)
586	    *u++ = *w++;
587    }
588#endif
589
590    return (PyObject *)unicode;
591}
592
593static void
594makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
595{
596	*fmt++ = '%';
597	if (width) {
598		if (zeropad)
599			*fmt++ = '0';
600		fmt += sprintf(fmt, "%d", width);
601	}
602	if (precision)
603		fmt += sprintf(fmt, ".%d", precision);
604	if (longflag)
605		*fmt++ = 'l';
606	else if (size_tflag) {
607		char *f = PY_FORMAT_SIZE_T;
608		while (*f)
609			*fmt++ = *f++;
610	}
611	*fmt++ = c;
612	*fmt = '\0';
613}
614
615#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
616
617PyObject *
618PyUnicode_FromFormatV(const char *format, va_list vargs)
619{
620	va_list count;
621	Py_ssize_t callcount = 0;
622	PyObject **callresults = NULL;
623	PyObject **callresult = NULL;
624	Py_ssize_t n = 0;
625	int width = 0;
626	int precision = 0;
627	int zeropad;
628	const char* f;
629	Py_UNICODE *s;
630	PyObject *string;
631	/* used by sprintf */
632	char buffer[21];
633	/* use abuffer instead of buffer, if we need more space
634	 * (which can happen if there's a format specifier with width). */
635	char *abuffer = NULL;
636	char *realbuffer;
637	Py_ssize_t abuffersize = 0;
638	char fmt[60]; /* should be enough for %0width.precisionld */
639	const char *copy;
640
641#ifdef VA_LIST_IS_ARRAY
642	Py_MEMCPY(count, vargs, sizeof(va_list));
643#else
644#ifdef  __va_copy
645	__va_copy(count, vargs);
646#else
647	count = vargs;
648#endif
649#endif
650	/* step 1: count the number of %S/%R format specifications
651	 * (we call PyObject_Str()/PyObject_Repr() for these objects
652	 * once during step 3 and put the result in an array) */
653	for (f = format; *f; f++) {
654		if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
655			++callcount;
656	}
657	/* step 2: allocate memory for the results of
658	 * PyObject_Str()/PyObject_Repr() calls */
659	if (callcount) {
660		callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
661		if (!callresults) {
662			PyErr_NoMemory();
663			return NULL;
664		}
665		callresult = callresults;
666	}
667	/* step 3: figure out how large a buffer we need */
668	for (f = format; *f; f++) {
669		if (*f == '%') {
670			const char* p = f;
671			width = 0;
672			while (ISDIGIT((unsigned)*f))
673				width = (width*10) + *f++ - '0';
674			while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
675				;
676
677			/* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
678			 * they don't affect the amount of space we reserve.
679			 */
680			if ((*f == 'l' || *f == 'z') &&
681					(f[1] == 'd' || f[1] == 'u'))
682                                ++f;
683
684			switch (*f) {
685			case 'c':
686				(void)va_arg(count, int);
687				/* fall through... */
688			case '%':
689				n++;
690				break;
691			case 'd': case 'u': case 'i': case 'x':
692				(void) va_arg(count, int);
693				/* 20 bytes is enough to hold a 64-bit
694				   integer.  Decimal takes the most space.
695				   This isn't enough for octal.
696				   If a width is specified we need more
697				   (which we allocate later). */
698				if (width < 20)
699					width = 20;
700				n += width;
701				if (abuffersize < width)
702					abuffersize = width;
703				break;
704			case 's':
705			{
706				/* UTF-8 */
707				unsigned char*s;
708				s = va_arg(count, unsigned char*);
709				while (*s) {
710					if (*s < 128) {
711						n++; s++;
712					} else if (*s < 0xc0) {
713						/* invalid UTF-8 */
714						n++; s++;
715					} else if (*s < 0xc0) {
716						n++;
717						s++; if(!*s)break;
718						s++;
719					} else if (*s < 0xe0) {
720						n++;
721						s++; if(!*s)break;
722						s++; if(!*s)break;
723						s++;
724					} else {
725						#ifdef Py_UNICODE_WIDE
726						n++;
727						#else
728						n+=2;
729						#endif
730						s++; if(!*s)break;
731						s++; if(!*s)break;
732						s++; if(!*s)break;
733						s++;
734					}
735				}
736				break;
737			}
738			case 'U':
739			{
740				PyObject *obj = va_arg(count, PyObject *);
741				assert(obj && PyUnicode_Check(obj));
742				n += PyUnicode_GET_SIZE(obj);
743				break;
744			}
745			case 'V':
746			{
747				PyObject *obj = va_arg(count, PyObject *);
748				const char *str = va_arg(count, const char *);
749				assert(obj || str);
750				assert(!obj || PyUnicode_Check(obj));
751				if (obj)
752					n += PyUnicode_GET_SIZE(obj);
753				else
754					n += strlen(str);
755				break;
756			}
757			case 'S':
758			{
759				PyObject *obj = va_arg(count, PyObject *);
760				PyObject *str;
761				assert(obj);
762				str = PyObject_Str(obj);
763				if (!str)
764					goto fail;
765				n += PyUnicode_GET_SIZE(str);
766				/* Remember the str and switch to the next slot */
767				*callresult++ = str;
768				break;
769			}
770			case 'R':
771			{
772				PyObject *obj = va_arg(count, PyObject *);
773				PyObject *repr;
774				assert(obj);
775				repr = PyObject_Repr(obj);
776				if (!repr)
777					goto fail;
778				n += PyUnicode_GET_SIZE(repr);
779				/* Remember the repr and switch to the next slot */
780				*callresult++ = repr;
781				break;
782			}
783			case 'p':
784				(void) va_arg(count, int);
785				/* maximum 64-bit pointer representation:
786				 * 0xffffffffffffffff
787				 * so 19 characters is enough.
788				 * XXX I count 18 -- what's the extra for?
789				 */
790				n += 19;
791				break;
792			default:
793				/* if we stumble upon an unknown
794				   formatting code, copy the rest of
795				   the format string to the output
796				   string. (we cannot just skip the
797				   code, since there's no way to know
798				   what's in the argument list) */
799				n += strlen(p);
800				goto expand;
801			}
802		} else
803			n++;
804	}
805 expand:
806	if (abuffersize > 20) {
807		abuffer = PyObject_Malloc(abuffersize);
808		if (!abuffer) {
809			PyErr_NoMemory();
810			goto fail;
811		}
812		realbuffer = abuffer;
813	}
814	else
815		realbuffer = buffer;
816	/* step 4: fill the buffer */
817	/* Since we've analyzed how much space we need for the worst case,
818	   we don't have to resize the string.
819	   There can be no errors beyond this point. */
820	string = PyUnicode_FromUnicode(NULL, n);
821	if (!string)
822		goto fail;
823
824	s = PyUnicode_AS_UNICODE(string);
825	callresult = callresults;
826
827	for (f = format; *f; f++) {
828		if (*f == '%') {
829			const char* p = f++;
830			int longflag = 0;
831			int size_tflag = 0;
832			zeropad = (*f == '0');
833			/* parse the width.precision part */
834			width = 0;
835			while (ISDIGIT((unsigned)*f))
836				width = (width*10) + *f++ - '0';
837			precision = 0;
838			if (*f == '.') {
839				f++;
840				while (ISDIGIT((unsigned)*f))
841					precision = (precision*10) + *f++ - '0';
842			}
843			/* handle the long flag, but only for %ld and %lu.
844			   others can be added when necessary. */
845			if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
846				longflag = 1;
847				++f;
848			}
849			/* handle the size_t flag. */
850			if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
851				size_tflag = 1;
852				++f;
853			}
854
855			switch (*f) {
856			case 'c':
857				*s++ = va_arg(vargs, int);
858				break;
859			case 'd':
860				makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
861				if (longflag)
862					sprintf(realbuffer, fmt, va_arg(vargs, long));
863				else if (size_tflag)
864					sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
865				else
866					sprintf(realbuffer, fmt, va_arg(vargs, int));
867				appendstring(realbuffer);
868				break;
869			case 'u':
870				makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
871				if (longflag)
872					sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
873				else if (size_tflag)
874					sprintf(realbuffer, fmt, va_arg(vargs, size_t));
875				else
876					sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
877				appendstring(realbuffer);
878				break;
879			case 'i':
880				makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
881				sprintf(realbuffer, fmt, va_arg(vargs, int));
882				appendstring(realbuffer);
883				break;
884			case 'x':
885				makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
886				sprintf(realbuffer, fmt, va_arg(vargs, int));
887				appendstring(realbuffer);
888				break;
889			case 's':
890			{
891				/* Parameter must be UTF-8 encoded.
892				   In case of encoding errors, use
893				   the replacement character. */
894				PyObject *u;
895				p = va_arg(vargs, char*);
896				u = PyUnicode_DecodeUTF8(p, strlen(p),
897							 "replace");
898				if (!u)
899					goto fail;
900				Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
901						PyUnicode_GET_SIZE(u));
902				s += PyUnicode_GET_SIZE(u);
903				Py_DECREF(u);
904				break;
905			}
906			case 'U':
907			{
908				PyObject *obj = va_arg(vargs, PyObject *);
909				Py_ssize_t size = PyUnicode_GET_SIZE(obj);
910				Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
911				s += size;
912				break;
913			}
914			case 'V':
915			{
916				PyObject *obj = va_arg(vargs, PyObject *);
917				const char *str = va_arg(vargs, const char *);
918				if (obj) {
919					Py_ssize_t size = PyUnicode_GET_SIZE(obj);
920					Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
921					s += size;
922				} else {
923					appendstring(str);
924				}
925				break;
926			}
927			case 'S':
928			case 'R':
929			{
930				Py_UNICODE *ucopy;
931				Py_ssize_t usize;
932				Py_ssize_t upos;
933				/* unused, since we already have the result */
934				(void) va_arg(vargs, PyObject *);
935				ucopy = PyUnicode_AS_UNICODE(*callresult);
936				usize = PyUnicode_GET_SIZE(*callresult);
937				for (upos = 0; upos<usize;)
938					*s++ = ucopy[upos++];
939				/* We're done with the unicode()/repr() => forget it */
940				Py_DECREF(*callresult);
941				/* switch to next unicode()/repr() result */
942				++callresult;
943				break;
944			}
945			case 'p':
946				sprintf(buffer, "%p", va_arg(vargs, void*));
947				/* %p is ill-defined:  ensure leading 0x. */
948				if (buffer[1] == 'X')
949					buffer[1] = 'x';
950				else if (buffer[1] != 'x') {
951					memmove(buffer+2, buffer, strlen(buffer)+1);
952					buffer[0] = '0';
953					buffer[1] = 'x';
954				}
955				appendstring(buffer);
956				break;
957			case '%':
958				*s++ = '%';
959				break;
960			default:
961				appendstring(p);
962				goto end;
963			}
964		} else
965			*s++ = *f;
966	}
967
968 end:
969	if (callresults)
970		PyObject_Free(callresults);
971	if (abuffer)
972		PyObject_Free(abuffer);
973	_PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
974	return string;
975 fail:
976	if (callresults) {
977		PyObject **callresult2 = callresults;
978		while (callresult2 < callresult) {
979			Py_DECREF(*callresult2);
980			++callresult2;
981		}
982		PyObject_Free(callresults);
983	}
984	if (abuffer)
985		PyObject_Free(abuffer);
986	return NULL;
987}
988
989#undef appendstring
990
991PyObject *
992PyUnicode_FromFormat(const char *format, ...)
993{
994	PyObject* ret;
995	va_list vargs;
996
997#ifdef HAVE_STDARG_PROTOTYPES
998	va_start(vargs, format);
999#else
1000	va_start(vargs);
1001#endif
1002	ret = PyUnicode_FromFormatV(format, vargs);
1003	va_end(vargs);
1004	return ret;
1005}
1006
1007Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1008				wchar_t *w,
1009				Py_ssize_t size)
1010{
1011    if (unicode == NULL) {
1012	PyErr_BadInternalCall();
1013	return -1;
1014    }
1015
1016    /* If possible, try to copy the 0-termination as well */
1017    if (size > PyUnicode_GET_SIZE(unicode))
1018	size = PyUnicode_GET_SIZE(unicode) + 1;
1019
1020#ifdef HAVE_USABLE_WCHAR_T
1021    memcpy(w, unicode->str, size * sizeof(wchar_t));
1022#else
1023    {
1024	register Py_UNICODE *u;
1025	register Py_ssize_t i;
1026	u = PyUnicode_AS_UNICODE(unicode);
1027	for (i = size; i > 0; i--)
1028	    *w++ = *u++;
1029    }
1030#endif
1031
1032    if (size > PyUnicode_GET_SIZE(unicode))
1033        return PyUnicode_GET_SIZE(unicode);
1034    else
1035    return size;
1036}
1037
1038#endif
1039
1040PyObject *PyUnicode_FromOrdinal(int ordinal)
1041{
1042    Py_UNICODE s[2];
1043
1044    if (ordinal < 0 || ordinal > 0x10ffff) {
1045	PyErr_SetString(PyExc_ValueError,
1046			"chr() arg not in range(0x110000)");
1047	return NULL;
1048    }
1049
1050#ifndef Py_UNICODE_WIDE
1051    if (ordinal > 0xffff) {
1052        ordinal -= 0x10000;
1053        s[0] = 0xD800 | (ordinal >> 10);
1054        s[1] = 0xDC00 | (ordinal & 0x3FF);
1055        return PyUnicode_FromUnicode(s, 2);
1056    }
1057#endif
1058
1059    s[0] = (Py_UNICODE)ordinal;
1060    return PyUnicode_FromUnicode(s, 1);
1061}
1062
1063PyObject *PyUnicode_FromObject(register PyObject *obj)
1064{
1065    /* XXX Perhaps we should make this API an alias of
1066           PyObject_Str() instead ?! */
1067    if (PyUnicode_CheckExact(obj)) {
1068	Py_INCREF(obj);
1069	return obj;
1070    }
1071    if (PyUnicode_Check(obj)) {
1072	/* For a Unicode subtype that's not a Unicode object,
1073	   return a true Unicode object with the same data. */
1074	return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1075				     PyUnicode_GET_SIZE(obj));
1076    }
1077    PyErr_Format(PyExc_TypeError,
1078                 "Can't convert '%.100s' object to str implicitly",
1079                 Py_TYPE(obj)->tp_name);
1080    return NULL;
1081}
1082
1083PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1084				      const char *encoding,
1085				      const char *errors)
1086{
1087    const char *s = NULL;
1088    Py_ssize_t len;
1089    PyObject *v;
1090
1091    if (obj == NULL) {
1092	PyErr_BadInternalCall();
1093	return NULL;
1094    }
1095
1096    if (PyUnicode_Check(obj)) {
1097	PyErr_SetString(PyExc_TypeError,
1098			"decoding Unicode is not supported");
1099	return NULL;
1100	}
1101
1102    /* Coerce object */
1103    if (PyBytes_Check(obj)) {
1104	    s = PyBytes_AS_STRING(obj);
1105	    len = PyBytes_GET_SIZE(obj);
1106	    }
1107    else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1108	/* Overwrite the error message with something more useful in
1109	   case of a TypeError. */
1110	if (PyErr_ExceptionMatches(PyExc_TypeError))
1111	PyErr_Format(PyExc_TypeError,
1112			 "coercing to Unicode: need string or buffer, "
1113			 "%.80s found",
1114		     Py_TYPE(obj)->tp_name);
1115	goto onError;
1116    }
1117
1118    /* Convert to Unicode */
1119    if (len == 0) {
1120	Py_INCREF(unicode_empty);
1121	v = (PyObject *)unicode_empty;
1122    }
1123    else
1124	v = PyUnicode_Decode(s, len, encoding, errors);
1125
1126    return v;
1127
1128 onError:
1129    return NULL;
1130}
1131
1132PyObject *PyUnicode_Decode(const char *s,
1133			   Py_ssize_t size,
1134			   const char *encoding,
1135			   const char *errors)
1136{
1137    PyObject *buffer = NULL, *unicode;
1138    Py_buffer info;
1139    char lower[20];  /* Enough for any encoding name we recognize */
1140    char *l;
1141    const char *e;
1142
1143    if (encoding == NULL)
1144        encoding = PyUnicode_GetDefaultEncoding();
1145
1146    /* Convert encoding to lower case and replace '_' with '-' in order to
1147       catch e.g. UTF_8 */
1148    e = encoding;
1149    l = lower;
1150    while (*e && l < &lower[(sizeof lower) - 2]) {
1151        if (ISUPPER(*e)) {
1152            *l++ = TOLOWER(*e++);
1153        }
1154        else if (*e == '_') {
1155            *l++ = '-';
1156            e++;
1157        }
1158        else {
1159            *l++ = *e++;
1160        }
1161    }
1162    *l = '\0';
1163
1164    /* Shortcuts for common default encodings */
1165    if (strcmp(lower, "utf-8") == 0)
1166        return PyUnicode_DecodeUTF8(s, size, errors);
1167    else if ((strcmp(lower, "latin-1") == 0) ||
1168             (strcmp(lower, "iso-8859-1") == 0))
1169        return PyUnicode_DecodeLatin1(s, size, errors);
1170#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1171    else if (strcmp(lower, "mbcs") == 0)
1172        return PyUnicode_DecodeMBCS(s, size, errors);
1173#endif
1174    else if (strcmp(lower, "ascii") == 0)
1175        return PyUnicode_DecodeASCII(s, size, errors);
1176    else if (strcmp(lower, "utf-16") == 0)
1177        return PyUnicode_DecodeUTF16(s, size, errors, 0);
1178    else if (strcmp(lower, "utf-32") == 0)
1179        return PyUnicode_DecodeUTF32(s, size, errors, 0);
1180
1181    /* Decode via the codec registry */
1182    buffer = NULL;
1183    if (PyBuffer_FillInfo(&info, (void *)s, size, 1, PyBUF_SIMPLE) < 0)
1184        goto onError;
1185    buffer = PyMemoryView_FromMemory(&info);
1186    if (buffer == NULL)
1187        goto onError;
1188    unicode = PyCodec_Decode(buffer, encoding, errors);
1189    if (unicode == NULL)
1190        goto onError;
1191    if (!PyUnicode_Check(unicode)) {
1192        PyErr_Format(PyExc_TypeError,
1193                     "decoder did not return an unicode object (type=%.400s)",
1194                     Py_TYPE(unicode)->tp_name);
1195        Py_DECREF(unicode);
1196        goto onError;
1197    }
1198    Py_DECREF(buffer);
1199    return unicode;
1200
1201 onError:
1202    Py_XDECREF(buffer);
1203    return NULL;
1204}
1205
1206PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1207                                    const char *encoding,
1208                                    const char *errors)
1209{
1210    PyObject *v;
1211
1212    if (!PyUnicode_Check(unicode)) {
1213        PyErr_BadArgument();
1214        goto onError;
1215    }
1216
1217    if (encoding == NULL)
1218	encoding = PyUnicode_GetDefaultEncoding();
1219
1220    /* Decode via the codec registry */
1221    v = PyCodec_Decode(unicode, encoding, errors);
1222    if (v == NULL)
1223        goto onError;
1224    return v;
1225
1226 onError:
1227    return NULL;
1228}
1229
1230PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1231			   Py_ssize_t size,
1232			   const char *encoding,
1233			   const char *errors)
1234{
1235    PyObject *v, *unicode;
1236
1237    unicode = PyUnicode_FromUnicode(s, size);
1238    if (unicode == NULL)
1239	return NULL;
1240    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1241    Py_DECREF(unicode);
1242    return v;
1243}
1244
1245PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1246                                    const char *encoding,
1247                                    const char *errors)
1248{
1249    PyObject *v;
1250
1251    if (!PyUnicode_Check(unicode)) {
1252        PyErr_BadArgument();
1253        goto onError;
1254    }
1255
1256    if (encoding == NULL)
1257	encoding = PyUnicode_GetDefaultEncoding();
1258
1259    /* Encode via the codec registry */
1260    v = PyCodec_Encode(unicode, encoding, errors);
1261    if (v == NULL)
1262        goto onError;
1263    return v;
1264
1265 onError:
1266    return NULL;
1267}
1268
1269PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1270                                    const char *encoding,
1271                                    const char *errors)
1272{
1273    PyObject *v;
1274
1275    if (!PyUnicode_Check(unicode)) {
1276        PyErr_BadArgument();
1277        goto onError;
1278    }
1279
1280    if (encoding == NULL)
1281	encoding = PyUnicode_GetDefaultEncoding();
1282
1283    /* Shortcuts for common default encodings */
1284    if (errors == NULL) {
1285	if (strcmp(encoding, "utf-8") == 0)
1286	    return PyUnicode_AsUTF8String(unicode);
1287	else if (strcmp(encoding, "latin-1") == 0)
1288	    return PyUnicode_AsLatin1String(unicode);
1289#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1290	else if (strcmp(encoding, "mbcs") == 0)
1291	    return PyUnicode_AsMBCSString(unicode);
1292#endif
1293	else if (strcmp(encoding, "ascii") == 0)
1294	    return PyUnicode_AsASCIIString(unicode);
1295    }
1296
1297    /* Encode via the codec registry */
1298    v = PyCodec_Encode(unicode, encoding, errors);
1299    if (v == NULL)
1300        goto onError;
1301    assert(PyBytes_Check(v));
1302    return v;
1303
1304 onError:
1305    return NULL;
1306}
1307
1308PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1309					    const char *errors)
1310{
1311    PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1312    if (v)
1313        return v;
1314    if (errors != NULL)
1315        Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
1316    v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1317                             PyUnicode_GET_SIZE(unicode),
1318                             NULL);
1319    if (!v)
1320        return NULL;
1321    ((PyUnicodeObject *)unicode)->defenc = v;
1322    return v;
1323}
1324
1325PyObject*
1326PyUnicode_DecodeFSDefault(const char *s) {
1327    Py_ssize_t size = (Py_ssize_t)strlen(s);
1328    return PyUnicode_DecodeFSDefaultAndSize(s, size);
1329}
1330
1331PyObject*
1332PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1333{
1334    /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1335       can be undefined. If it is case, decode using UTF-8. The following assumes
1336       that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1337       bootstrapping process where the codecs aren't ready yet.
1338    */
1339    if (Py_FileSystemDefaultEncoding) {
1340#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1341        if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
1342            return PyUnicode_DecodeMBCS(s, size, "replace");
1343        }
1344#elif defined(__APPLE__)
1345        if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
1346            return PyUnicode_DecodeUTF8(s, size, "replace");
1347        }
1348#endif
1349        return PyUnicode_Decode(s, size,
1350                                Py_FileSystemDefaultEncoding,
1351                                "replace");
1352    }
1353    else {
1354        return PyUnicode_DecodeUTF8(s, size, "replace");
1355    }
1356}
1357
1358char*
1359PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
1360{
1361    PyObject *bytes;
1362    if (!PyUnicode_Check(unicode)) {
1363        PyErr_BadArgument();
1364        return NULL;
1365    }
1366    bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1367    if (bytes == NULL)
1368        return NULL;
1369    if (psize != NULL)
1370        *psize = PyBytes_GET_SIZE(bytes);
1371    return PyBytes_AS_STRING(bytes);
1372}
1373
1374char*
1375PyUnicode_AsString(PyObject *unicode)
1376{
1377    return PyUnicode_AsStringAndSize(unicode, NULL);
1378}
1379
1380Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1381{
1382    if (!PyUnicode_Check(unicode)) {
1383        PyErr_BadArgument();
1384        goto onError;
1385    }
1386    return PyUnicode_AS_UNICODE(unicode);
1387
1388 onError:
1389    return NULL;
1390}
1391
1392Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1393{
1394    if (!PyUnicode_Check(unicode)) {
1395        PyErr_BadArgument();
1396        goto onError;
1397    }
1398    return PyUnicode_GET_SIZE(unicode);
1399
1400 onError:
1401    return -1;
1402}
1403
1404const char *PyUnicode_GetDefaultEncoding(void)
1405{
1406    return unicode_default_encoding;
1407}
1408
1409int PyUnicode_SetDefaultEncoding(const char *encoding)
1410{
1411    if (strcmp(encoding, unicode_default_encoding) != 0) {
1412        PyErr_Format(PyExc_ValueError,
1413                     "Can only set default encoding to %s",
1414                     unicode_default_encoding);
1415        return -1;
1416    }
1417    return 0;
1418}
1419
1420/* error handling callback helper:
1421   build arguments, call the callback and check the arguments,
1422   if no exception occurred, copy the replacement to the output
1423   and adjust various state variables.
1424   return 0 on success, -1 on error
1425*/
1426
1427static
1428int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1429                 const char *encoding, const char *reason,
1430                 const char **input, const char **inend, Py_ssize_t *startinpos,
1431                 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1432                 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
1433{
1434    static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
1435
1436    PyObject *restuple = NULL;
1437    PyObject *repunicode = NULL;
1438    Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1439    Py_ssize_t insize;
1440    Py_ssize_t requiredsize;
1441    Py_ssize_t newpos;
1442    Py_UNICODE *repptr;
1443    PyObject *inputobj = NULL;
1444    Py_ssize_t repsize;
1445    int res = -1;
1446
1447    if (*errorHandler == NULL) {
1448	*errorHandler = PyCodec_LookupError(errors);
1449	if (*errorHandler == NULL)
1450	   goto onError;
1451    }
1452
1453    if (*exceptionObject == NULL) {
1454    	*exceptionObject = PyUnicodeDecodeError_Create(
1455	    encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
1456	if (*exceptionObject == NULL)
1457	   goto onError;
1458    }
1459    else {
1460	if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1461	    goto onError;
1462	if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1463	    goto onError;
1464	if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1465	    goto onError;
1466    }
1467
1468    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1469    if (restuple == NULL)
1470	goto onError;
1471    if (!PyTuple_Check(restuple)) {
1472	PyErr_Format(PyExc_TypeError, &argparse[4]);
1473	goto onError;
1474    }
1475    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1476	goto onError;
1477
1478    /* Copy back the bytes variables, which might have been modified by the
1479       callback */
1480    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1481    if (!inputobj)
1482        goto onError;
1483    if (!PyBytes_Check(inputobj)) {
1484	PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1485    }
1486    *input = PyBytes_AS_STRING(inputobj);
1487    insize = PyBytes_GET_SIZE(inputobj);
1488    *inend = *input + insize;
1489    /* we can DECREF safely, as the exception has another reference,
1490       so the object won't go away. */
1491    Py_DECREF(inputobj);
1492
1493    if (newpos<0)
1494	newpos = insize+newpos;
1495    if (newpos<0 || newpos>insize) {
1496	PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1497	goto onError;
1498    }
1499
1500    /* need more space? (at least enough for what we
1501       have+the replacement+the rest of the string (starting
1502       at the new input position), so we won't have to check space
1503       when there are no errors in the rest of the string) */
1504    repptr = PyUnicode_AS_UNICODE(repunicode);
1505    repsize = PyUnicode_GET_SIZE(repunicode);
1506    requiredsize = *outpos + repsize + insize-newpos;
1507    if (requiredsize > outsize) {
1508	if (requiredsize<2*outsize)
1509	    requiredsize = 2*outsize;
1510	if (PyUnicode_Resize(output, requiredsize) < 0)
1511	    goto onError;
1512	*outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1513    }
1514    *endinpos = newpos;
1515    *inptr = *input + newpos;
1516    Py_UNICODE_COPY(*outptr, repptr, repsize);
1517    *outptr += repsize;
1518    *outpos += repsize;
1519
1520    /* we made it! */
1521    res = 0;
1522
1523    onError:
1524    Py_XDECREF(restuple);
1525    return res;
1526}
1527
1528/* --- UTF-7 Codec -------------------------------------------------------- */
1529
1530/* see RFC2152 for details */
1531
/* Classification of the 128 ASCII code points for the UTF-7 codec,
   indexed by character value.  Consulted through the SPECIAL() macro. */
static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
	   0 - not special
	   1 - special
	   2 - whitespace (optional)
	   3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,  /* 0x00-0x0F: TAB, LF, CR are class 2 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 0x10-0x1F */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,  /* 0x20-0x2F: space .. '/' ('+' and '/' are class 1) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,  /* 0x30-0x3F: '0' .. '?' */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0x40-0x4F: '@' .. 'O' */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,  /* 0x50-0x5F: 'P' .. '_' (backslash is class 1) */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0x60-0x6F: '`' .. 'o' */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,  /* 0x70-0x7F: 'p' .. DEL ('~' and DEL are class 1) */

};
1550
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true  */

/* SPECIAL(c, encodeO, encodeWS): true when `c` cannot be written
   directly in UTF-7: it lies outside 1..127, or its utf7_special class
   is 1, or it is class 2 and encodeWS is set, or class 3 and encodeO is
   set.  `c` is evaluated several times. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 alphabet character for the low 6 bits of `n`. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* B64CHAR(c): true when `c` is alphanumeric, '+' or '/'. */
#define B64CHAR(c) \
    (ISALNUM(c) || (c) == '+' || (c) == '/')
/* UB64(c): inverse of B64() -- 6-bit value of the base64 character `c`.
   The arithmetic relies on ASCII code values ('a' - 71 == 26,
   'A' - 65 == 0, '0' + 4 == 52) and is only meaningful when B64CHAR(c)
   holds. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least 6 of the `bits` pending bits in
   `ch` remain, write the topmost 6 as one base64 character to *out++ and
   reduce `bits` by 6.  `out` and `bits` are modified in place.  Expands
   to a bare while statement, not a single expression. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* DECODE(out, ch, bits, surrogate): while at least 16 of the `bits`
   pending bits in `ch` remain, take the topmost 16 as one code unit and
   reduce `bits` by 16.  The unit is written to *out++ unless it falls in
   0xDC00..0xDFFF, in which case `surrogate` is set and control jumps to
   the error path; the unit following such an error is dropped and
   `surrogate` cleared.
   The expansion assigns to a variable named `errmsg` and jumps to a
   label named `utf7Error`, both of which must exist at the use site.
   NOTE(review): the range tested (0xDC00..0xDFFF) is the second half of
   a surrogate pair, while the inline comments speak of the high
   surrogate -- confirm which half was intended. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
1593
1594PyObject *PyUnicode_DecodeUTF7(const char *s,
1595			       Py_ssize_t size,
1596			       const char *errors)
1597{
1598    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1599}
1600
/* Decode `size` bytes of UTF-7 at `s` into a new Unicode object.

   The decoder alternates between direct mode (bytes are copied as
   characters) and shift mode (entered at '+', bytes are base64 digits
   accumulated in charsleft/bitsleft and emitted 16 bits at a time by the
   DECODE macro).  Malformed input is routed through
   unicode_decode_call_errorhandler() using the handler named by
   `errors`.

   If `consumed` is NULL, input that ends inside a shift sequence is
   reported as an "unterminated shift sequence" error.  If `consumed` is
   not NULL, no such error is raised and *consumed receives the number of
   bytes processed: the saved start position when decoding stopped inside
   a shift sequence, otherwise the full distance advanced.

   Returns a new reference, or NULL with an exception set. */
PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
			       Py_ssize_t size,
			       const char *errors,
			       Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const char *errmsg = "";
    int inShift = 0;              /* non-zero while inside a '+...' sequence */
    unsigned int bitsleft = 0;    /* number of undecoded bits held in charsleft */
    unsigned long charsleft = 0;  /* accumulator for base64-decoded bits */
    int surrogate = 0;            /* state flag maintained by DECODE */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* The output is allocated with one code unit per input byte. */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    p = unicode->str;
    e = s + size;

    while (s < e) {
        Py_UNICODE ch;
        /* Re-entry point used after the end-of-input error handler has
           moved `s` back inside the input. */
        restart:
        ch = *s;

        if (inShift) {
            if ((ch == '-') || !B64CHAR(ch)) {
                /* Any non-base64 byte (or an explicit '-') ends the
                   shift sequence. */
                inShift = 0;
                s++;

                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
                if (bitsleft >= 6) {
                    /* The shift sequence has a partial character in it. If
                       bitsleft < 6 then we could just classify it as padding
                       but that is not the case here */

                    errmsg = "partial character in shift sequence";
                    goto utf7Error;
                }
                /* According to RFC2152 the remaining bits should be zero. We
                   choose to signal an error/insert a replacement character
                   here so indicate the potential of a misencoded character. */

                /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
                if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
                    errmsg = "non-zero padding bits in shift sequence";
                    goto utf7Error;
                }

                if (ch == '-') {
                    /* The terminating '-' is absorbed; if the next byte
                       is also '-', a literal '-' is emitted and shift
                       mode is re-entered. */
                    if ((s < e) && (*(s) == '-')) {
                        *p++ = '-';
                        inShift = 1;
                    }
                } else if (SPECIAL(ch,0,0)) {
                    errmsg = "unexpected special character";
	                goto utf7Error;
                } else  {
                    /* The terminating character is itself output. */
                    *p++ = ch;
                }
            } else {
                /* Base64 digit: append its 6 bits and emit any complete
                   16-bit units. */
                charsleft = (charsleft << 6) | UB64(ch);
                bitsleft += 6;
                s++;
                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
            }
        }
        else if ( ch == '+' ) {
            /* Remember where the sequence starts for error reporting and
               for the `consumed` result. */
            startinpos = s-starts;
            s++;
            if (s < e && *s == '-') {
                /* "+-" encodes a literal '+'. */
                s++;
                *p++ = '+';
            } else
            {
                inShift = 1;
                bitsleft = 0;
            }
        }
        else if (SPECIAL(ch,0,0)) {
            startinpos = s-starts;
            errmsg = "unexpected special character";
            s++;
            goto utf7Error;
        }
        else {
            /* Directly encoded character. */
            *p++ = ch;
            s++;
        }
        continue;
    utf7Error:
        /* The handler may resize `unicode` and reposition s/e/p. */
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler(
             errors, &errorHandler,
             "utf7", errmsg,
             &starts, &e, &startinpos, &endinpos, &exc, &s,
             (PyObject **)&unicode, &outpos, &p))
        goto onError;
    }

    /* Input exhausted while still in shift mode: an error only when the
       caller is not decoding incrementally. */
    if (inShift && !consumed) {
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = size;
        if (unicode_decode_call_errorhandler(
             errors, &errorHandler,
             "utf7", "unterminated shift sequence",
             &starts, &e, &startinpos, &endinpos, &exc, &s,
             (PyObject **)&unicode, &outpos, &p))
            goto onError;
        /* The handler may have pointed s back into the input. */
        if (s < e)
           goto restart;
    }
    if (consumed) {
        if(inShift)
            *consumed = startinpos;
        else
            *consumed = s-starts;
    }

    /* Trim the over-allocated buffer to the number of units written. */
    if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
1746
1747
1748PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1749                   Py_ssize_t size,
1750                   int encodeSetO,
1751                   int encodeWhiteSpace,
1752                   const char *errors)
1753{
1754    PyObject *v, *result;
1755    /* It might be possible to tighten this worst case */
1756    Py_ssize_t cbAllocated = 5 * size;
1757    int inShift = 0;
1758    Py_ssize_t i = 0;
1759    unsigned int bitsleft = 0;
1760    unsigned long charsleft = 0;
1761    char * out;
1762    char * start;
1763
1764    if (size == 0)
1765       return PyBytes_FromStringAndSize(NULL, 0);
1766
1767    v = PyByteArray_FromStringAndSize(NULL, cbAllocated);
1768    if (v == NULL)
1769        return NULL;
1770
1771    start = out = PyByteArray_AS_STRING(v);
1772    for (;i < size; ++i) {
1773        Py_UNICODE ch = s[i];
1774
1775        if (!inShift) {
1776            if (ch == '+') {
1777                *out++ = '+';
1778                *out++ = '-';
1779            } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1780                charsleft = ch;
1781                bitsleft = 16;
1782                *out++ = '+';
1783                /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1784                inShift = bitsleft > 0;
1785            } else {
1786                *out++ = (char) ch;
1787            }
1788        } else {
1789            if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1790                *out++ = B64(charsleft << (6-bitsleft));
1791                charsleft = 0;
1792                bitsleft = 0;
1793                /* Characters not in the BASE64 set implicitly unshift the sequence
1794                   so no '-' is required, except if the character is itself a '-' */
1795                if (B64CHAR(ch) || ch == '-') {
1796                    *out++ = '-';
1797                }
1798                inShift = 0;
1799                *out++ = (char) ch;
1800            } else {
1801                bitsleft += 16;
1802                charsleft = (charsleft << 16) | ch;
1803                /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1804
1805                /* If the next character is special then we dont' need to terminate
1806                   the shift sequence. If the next character is not a BASE64 character
1807                   or '-' then the shift sequence will be terminated implicitly and we
1808                   don't have to insert a '-'. */
1809
1810                if (bitsleft == 0) {
1811                    if (i + 1 < size) {
1812                        Py_UNICODE ch2 = s[i+1];
1813
1814                        if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
1815
1816                        } else if (B64CHAR(ch2) || ch2 == '-') {
1817                            *out++ = '-';
1818                            inShift = 0;
1819                        } else {
1820                            inShift = 0;
1821                        }
1822
1823                    }
1824                    else {
1825                        *out++ = '-';
1826                        inShift = 0;
1827                    }
1828                }
1829            }
1830        }
1831    }
1832    if (bitsleft) {
1833        *out++= B64(charsleft << (6-bitsleft) );
1834        *out++ = '-';
1835    }
1836
1837    result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), out - start);
1838    Py_DECREF(v);
1839    return result;
1840}
1841
1842#undef SPECIAL
1843#undef B64
1844#undef B64CHAR
1845#undef UB64
1846#undef ENCODE
1847#undef DECODE
1848
1849/* --- UTF-8 Codec -------------------------------------------------------- */
1850
/* Sequence length implied by each possible first byte of a UTF-8
   sequence, indexed by the byte value. */
static
char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 0x00-0x0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 0x10-0x1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 0x20-0x2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 0x30-0x3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 0x40-0x4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 0x50-0x5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 0x60-0x6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0x80-0x8F: not valid as a first byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0x90-0x9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0xA0-0xAF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0xB0-0xBF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  /* 0xC0-0xCF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  /* 0xD0-0xDF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  /* 0xE0-0xEF */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0   /* 0xF0-0xFF: 0xFE and 0xFF are illegal */
};
1872
1873PyObject *PyUnicode_DecodeUTF8(const char *s,
1874			       Py_ssize_t size,
1875			       const char *errors)
1876{
1877    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1878}
1879
/* Decode `size` bytes of UTF-8 at `s` into a new Unicode object.

   Each sequence's length is taken from utf8_code_length[] for its first
   byte; continuation bytes and minimum-value ("overlong") constraints
   are then checked per length.  Malformed input is routed through
   unicode_decode_call_errorhandler() using the handler named by
   `errors`.

   If `consumed` is NULL, a sequence cut off by the end of the input is
   reported as "unexpected end of data".  If `consumed` is not NULL,
   decoding stops in front of such a sequence instead and *consumed
   receives the number of bytes actually decoded.

   Returns a new reference, or NULL with an exception set. */
PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
			                Py_ssize_t size,
			                const char *errors,
			                Py_ssize_t *consumed)
{
    const char *starts = s;
    int n;                      /* length of the current sequence in bytes */
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    /* Unpack UTF-8 encoded data */
    p = unicode->str;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        /* Fast path: a single ASCII byte. */
        if (ch < 0x80) {
            *p++ = (Py_UNICODE)ch;
            s++;
            continue;
        }

        n = utf8_code_length[ch];

        /* Sequence would run past the end of the input. */
        if (s + n > e) {
	    if (consumed)
		break;
	    else {
		errmsg = "unexpected end of data";
		startinpos = s-starts;
		endinpos = size;
		goto utf8Error;
	    }
	}

        switch (n) {

        case 0:
            /* Byte that cannot start a sequence. */
            errmsg = "unexpected code byte";
	    startinpos = s-starts;
	    endinpos = startinpos+1;
	    goto utf8Error;

        case 1:
            /* The table assigns length 1 only to bytes below 0x80, which
               the fast path above has already handled. */
            errmsg = "internal error";
	    startinpos = s-starts;
	    endinpos = startinpos+1;
	    goto utf8Error;

        case 2:
            if ((s[1] & 0xc0) != 0x80) {
                errmsg = "invalid data";
		startinpos = s-starts;
		endinpos = startinpos+2;
		goto utf8Error;
	    }
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            /* Reject values that fit in a shorter encoding. */
            if (ch < 0x80) {
		startinpos = s-starts;
		endinpos = startinpos+2;
                errmsg = "illegal encoding";
		goto utf8Error;
	    }
	    else
		*p++ = (Py_UNICODE)ch;
            break;

        case 3:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80) {
                errmsg = "invalid data";
		startinpos = s-starts;
		endinpos = startinpos+3;
		goto utf8Error;
	    }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            /* Reject values that fit in a shorter encoding. */
            if (ch < 0x0800) {
		/* Note: UTF-8 encodings of surrogates are considered
		   legal UTF-8 sequences;

		   XXX For wide builds (UCS-4) we should probably try
		       to recombine the surrogates into a single code
		       unit.
		*/
                errmsg = "illegal encoding";
		startinpos = s-starts;
		endinpos = startinpos+3;
		goto utf8Error;
	    }
	    else
		*p++ = (Py_UNICODE)ch;
            break;

        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80) {
                errmsg = "invalid data";
		startinpos = s-starts;
		endinpos = startinpos+4;
		goto utf8Error;
	    }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            /* validate and convert to UTF-16 */
            if ((ch < 0x10000)        /* minimum value allowed for 4
					 byte encoding */
                || (ch > 0x10ffff))   /* maximum value allowed for
					 UTF-16 */
	    {
                errmsg = "illegal encoding";
		startinpos = s-starts;
		endinpos = startinpos+4;
		goto utf8Error;
	    }
#ifdef Py_UNICODE_WIDE
	    *p++ = (Py_UNICODE)ch;
#else
            /*  compute and append the two surrogates: */

            /*  translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /*  high surrogate = top 10 bits added to D800 */
            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));

            /*  low surrogate = bottom 10 bits added to DC00 */
            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
#endif
            break;

        default:
            /* Other sizes are only needed for UCS-4 */
            errmsg = "unsupported Unicode code range";
	    startinpos = s-starts;
	    endinpos = startinpos+n;
	    goto utf8Error;
        }
        /* Sequence accepted: step over it. */
        s += n;
	continue;

    utf8Error:
    /* Still inside the while loop.  The handler may resize `unicode`
       and reposition s/e/p before decoding resumes. */
    outpos = p-PyUnicode_AS_UNICODE(unicode);
    if (unicode_decode_call_errorhandler(
	     errors, &errorHandler,
	     "utf8", errmsg,
	     &starts, &e, &startinpos, &endinpos, &exc, &s,
	     (PyObject **)&unicode, &outpos, &p))
	goto onError;
    }
    if (consumed)
	*consumed = s-starts;

    /* Adjust length */
    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
2066
2067/* Allocation strategy:  if the string is short, convert into a stack buffer
2068   and allocate exactly as much space needed at the end.  Else allocate the
2069   maximum possible needed (4 result bytes per Unicode character), and return
2070   the excess memory at the end.
2071*/
2072PyObject *
2073PyUnicode_EncodeUTF8(const Py_UNICODE *s,
2074		     Py_ssize_t size,
2075		     const char *errors)
2076{
2077#define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
2078
2079    Py_ssize_t i;                /* index into s of next input byte */
2080    PyObject *result;            /* result string object */
2081    char *p;                     /* next free byte in output buffer */
2082    Py_ssize_t nallocated;      /* number of result bytes allocated */
2083    Py_ssize_t nneeded;            /* number of result bytes needed */
2084    char stackbuf[MAX_SHORT_UNICHARS * 4];
2085
2086    assert(s != NULL);
2087    assert(size >= 0);
2088
2089    if (size <= MAX_SHORT_UNICHARS) {
2090        /* Write into the stack buffer; nallocated can't overflow.
2091         * At the end, we'll allocate exactly as much heap space as it
2092         * turns out we need.
2093         */
2094        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2095        result = NULL;   /* will allocate after we're done */
2096        p = stackbuf;
2097    }
2098    else {
2099        /* Overallocate on the heap, and give the excess back at the end. */
2100        nallocated = size * 4;
2101        if (nallocated / 4 != size)  /* overflow! */
2102            return PyErr_NoMemory();
2103        result = PyBytes_FromStringAndSize(NULL, nallocated);
2104        if (result == NULL)
2105            return NULL;
2106        p = PyBytes_AS_STRING(result);
2107    }
2108
2109    for (i = 0; i < size;) {
2110        Py_UCS4 ch = s[i++];
2111
2112        if (ch < 0x80)
2113            /* Encode ASCII */
2114            *p++ = (char) ch;
2115
2116        else if (ch < 0x0800) {
2117            /* Encode Latin-1 */
2118            *p++ = (char)(0xc0 | (ch >> 6));
2119            *p++ = (char)(0x80 | (ch & 0x3f));
2120        }
2121        else {
2122            /* Encode UCS2 Unicode ordinals */
2123            if (ch < 0x10000) {
2124                /* Special case: check for high surrogate */
2125                if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2126                    Py_UCS4 ch2 = s[i];
2127                    /* Check for low surrogate and combine the two to
2128                       form a UCS4 value */
2129                    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2130                        ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2131                        i++;
2132                        goto encodeUCS4;
2133                    }
2134                    /* Fall through: handles isolated high surrogates */
2135                }
2136                *p++ = (char)(0xe0 | (ch >> 12));
2137                *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2138                *p++ = (char)(0x80 | (ch & 0x3f));
2139                continue;
2140    	    }
2141encodeUCS4:
2142            /* Encode UCS4 Unicode ordinals */
2143            *p++ = (char)(0xf0 | (ch >> 18));
2144            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2145            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2146            *p++ = (char)(0x80 | (ch & 0x3f));
2147        }
2148    }
2149
2150    if (result == NULL) {
2151        /* This was stack allocated. */
2152        nneeded = p - stackbuf;
2153        assert(nneeded <= nallocated);
2154        result = PyBytes_FromStringAndSize(stackbuf, nneeded);
2155    }
2156    else {
2157        /* Cut back to size actually needed. */
2158        nneeded = p - PyBytes_AS_STRING(result);
2159        assert(nneeded <= nallocated);
2160        _PyBytes_Resize(&result, nneeded);
2161    }
2162    return result;
2163
2164#undef MAX_SHORT_UNICHARS
2165}
2166
2167PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2168{
2169    if (!PyUnicode_Check(unicode)) {
2170        PyErr_BadArgument();
2171        return NULL;
2172    }
2173    return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2174				PyUnicode_GET_SIZE(unicode),
2175				NULL);
2176}
2177
2178/* --- UTF-32 Codec ------------------------------------------------------- */
2179
2180PyObject *
2181PyUnicode_DecodeUTF32(const char *s,
2182		      Py_ssize_t size,
2183		      const char *errors,
2184		      int *byteorder)
2185{
2186    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2187}
2188
2189PyObject *
2190PyUnicode_DecodeUTF32Stateful(const char *s,
2191			      Py_ssize_t size,
2192			      const char *errors,
2193			      int *byteorder,
2194			      Py_ssize_t *consumed)
2195{
2196    const char *starts = s;
2197    Py_ssize_t startinpos;
2198    Py_ssize_t endinpos;
2199    Py_ssize_t outpos;
2200    PyUnicodeObject *unicode;
2201    Py_UNICODE *p;
2202#ifndef Py_UNICODE_WIDE
2203    int i, pairs;
2204#else
2205    const int pairs = 0;
2206#endif
2207    const unsigned char *q, *e;
2208    int bo = 0;       /* assume native ordering by default */
2209    const char *errmsg = "";
2210    /* Offsets from q for retrieving bytes in the right order. */
2211#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2212    int iorder[] = {0, 1, 2, 3};
2213#else
2214    int iorder[] = {3, 2, 1, 0};
2215#endif
2216    PyObject *errorHandler = NULL;
2217    PyObject *exc = NULL;
2218    /* On narrow builds we split characters outside the BMP into two
2219       codepoints => count how much extra space we need. */
2220#ifndef Py_UNICODE_WIDE
2221    for (i = pairs = 0; i < size/4; i++)
2222	if (((Py_UCS4 *)s)[i] >= 0x10000)
2223	    pairs++;
2224#endif
2225
2226    /* This might be one to much, because of a BOM */
2227    unicode = _PyUnicode_New((size+3)/4+pairs);
2228    if (!unicode)
2229        return NULL;
2230    if (size == 0)
2231        return (PyObject *)unicode;
2232
2233    /* Unpack UTF-32 encoded data */
2234    p = unicode->str;
2235    q = (unsigned char *)s;
2236    e = q + size;
2237
2238    if (byteorder)
2239        bo = *byteorder;
2240
2241    /* Check for BOM marks (U+FEFF) in the input and adjust current
2242       byte order setting accordingly. In native mode, the leading BOM
2243       mark is skipped, in all other modes, it is copied to the output
2244       stream as-is (giving a ZWNBSP character). */
2245    if (bo == 0) {
2246        if (size >= 4) {
2247            const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2248                                (q[iorder[1]] << 8) | q[iorder[0]];
2249#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2250	    if (bom == 0x0000FEFF) {
2251		q += 4;
2252		bo = -1;
2253	    }
2254	    else if (bom == 0xFFFE0000) {
2255		q += 4;
2256		bo = 1;
2257	    }
2258#else
2259	    if (bom == 0x0000FEFF) {
2260		q += 4;
2261		bo = 1;
2262	    }
2263	    else if (bom == 0xFFFE0000) {
2264		q += 4;
2265		bo = -1;
2266	    }
2267#endif
2268	}
2269    }
2270
2271    if (bo == -1) {
2272        /* force LE */
2273        iorder[0] = 0;
2274        iorder[1] = 1;
2275        iorder[2] = 2;
2276        iorder[3] = 3;
2277    }
2278    else if (bo == 1) {
2279        /* force BE */
2280        iorder[0] = 3;
2281        iorder[1] = 2;
2282        iorder[2] = 1;
2283        iorder[3] = 0;
2284    }
2285
2286    while (q < e) {
2287	Py_UCS4 ch;
2288	/* remaining bytes at the end? (size should be divisible by 4) */
2289	if (e-q<4) {
2290	    if (consumed)
2291		break;
2292	    errmsg = "truncated data";
2293	    startinpos = ((const char *)q)-starts;
2294	    endinpos = ((const char *)e)-starts;
2295	    goto utf32Error;
2296	    /* The remaining input chars are ignored if the callback
2297	       chooses to skip the input */
2298	}
2299	ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2300	     (q[iorder[1]] << 8) | q[iorder[0]];
2301
2302	if (ch >= 0x110000)
2303	{
2304	    errmsg = "codepoint not in range(0x110000)";
2305	    startinpos = ((const char *)q)-starts;
2306	    endinpos = startinpos+4;
2307	    goto utf32Error;
2308	}
2309#ifndef Py_UNICODE_WIDE
2310	if (ch >= 0x10000)
2311	{
2312	    *p++ = 0xD800 | ((ch-0x10000) >> 10);
2313	    *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2314	}
2315	else
2316#endif
2317	    *p++ = ch;
2318	q += 4;
2319	continue;
2320    utf32Error:
2321	outpos = p-PyUnicode_AS_UNICODE(unicode);
2322	if (unicode_decode_call_errorhandler(
2323	         errors, &errorHandler,
2324	         "utf32", errmsg,
2325	         &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2326	         (PyObject **)&unicode, &outpos, &p))
2327	    goto onError;
2328    }
2329
2330    if (byteorder)
2331        *byteorder = bo;
2332
2333    if (consumed)
2334	*consumed = (const char *)q-starts;
2335
2336    /* Adjust length */
2337    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2338        goto onError;
2339
2340    Py_XDECREF(errorHandler);
2341    Py_XDECREF(exc);
2342    return (PyObject *)unicode;
2343
2344onError:
2345    Py_DECREF(unicode);
2346    Py_XDECREF(errorHandler);
2347    Py_XDECREF(exc);
2348    return NULL;
2349}
2350
2351PyObject *
2352PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2353		      Py_ssize_t size,
2354		      const char *errors,
2355		      int byteorder)
2356{
2357    PyObject *v, *result;
2358    unsigned char *p;
2359#ifndef Py_UNICODE_WIDE
2360    int i, pairs;
2361#else
2362    const int pairs = 0;
2363#endif
2364    /* Offsets from p for storing byte pairs in the right order. */
2365#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2366    int iorder[] = {0, 1, 2, 3};
2367#else
2368    int iorder[] = {3, 2, 1, 0};
2369#endif
2370
2371#define STORECHAR(CH)                       \
2372    do {                                    \
2373        p[iorder[3]] = ((CH) >> 24) & 0xff; \
2374        p[iorder[2]] = ((CH) >> 16) & 0xff; \
2375        p[iorder[1]] = ((CH) >> 8) & 0xff;  \
2376        p[iorder[0]] = (CH) & 0xff;         \
2377        p += 4;                             \
2378    } while(0)
2379
2380    /* In narrow builds we can output surrogate pairs as one codepoint,
2381       so we need less space. */
2382#ifndef Py_UNICODE_WIDE
2383    for (i = pairs = 0; i < size-1; i++)
2384	if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2385	    0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2386	    pairs++;
2387#endif
2388    v = PyByteArray_FromStringAndSize(NULL,
2389		  4 * (size - pairs + (byteorder == 0)));
2390    if (v == NULL)
2391        return NULL;
2392
2393    p = (unsigned char *)PyByteArray_AS_STRING(v);
2394    if (byteorder == 0)
2395	STORECHAR(0xFEFF);
2396    if (size == 0)
2397        goto done;
2398
2399    if (byteorder == -1) {
2400        /* force LE */
2401        iorder[0] = 0;
2402        iorder[1] = 1;
2403        iorder[2] = 2;
2404        iorder[3] = 3;
2405    }
2406    else if (byteorder == 1) {
2407        /* force BE */
2408        iorder[0] = 3;
2409        iorder[1] = 2;
2410        iorder[2] = 1;
2411        iorder[3] = 0;
2412    }
2413
2414    while (size-- > 0) {
2415	Py_UCS4 ch = *s++;
2416#ifndef Py_UNICODE_WIDE
2417	if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2418	    Py_UCS4 ch2 = *s;
2419	    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2420		ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2421		s++;
2422		size--;
2423	    }
2424	}
2425#endif
2426        STORECHAR(ch);
2427    }
2428
2429  done:
2430    result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2431    Py_DECREF(v);
2432    return result;
2433#undef STORECHAR
2434}
2435
2436PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2437{
2438    if (!PyUnicode_Check(unicode)) {
2439        PyErr_BadArgument();
2440        return NULL;
2441    }
2442    return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2443				 PyUnicode_GET_SIZE(unicode),
2444				 NULL,
2445				 0);
2446}
2447
2448/* --- UTF-16 Codec ------------------------------------------------------- */
2449
2450PyObject *
2451PyUnicode_DecodeUTF16(const char *s,
2452		      Py_ssize_t size,
2453		      const char *errors,
2454		      int *byteorder)
2455{
2456    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2457}
2458
2459PyObject *
2460PyUnicode_DecodeUTF16Stateful(const char *s,
2461			      Py_ssize_t size,
2462			      const char *errors,
2463			      int *byteorder,
2464			      Py_ssize_t *consumed)
2465{
2466    const char *starts = s;
2467    Py_ssize_t startinpos;
2468    Py_ssize_t endinpos;
2469    Py_ssize_t outpos;
2470    PyUnicodeObject *unicode;
2471    Py_UNICODE *p;
2472    const unsigned char *q, *e;
2473    int bo = 0;       /* assume native ordering by default */
2474    const char *errmsg = "";
2475    /* Offsets from q for retrieving byte pairs in the right order. */
2476#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2477    int ihi = 1, ilo = 0;
2478#else
2479    int ihi = 0, ilo = 1;
2480#endif
2481    PyObject *errorHandler = NULL;
2482    PyObject *exc = NULL;
2483
2484    /* Note: size will always be longer than the resulting Unicode
2485       character count */
2486    unicode = _PyUnicode_New(size);
2487    if (!unicode)
2488        return NULL;
2489    if (size == 0)
2490        return (PyObject *)unicode;
2491
2492    /* Unpack UTF-16 encoded data */
2493    p = unicode->str;
2494    q = (unsigned char *)s;
2495    e = q + size;
2496
2497    if (byteorder)
2498        bo = *byteorder;
2499
2500    /* Check for BOM marks (U+FEFF) in the input and adjust current
2501       byte order setting accordingly. In native mode, the leading BOM
2502       mark is skipped, in all other modes, it is copied to the output
2503       stream as-is (giving a ZWNBSP character). */
2504    if (bo == 0) {
2505        if (size >= 2) {
2506            const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
2507#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2508	    if (bom == 0xFEFF) {
2509		q += 2;
2510		bo = -1;
2511	    }
2512	    else if (bom == 0xFFFE) {
2513		q += 2;
2514		bo = 1;
2515	    }
2516#else
2517	    if (bom == 0xFEFF) {
2518		q += 2;
2519		bo = 1;
2520	    }
2521	    else if (bom == 0xFFFE) {
2522		q += 2;
2523		bo = -1;
2524	    }
2525#endif
2526	}
2527    }
2528
2529    if (bo == -1) {
2530        /* force LE */
2531        ihi = 1;
2532        ilo = 0;
2533    }
2534    else if (bo == 1) {
2535        /* force BE */
2536        ihi = 0;
2537        ilo = 1;
2538    }
2539
2540    while (q < e) {
2541	Py_UNICODE ch;
2542	/* remaining bytes at the end? (size should be even) */
2543	if (e-q<2) {
2544	    if (consumed)
2545		break;
2546	    errmsg = "truncated data";
2547	    startinpos = ((const char *)q)-starts;
2548	    endinpos = ((const char *)e)-starts;
2549	    goto utf16Error;
2550	    /* The remaining input chars are ignored if the callback
2551	       chooses to skip the input */
2552	}
2553	ch = (q[ihi] << 8) | q[ilo];
2554
2555	q += 2;
2556
2557	if (ch < 0xD800 || ch > 0xDFFF) {
2558	    *p++ = ch;
2559	    continue;
2560	}
2561
2562	/* UTF-16 code pair: */
2563	if (q >= e) {
2564	    errmsg = "unexpected end of data";
2565	    startinpos = (((const char *)q)-2)-starts;
2566	    endinpos = ((const char *)e)-starts;
2567	    goto utf16Error;
2568	}
2569	if (0xD800 <= ch && ch <= 0xDBFF) {
2570	    Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2571	    q += 2;
2572	    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2573#ifndef Py_UNICODE_WIDE
2574		*p++ = ch;
2575		*p++ = ch2;
2576#else
2577		*p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2578#endif
2579		continue;
2580	    }
2581	    else {
2582                errmsg = "illegal UTF-16 surrogate";
2583		startinpos = (((const char *)q)-4)-starts;
2584		endinpos = startinpos+2;
2585		goto utf16Error;
2586	    }
2587
2588	}
2589	errmsg = "illegal encoding";
2590	startinpos = (((const char *)q)-2)-starts;
2591	endinpos = startinpos+2;
2592	/* Fall through to report the error */
2593
2594    utf16Error:
2595	outpos = p-PyUnicode_AS_UNICODE(unicode);
2596	if (unicode_decode_call_errorhandler(
2597	         errors, &errorHandler,
2598	         "utf16", errmsg,
2599	         &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2600	         (PyObject **)&unicode, &outpos, &p))
2601	    goto onError;
2602    }
2603
2604    if (byteorder)
2605        *byteorder = bo;
2606
2607    if (consumed)
2608	*consumed = (const char *)q-starts;
2609
2610    /* Adjust length */
2611    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2612        goto onError;
2613
2614    Py_XDECREF(errorHandler);
2615    Py_XDECREF(exc);
2616    return (PyObject *)unicode;
2617
2618onError:
2619    Py_DECREF(unicode);
2620    Py_XDECREF(errorHandler);
2621    Py_XDECREF(exc);
2622    return NULL;
2623}
2624
2625PyObject *
2626PyUnicode_EncodeUTF16(const Py_UNICODE *s,
2627		      Py_ssize_t size,
2628		      const char *errors,
2629		      int byteorder)
2630{
2631    PyObject *v, *result;
2632    unsigned char *p;
2633#ifdef Py_UNICODE_WIDE
2634    int i, pairs;
2635#else
2636    const int pairs = 0;
2637#endif
2638    /* Offsets from p for storing byte pairs in the right order. */
2639#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2640    int ihi = 1, ilo = 0;
2641#else
2642    int ihi = 0, ilo = 1;
2643#endif
2644
2645#define STORECHAR(CH)                   \
2646    do {                                \
2647        p[ihi] = ((CH) >> 8) & 0xff;    \
2648        p[ilo] = (CH) & 0xff;           \
2649        p += 2;                         \
2650    } while(0)
2651
2652#ifdef Py_UNICODE_WIDE
2653    for (i = pairs = 0; i < size; i++)
2654	if (s[i] >= 0x10000)
2655	    pairs++;
2656#endif
2657    v = PyByteArray_FromStringAndSize(NULL,
2658		  2 * (size + pairs + (byteorder == 0)));
2659    if (v == NULL)
2660        return NULL;
2661
2662    p = (unsigned char *)PyByteArray_AS_STRING(v);
2663    if (byteorder == 0)
2664	STORECHAR(0xFEFF);
2665    if (size == 0)
2666        goto done;
2667
2668    if (byteorder == -1) {
2669        /* force LE */
2670        ihi = 1;
2671        ilo = 0;
2672    }
2673    else if (byteorder == 1) {
2674        /* force BE */
2675        ihi = 0;
2676        ilo = 1;
2677    }
2678
2679    while (size-- > 0) {
2680	Py_UNICODE ch = *s++;
2681	Py_UNICODE ch2 = 0;
2682#ifdef Py_UNICODE_WIDE
2683	if (ch >= 0x10000) {
2684	    ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2685	    ch  = 0xD800 | ((ch-0x10000) >> 10);
2686	}
2687#endif
2688        STORECHAR(ch);
2689        if (ch2)
2690            STORECHAR(ch2);
2691    }
2692
2693  done:
2694    result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2695    Py_DECREF(v);
2696    return result;
2697#undef STORECHAR
2698}
2699
2700PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2701{
2702    if (!PyUnicode_Check(unicode)) {
2703        PyErr_BadArgument();
2704        return NULL;
2705    }
2706    return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2707				 PyUnicode_GET_SIZE(unicode),
2708				 NULL,
2709				 0);
2710}
2711
2712/* --- Unicode Escape Codec ----------------------------------------------- */
2713
2714static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
2715
2716PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
2717					Py_ssize_t size,
2718					const char *errors)
2719{
2720    const char *starts = s;
2721    Py_ssize_t startinpos;
2722    Py_ssize_t endinpos;
2723    Py_ssize_t outpos;
2724    int i;
2725    PyUnicodeObject *v;
2726    Py_UNICODE *p;
2727    const char *end;
2728    char* message;
2729    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
2730    PyObject *errorHandler = NULL;
2731    PyObject *exc = NULL;
2732
2733    /* Escaped strings will always be longer than the resulting
2734       Unicode string, so we start with size here and then reduce the
2735       length after conversion to the true value.
2736       (but if the error callback returns a long replacement string
2737       we'll have to allocate more space) */
2738    v = _PyUnicode_New(size);
2739    if (v == NULL)
2740        goto onError;
2741    if (size == 0)
2742        return (PyObject *)v;
2743
2744    p = PyUnicode_AS_UNICODE(v);
2745    end = s + size;
2746
2747    while (s < end) {
2748        unsigned char c;
2749        Py_UNICODE x;
2750        int digits;
2751
2752        /* Non-escape characters are interpreted as Unicode ordinals */
2753        if (*s != '\\') {
2754            *p++ = (unsigned char) *s++;
2755            continue;
2756        }
2757
2758        startinpos = s-starts;
2759        /* \ - Escapes */
2760        s++;
2761        c = *s++;
2762        if (s > end)
2763            c = '\0'; /* Invalid after \ */
2764        switch (c) {
2765
2766        /* \x escapes */
2767        case '\n': break;
2768        case '\\': *p++ = '\\'; break;
2769        case '\'': *p++ = '\''; break;
2770        case '\"': *p++ = '\"'; break;
2771        case 'b': *p++ = '\b'; break;
2772        case 'f': *p++ = '\014'; break; /* FF */
2773        case 't': *p++ = '\t'; break;
2774        case 'n': *p++ = '\n'; break;
2775        case 'r': *p++ = '\r'; break;
2776        case 'v': *p++ = '\013'; break; /* VT */
2777        case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2778
2779        /* \OOO (octal) escapes */
2780        case '0': case '1': case '2': case '3':
2781        case '4': case '5': case '6': case '7':
2782            x = s[-1] - '0';
2783            if (s < end && '0' <= *s && *s <= '7') {
2784                x = (x<<3) + *s++ - '0';
2785                if (s < end && '0' <= *s && *s <= '7')
2786                    x = (x<<3) + *s++ - '0';
2787            }
2788            *p++ = x;
2789            break;
2790
2791        /* hex escapes */
2792        /* \xXX */
2793        case 'x':
2794            digits = 2;
2795            message = "truncated \\xXX escape";
2796            goto hexescape;
2797
2798        /* \uXXXX */
2799        case 'u':
2800            digits = 4;
2801            message = "truncated \\uXXXX escape";
2802            goto hexescape;
2803
2804        /* \UXXXXXXXX */
2805        case 'U':
2806            digits = 8;
2807            message = "truncated \\UXXXXXXXX escape";
2808        hexescape:
2809            chr = 0;
2810            outpos = p-PyUnicode_AS_UNICODE(v);
2811            if (s+digits>end) {
2812                endinpos = size;
2813                if (unicode_decode_call_errorhandler(
2814                    errors, &errorHandler,
2815                    "unicodeescape", "end of string in escape sequence",
2816                    &starts, &end, &startinpos, &endinpos, &exc, &s,
2817                    (PyObject **)&v, &outpos, &p))
2818                    goto onError;
2819                goto nextByte;
2820            }
2821            for (i = 0; i < digits; ++i) {
2822                c = (unsigned char) s[i];
2823                if (!ISXDIGIT(c)) {
2824                    endinpos = (s+i+1)-starts;
2825                    if (unicode_decode_call_errorhandler(
2826                        errors, &errorHandler,
2827                        "unicodeescape", message,
2828                        &starts, &end, &startinpos, &endinpos, &exc, &s,
2829                        (PyObject **)&v, &outpos, &p))
2830                        goto onError;
2831                    goto nextByte;
2832                }
2833                chr = (chr<<4) & ~0xF;
2834                if (c >= '0' && c <= '9')
2835                    chr += c - '0';
2836                else if (c >= 'a' && c <= 'f')
2837                    chr += 10 + c - 'a';
2838                else
2839                    chr += 10 + c - 'A';
2840            }
2841            s += i;
2842            if (chr == 0xffffffff && PyErr_Occurred())
2843                /* _decoding_error will have already written into the
2844                   target buffer. */
2845                break;
2846        store:
2847            /* when we get here, chr is a 32-bit unicode character */
2848            if (chr <= 0xffff)
2849                /* UCS-2 character */
2850                *p++ = (Py_UNICODE) chr;
2851            else if (chr <= 0x10ffff) {
2852                /* UCS-4 character. Either store directly, or as
2853                   surrogate pair. */
2854#ifdef Py_UNICODE_WIDE
2855                *p++ = chr;
2856#else
2857                chr -= 0x10000L;
2858                *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
2859                *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
2860#endif
2861            } else {
2862                endinpos = s-starts;
2863                outpos = p-PyUnicode_AS_UNICODE(v);
2864                if (unicode_decode_call_errorhandler(
2865                    errors, &errorHandler,
2866                    "unicodeescape", "illegal Unicode character",
2867                    &starts, &end, &startinpos, &endinpos, &exc, &s,
2868                    (PyObject **)&v, &outpos, &p))
2869                    goto onError;
2870            }
2871            break;
2872
2873        /* \N{name} */
2874        case 'N':
2875            message = "malformed \\N character escape";
2876            if (ucnhash_CAPI == NULL) {
2877                /* load the unicode data module */
2878                PyObject *m, *api;
2879                m = PyImport_ImportModuleNoBlock("unicodedata");
2880                if (m == NULL)
2881                    goto ucnhashError;
2882                api = PyObject_GetAttrString(m, "ucnhash_CAPI");
2883                Py_DECREF(m);
2884                if (api == NULL)
2885                    goto ucnhashError;
2886                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
2887                Py_DECREF(api);
2888                if (ucnhash_CAPI == NULL)
2889                    goto ucnhashError;
2890            }
2891            if (*s == '{') {
2892                const char *start = s+1;
2893                /* look for the closing brace */
2894                while (*s != '}' && s < end)
2895                    s++;
2896                if (s > start && s < end && *s == '}') {
2897                    /* found a name.  look it up in the unicode database */
2898                    message = "unknown Unicode character name";
2899                    s++;
2900                    if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
2901                        goto store;
2902                }
2903            }
2904            endinpos = s-starts;
2905            outpos = p-PyUnicode_AS_UNICODE(v);
2906            if (unicode_decode_call_errorhandler(
2907                errors, &errorHandler,
2908                "unicodeescape", message,
2909                &starts, &end, &startinpos, &endinpos, &exc, &s,
2910                (PyObject **)&v, &outpos, &p))
2911                goto onError;
2912            break;
2913
2914        default:
2915            if (s > end) {
2916                message = "\\ at end of string";
2917                s--;
2918                endinpos = s-starts;
2919                outpos = p-PyUnicode_AS_UNICODE(v);
2920                if (unicode_decode_call_errorhandler(
2921                    errors, &errorHandler,
2922                    "unicodeescape", message,
2923                    &starts, &end, &startinpos, &endinpos, &exc, &s,
2924                    (PyObject **)&v, &outpos, &p))
2925                    goto onError;
2926            }
2927            else {
2928                *p++ = '\\';
2929                *p++ = (unsigned char)s[-1];
2930            }
2931            break;
2932        }
2933        nextByte:
2934        ;
2935    }
2936    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2937        goto onError;
2938    Py_XDECREF(errorHandler);
2939    Py_XDECREF(exc);
2940    return (PyObject *)v;
2941
2942ucnhashError:
2943    PyErr_SetString(
2944        PyExc_UnicodeError,
2945        "\\N escapes not supported (can't load unicodedata module)"
2946        );
2947    Py_XDECREF(v);
2948    Py_XDECREF(errorHandler);
2949    Py_XDECREF(exc);
2950    return NULL;
2951
2952onError:
2953    Py_XDECREF(v);
2954    Py_XDECREF(errorHandler);
2955    Py_XDECREF(exc);
2956    return NULL;
2957}
2958
/* Return a Unicode-Escape string version of the Unicode object.

   Note: PyUnicode_EncodeUnicodeEscape() below takes no `quotes' argument;
   the string it returns is not enclosed in u"" or u'' quotes.

*/
2965
2966Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2967                                      Py_ssize_t size,
2968                                      Py_UNICODE ch)
2969{
2970    /* like wcschr, but doesn't stop at NULL characters */
2971
2972    while (size-- > 0) {
2973        if (*s == ch)
2974            return s;
2975        s++;
2976    }
2977
2978    return NULL;
2979}
2980
2981static const char *hexdigits = "0123456789abcdef";
2982
2983PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2984					Py_ssize_t size)
2985{
2986    PyObject *repr, *result;
2987    char *p;
2988
2989    /* XXX(nnorwitz): rather than over-allocating, it would be
2990       better to choose a different scheme.  Perhaps scan the
2991       first N-chars of the string and allocate based on that size.
2992    */
2993    /* Initial allocation is based on the longest-possible unichr
2994       escape.
2995
2996       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2997       unichr, so in this case it's the longest unichr escape. In
2998       narrow (UTF-16) builds this is five chars per source unichr
2999       since there are two unichrs in the surrogate pair, so in narrow
3000       (UTF-16) builds it's not the longest unichr escape.
3001
3002       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3003       so in the narrow (UTF-16) build case it's the longest unichr
3004       escape.
3005    */
3006
3007    repr = PyByteArray_FromStringAndSize(NULL,
3008#ifdef Py_UNICODE_WIDE
3009        + 10*size
3010#else
3011        + 6*size
3012#endif
3013        + 1);
3014    if (repr == NULL)
3015        return NULL;
3016
3017    p = PyByteArray_AS_STRING(repr);
3018
3019    while (size-- > 0) {
3020        Py_UNICODE ch = *s++;
3021
3022        /* Escape backslashes */
3023        if (ch == '\\') {
3024            *p++ = '\\';
3025            *p++ = (char) ch;
3026            continue;
3027        }
3028
3029#ifdef Py_UNICODE_WIDE
3030        /* Map 21-bit characters to '\U00xxxxxx' */
3031        else if (ch >= 0x10000) {
3032            *p++ = '\\';
3033            *p++ = 'U';
3034            *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3035            *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3036            *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3037            *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3038            *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3039            *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3040            *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3041            *p++ = hexdigits[ch & 0x0000000F];
3042	    continue;
3043        }
3044#else
3045	/* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3046	else if (ch >= 0xD800 && ch < 0xDC00) {
3047	    Py_UNICODE ch2;
3048	    Py_UCS4 ucs;
3049
3050	    ch2 = *s++;
3051	    size--;
3052	    if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3053		ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3054		*p++ = '\\';
3055		*p++ = 'U';
3056		*p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3057		*p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3058		*p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3059		*p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3060		*p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3061		*p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3062		*p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3063		*p++ = hexdigits[ucs & 0x0000000F];
3064		continue;
3065	    }
3066	    /* Fall through: isolated surrogates are copied as-is */
3067	    s--;
3068	    size++;
3069	}
3070#endif
3071
3072        /* Map 16-bit characters to '\uxxxx' */
3073        if (ch >= 256) {
3074            *p++ = '\\';
3075            *p++ = 'u';
3076            *p++ = hexdigits[(ch >> 12) & 0x000F];
3077            *p++ = hexdigits[(ch >> 8) & 0x000F];
3078            *p++ = hexdigits[(ch >> 4) & 0x000F];
3079            *p++ = hexdigits[ch & 0x000F];
3080        }
3081
3082        /* Map special whitespace to '\t', \n', '\r' */
3083        else if (ch == '\t') {
3084            *p++ = '\\';
3085            *p++ = 't';
3086        }
3087        else if (ch == '\n') {
3088            *p++ = '\\';
3089            *p++ = 'n';
3090        }
3091        else if (ch == '\r') {
3092            *p++ = '\\';
3093            *p++ = 'r';
3094        }
3095
3096        /* Map non-printable US ASCII to '\xhh' */
3097        else if (ch < ' ' || ch >= 0x7F) {
3098            *p++ = '\\';
3099            *p++ = 'x';
3100            *p++ = hexdigits[(ch >> 4) & 0x000F];
3101            *p++ = hexdigits[ch & 0x000F];
3102        }
3103
3104        /* Copy everything else as-is */
3105        else
3106            *p++ = (char) ch;
3107    }
3108
3109    result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr),
3110                                        p - PyByteArray_AS_STRING(repr));
3111    Py_DECREF(repr);
3112    return result;
3113}
3114
3115PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3116{
3117    PyObject *s, *result;
3118    if (!PyUnicode_Check(unicode)) {
3119        PyErr_BadArgument();
3120        return NULL;
3121    }
3122    s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3123                                      PyUnicode_GET_SIZE(unicode));
3124
3125    if (!s)
3126        return NULL;
3127    result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(s),
3128                                        PyByteArray_GET_SIZE(s));
3129    Py_DECREF(s);
3130    return result;
3131}
3132
3133/* --- Raw Unicode Escape Codec ------------------------------------------- */
3134
3135PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
3136					   Py_ssize_t size,
3137					   const char *errors)
3138{
3139    const char *starts = s;
3140    Py_ssize_t startinpos;
3141    Py_ssize_t endinpos;
3142    Py_ssize_t outpos;
3143    PyUnicodeObject *v;
3144    Py_UNICODE *p;
3145    const char *end;
3146    const char *bs;
3147    PyObject *errorHandler = NULL;
3148    PyObject *exc = NULL;
3149
3150    /* Escaped strings will always be longer than the resulting
3151       Unicode string, so we start with size here and then reduce the
3152       length after conversion to the true value. (But decoding error
3153       handler might have to resize the string) */
3154    v = _PyUnicode_New(size);
3155    if (v == NULL)
3156	goto onError;
3157    if (size == 0)
3158	return (PyObject *)v;
3159    p = PyUnicode_AS_UNICODE(v);
3160    end = s + size;
3161    while (s < end) {
3162	unsigned char c;
3163	Py_UCS4 x;
3164	int i;
3165        int count;
3166
3167	/* Non-escape characters are interpreted as Unicode ordinals */
3168	if (*s != '\\') {
3169	    *p++ = (unsigned char)*s++;
3170	    continue;
3171	}
3172	startinpos = s-starts;
3173
3174	/* \u-escapes are only interpreted iff the number of leading
3175	   backslashes if odd */
3176	bs = s;
3177	for (;s < end;) {
3178	    if (*s != '\\')
3179		break;
3180	    *p++ = (unsigned char)*s++;
3181	}
3182	if (((s - bs) & 1) == 0 ||
3183	    s >= end ||
3184	    (*s != 'u' && *s != 'U')) {
3185	    continue;
3186	}
3187	p--;
3188        count = *s=='u' ? 4 : 8;
3189	s++;
3190
3191	/* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3192	outpos = p-PyUnicode_AS_UNICODE(v);
3193	for (x = 0, i = 0; i < count; ++i, ++s) {
3194	    c = (unsigned char)*s;
3195	    if (!ISXDIGIT(c)) {
3196		endinpos = s-starts;
3197		if (unicode_decode_call_errorhandler(
3198		    errors, &errorHandler,
3199		    "rawunicodeescape", "truncated \\uXXXX",
3200		    &starts, &end, &startinpos, &endinpos, &exc, &s,
3201		    (PyObject **)&v, &outpos, &p))
3202		    goto onError;
3203		goto nextByte;
3204	    }
3205	    x = (x<<4) & ~0xF;
3206	    if (c >= '0' && c <= '9')
3207		x += c - '0';
3208	    else if (c >= 'a' && c <= 'f')
3209		x += 10 + c - 'a';
3210	    else
3211		x += 10 + c - 'A';
3212	}
3213        if (x <= 0xffff)
3214                /* UCS-2 character */
3215                *p++ = (Py_UNICODE) x;
3216        else if (x <= 0x10ffff) {
3217                /* UCS-4 character. Either store directly, or as
3218                   surrogate pair. */
3219#ifdef Py_UNICODE_WIDE
3220                *p++ = (Py_UNICODE) x;
3221#else
3222                x -= 0x10000L;
3223                *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3224                *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3225#endif
3226        } else {
3227            endinpos = s-starts;
3228            outpos = p-PyUnicode_AS_UNICODE(v);
3229            if (unicode_decode_call_errorhandler(
3230                    errors, &errorHandler,
3231                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
3232		    &starts, &end, &startinpos, &endinpos, &exc, &s,
3233		    (PyObject **)&v, &outpos, &p))
3234		    goto onError;
3235        }
3236	nextByte:
3237	;
3238    }
3239    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3240	goto onError;
3241    Py_XDECREF(errorHandler);
3242    Py_XDECREF(exc);
3243    return (PyObject *)v;
3244
3245 onError:
3246    Py_XDECREF(v);
3247    Py_XDECREF(errorHandler);
3248    Py_XDECREF(exc);
3249    return NULL;
3250}
3251
3252PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
3253					   Py_ssize_t size)
3254{
3255    PyObject *repr, *result;
3256    char *p;
3257    char *q;
3258
3259#ifdef Py_UNICODE_WIDE
3260    repr = PyByteArray_FromStringAndSize(NULL, 10 * size);
3261#else
3262    repr = PyByteArray_FromStringAndSize(NULL, 6 * size);
3263#endif
3264    if (repr == NULL)
3265        return NULL;
3266    if (size == 0)
3267        goto done;
3268
3269    p = q = PyByteArray_AS_STRING(repr);
3270    while (size-- > 0) {
3271        Py_UNICODE ch = *s++;
3272#ifdef Py_UNICODE_WIDE
3273	/* Map 32-bit characters to '\Uxxxxxxxx' */
3274	if (ch >= 0x10000) {
3275            *p++ = '\\';
3276            *p++ = 'U';
3277            *p++ = hexdigits[(ch >> 28) & 0xf];
3278            *p++ = hexdigits[(ch >> 24) & 0xf];
3279            *p++ = hexdigits[(ch >> 20) & 0xf];
3280            *p++ = hexdigits[(ch >> 16) & 0xf];
3281            *p++ = hexdigits[(ch >> 12) & 0xf];
3282            *p++ = hexdigits[(ch >> 8) & 0xf];
3283            *p++ = hexdigits[(ch >> 4) & 0xf];
3284            *p++ = hexdigits[ch & 15];
3285        }
3286        else
3287#else
3288	/* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3289	if (ch >= 0xD800 && ch < 0xDC00) {
3290	    Py_UNICODE ch2;
3291	    Py_UCS4 ucs;
3292
3293	    ch2 = *s++;
3294	    size--;
3295	    if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3296		ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3297		*p++ = '\\';
3298		*p++ = 'U';
3299		*p++ = hexdigits[(ucs >> 28) & 0xf];
3300		*p++ = hexdigits[(ucs >> 24) & 0xf];
3301		*p++ = hexdigits[(ucs >> 20) & 0xf];
3302		*p++ = hexdigits[(ucs >> 16) & 0xf];
3303		*p++ = hexdigits[(ucs >> 12) & 0xf];
3304		*p++ = hexdigits[(ucs >> 8) & 0xf];
3305		*p++ = hexdigits[(ucs >> 4) & 0xf];
3306		*p++ = hexdigits[ucs & 0xf];
3307		continue;
3308	    }
3309	    /* Fall through: isolated surrogates are copied as-is */
3310	    s--;
3311	    size++;
3312	}
3313#endif
3314	/* Map 16-bit characters to '\uxxxx' */
3315	if (ch >= 256) {
3316            *p++ = '\\';
3317            *p++ = 'u';
3318            *p++ = hexdigits[(ch >> 12) & 0xf];
3319            *p++ = hexdigits[(ch >> 8) & 0xf];
3320            *p++ = hexdigits[(ch >> 4) & 0xf];
3321            *p++ = hexdigits[ch & 15];
3322        }
3323	/* Copy everything else as-is */
3324	else
3325            *p++ = (char) ch;
3326    }
3327    size = p - q;
3328
3329  done:
3330    result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr), size);
3331    Py_DECREF(repr);
3332    return result;
3333}
3334
3335PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3336{
3337    PyObject *s, *result;
3338    if (!PyUnicode_Check(unicode)) {
3339        PyErr_BadArgument();
3340        return NULL;
3341    }
3342    s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3343                                         PyUnicode_GET_SIZE(unicode));
3344
3345    if (!s)
3346        return NULL;
3347    result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(s),
3348                                        PyByteArray_GET_SIZE(s));
3349    Py_DECREF(s);
3350    return result;
3351}
3352
3353/* --- Unicode Internal Codec ------------------------------------------- */
3354
3355PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
3356					   Py_ssize_t size,
3357					   const char *errors)
3358{
3359    const char *starts = s;
3360    Py_ssize_t startinpos;
3361    Py_ssize_t endinpos;
3362    Py_ssize_t outpos;
3363    PyUnicodeObject *v;
3364    Py_UNICODE *p;
3365    const char *end;
3366    const char *reason;
3367    PyObject *errorHandler = NULL;
3368    PyObject *exc = NULL;
3369
3370#ifdef Py_UNICODE_WIDE
3371    Py_UNICODE unimax = PyUnicode_GetMax();
3372#endif
3373
3374    /* XXX overflow detection missing */
3375    v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3376    if (v == NULL)
3377	goto onError;
3378    if (PyUnicode_GetSize((PyObject *)v) == 0)
3379	return (PyObject *)v;
3380    p = PyUnicode_AS_UNICODE(v);
3381    end = s + size;
3382
3383    while (s < end) {
3384        memcpy(p, s, sizeof(Py_UNICODE));
3385        /* We have to sanity check the raw data, otherwise doom looms for
3386           some malformed UCS-4 data. */
3387        if (
3388            #ifdef Py_UNICODE_WIDE
3389            *p > unimax || *p < 0 ||
3390            #endif
3391            end-s < Py_UNICODE_SIZE
3392            )
3393            {
3394            startinpos = s - starts;
3395            if (end-s < Py_UNICODE_SIZE) {
3396                endinpos = end-starts;
3397                reason = "truncated input";
3398            }
3399            else {
3400                endinpos = s - starts + Py_UNICODE_SIZE;
3401                reason = "illegal code point (> 0x10FFFF)";
3402            }
3403            outpos = p - PyUnicode_AS_UNICODE(v);
3404            if (unicode_decode_call_errorhandler(
3405                    errors, &errorHandler,
3406                    "unicode_internal", reason,
3407                    &starts, &end, &startinpos, &endinpos, &exc, &s,
3408                    (PyObject **)&v, &outpos, &p)) {
3409                goto onError;
3410            }
3411        }
3412        else {
3413            p++;
3414            s += Py_UNICODE_SIZE;
3415        }
3416    }
3417
3418    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3419        goto onError;
3420    Py_XDECREF(errorHandler);
3421    Py_XDECREF(exc);
3422    return (PyObject *)v;
3423
3424 onError:
3425    Py_XDECREF(v);
3426    Py_XDECREF(errorHandler);
3427    Py_XDECREF(exc);
3428    return NULL;
3429}
3430
3431/* --- Latin-1 Codec ------------------------------------------------------ */
3432
3433PyObject *PyUnicode_DecodeLatin1(const char *s,
3434				 Py_ssize_t size,
3435				 const char *errors)
3436{
3437    PyUnicodeObject *v;
3438    Py_UNICODE *p;
3439
3440    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3441    if (size == 1) {
3442	Py_UNICODE r = *(unsigned char*)s;
3443	return PyUnicode_FromUnicode(&r, 1);
3444    }
3445
3446    v = _PyUnicode_New(size);
3447    if (v == NULL)
3448	goto onError;
3449    if (size == 0)
3450	return (PyObject *)v;
3451    p = PyUnicode_AS_UNICODE(v);
3452    while (size-- > 0)
3453	*p++ = (unsigned char)*s++;
3454    return (PyObject *)v;
3455
3456 onError:
3457    Py_XDECREF(v);
3458    return NULL;
3459}
3460
3461/* create or adjust a UnicodeEncodeError */
3462static void make_encode_exception(PyObject **exceptionObject,
3463    const char *encoding,
3464    const Py_UNICODE *unicode, Py_ssize_t size,
3465    Py_ssize_t startpos, Py_ssize_t endpos,
3466    const char *reason)
3467{
3468    if (*exceptionObject == NULL) {
3469	*exceptionObject = PyUnicodeEncodeError_Create(
3470	    encoding, unicode, size, startpos, endpos, reason);
3471    }
3472    else {
3473	if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3474	    goto onError;
3475	if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3476	    goto onError;
3477	if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3478	    goto onError;
3479	return;
3480	onError:
3481	Py_DECREF(*exceptionObject);
3482	*exceptionObject = NULL;
3483    }
3484}
3485
3486/* raises a UnicodeEncodeError */
3487static void raise_encode_exception(PyObject **exceptionObject,
3488    const char *encoding,
3489    const Py_UNICODE *unicode, Py_ssize_t size,
3490    Py_ssize_t startpos, Py_ssize_t endpos,
3491    const char *reason)
3492{
3493    make_encode_exception(exceptionObject,
3494	encoding, unicode, size, startpos, endpos, reason);
3495    if (*exceptionObject != NULL)
3496	PyCodec_StrictErrors(*exceptionObject);
3497}
3498
3499/* error handling callback helper:
3500   build arguments, call the callback and check the arguments,
3501   put the result into newpos and return the replacement string, which
3502   has to be freed by the caller */
3503static PyObject *unicode_encode_call_errorhandler(const char *errors,
3504    PyObject **errorHandler,
3505    const char *encoding, const char *reason,
3506    const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3507    Py_ssize_t startpos, Py_ssize_t endpos,
3508    Py_ssize_t *newpos)
3509{
3510    static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
3511
3512    PyObject *restuple;
3513    PyObject *resunicode;
3514
3515    if (*errorHandler == NULL) {
3516	*errorHandler = PyCodec_LookupError(errors);
3517        if (*errorHandler == NULL)
3518	    return NULL;
3519    }
3520
3521    make_encode_exception(exceptionObject,
3522	encoding, unicode, size, startpos, endpos, reason);
3523    if (*exceptionObject == NULL)
3524	return NULL;
3525
3526    restuple = PyObject_CallFunctionObjArgs(
3527	*errorHandler, *exceptionObject, NULL);
3528    if (restuple == NULL)
3529	return NULL;
3530    if (!PyTuple_Check(restuple)) {
3531	PyErr_Format(PyExc_TypeError, &argparse[4]);
3532	Py_DECREF(restuple);
3533	return NULL;
3534    }
3535    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3536	&resunicode, newpos)) {
3537	Py_DECREF(restuple);
3538	return NULL;
3539    }
3540    if (*newpos<0)
3541	*newpos = size+*newpos;
3542    if (*newpos<0 || *newpos>size) {
3543	PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3544	Py_DECREF(restuple);
3545	return NULL;
3546    }
3547    Py_INCREF(resunicode);
3548    Py_DECREF(restuple);
3549    return resunicode;
3550}
3551
3552static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
3553				 Py_ssize_t size,
3554				 const char *errors,
3555				 int limit)
3556{
3557    /* output object */
3558    PyObject *res;
3559    /* pointers to the beginning and end+1 of input */
3560    const Py_UNICODE *startp = p;
3561    const Py_UNICODE *endp = p + size;
3562    /* pointer to the beginning of the unencodable characters */
3563    /* const Py_UNICODE *badp = NULL; */
3564    /* pointer into the output */
3565    char *str;
3566    /* current output position */
3567    Py_ssize_t ressize;
3568    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3569    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3570    PyObject *errorHandler = NULL;
3571    PyObject *exc = NULL;
3572    PyObject *result = NULL;
3573    /* the following variable is used for caching string comparisons
3574     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3575    int known_errorHandler = -1;
3576
3577    /* allocate enough for a simple encoding without
3578       replacements, if we need more, we'll resize */
3579    if (size == 0)
3580        return PyBytes_FromStringAndSize(NULL, 0);
3581    res = PyByteArray_FromStringAndSize(NULL, size);
3582    if (res == NULL)
3583        return NULL;
3584    str = PyByteArray_AS_STRING(res);
3585    ressize = size;
3586
3587    while (p<endp) {
3588	Py_UNICODE c = *p;
3589
3590	/* can we encode this? */
3591	if (c<limit) {
3592	    /* no overflow check, because we know that the space is enough */
3593	    *str++ = (char)c;
3594	    ++p;
3595	}
3596	else {
3597	    Py_ssize_t unicodepos = p-startp;
3598	    Py_ssize_t requiredsize;
3599	    PyObject *repunicode;
3600	    Py_ssize_t repsize;
3601	    Py_ssize_t newpos;
3602	    Py_ssize_t respos;
3603	    Py_UNICODE *uni2;
3604	    /* startpos for collecting unencodable chars */
3605	    const Py_UNICODE *collstart = p;
3606	    const Py_UNICODE *collend = p;
3607	    /* find all unecodable characters */
3608	    while ((collend < endp) && ((*collend)>=limit))
3609		++collend;
3610	    /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3611	    if (known_errorHandler==-1) {
3612		if ((errors==NULL) || (!strcmp(errors, "strict")))
3613		    known_errorHandler = 1;
3614		else if (!strcmp(errors, "replace"))
3615		    known_errorHandler = 2;
3616		else if (!strcmp(errors, "ignore"))
3617		    known_errorHandler = 3;
3618		else if (!strcmp(errors, "xmlcharrefreplace"))
3619		    known_errorHandler = 4;
3620		else
3621		    known_errorHandler = 0;
3622	    }
3623	    switch (known_errorHandler) {
3624		case 1: /* strict */
3625		    raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3626		    goto onError;
3627		case 2: /* replace */
3628		    while (collstart++<collend)
3629			*str++ = '?'; /* fall through */
3630		case 3: /* ignore */
3631		    p = collend;
3632		    break;
3633		case 4: /* xmlcharrefreplace */
3634		    respos = str - PyByteArray_AS_STRING(res);
3635		    /* determine replacement size (temporarily (mis)uses p) */
3636		    for (p = collstart, repsize = 0; p < collend; ++p) {
3637			if (*p<10)
3638			    repsize += 2+1+1;
3639			else if (*p<100)
3640			    repsize += 2+2+1;
3641			else if (*p<1000)
3642			    repsize += 2+3+1;
3643			else if (*p<10000)
3644			    repsize += 2+4+1;
3645#ifndef Py_UNICODE_WIDE
3646			else
3647			    repsize += 2+5+1;
3648#else
3649			else if (*p<100000)
3650			    repsize += 2+5+1;
3651			else if (*p<1000000)
3652			    repsize += 2+6+1;
3653			else
3654			    repsize += 2+7+1;
3655#endif
3656		    }
3657		    requiredsize = respos+repsize+(endp-collend);
3658		    if (requiredsize > ressize) {
3659			if (requiredsize<2*ressize)
3660			    requiredsize = 2*ressize;
3661			if (PyByteArray_Resize(res, requiredsize))
3662			    goto onError;
3663			str = PyByteArray_AS_STRING(res) + respos;
3664			ressize = requiredsize;
3665		    }
3666		    /* generate replacement (temporarily (mis)uses p) */
3667		    for (p = collstart; p < collend; ++p) {
3668			str += sprintf(str, "&#%d;", (int)*p);
3669		    }
3670		    p = collend;
3671		    break;
3672		default:
3673		    repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3674			encoding, reason, startp, size, &exc,
3675			collstart-startp, collend-startp, &newpos);
3676		    if (repunicode == NULL)
3677			goto onError;
3678		    /* need more space? (at least enough for what we
3679		       have+the replacement+the rest of the string, so
3680		       we won't have to check space for encodable characters) */
3681		    respos = str - PyByteArray_AS_STRING(res);
3682		    repsize = PyUnicode_GET_SIZE(repunicode);
3683		    requiredsize = respos+repsize+(endp-collend);
3684		    if (requiredsize > ressize) {
3685			if (requiredsize<2*ressize)
3686			    requiredsize = 2*ressize;
3687			if (PyByteArray_Resize(res, requiredsize)) {
3688			    Py_DECREF(repunicode);
3689			    goto onError;
3690			}
3691			str = PyByteArray_AS_STRING(res) + respos;
3692			ressize = requiredsize;
3693		    }
3694		    /* check if there is anything unencodable in the replacement
3695		       and copy it to the output */
3696		    for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3697			c = *uni2;
3698			if (c >= limit) {
3699			    raise_encode_exception(&exc, encoding, startp, size,
3700				unicodepos, unicodepos+1, reason);
3701			    Py_DECREF(repunicode);
3702			    goto onError;
3703			}
3704			*str = (char)c;
3705		    }
3706		    p = startp + newpos;
3707		    Py_DECREF(repunicode);
3708	    }
3709	}
3710    }
3711    result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(res),
3712                                        str - PyByteArray_AS_STRING(res));
3713  onError:
3714    Py_DECREF(res);
3715    Py_XDECREF(errorHandler);
3716    Py_XDECREF(exc);
3717    return result;
3718}
3719
3720PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
3721				 Py_ssize_t size,
3722				 const char *errors)
3723{
3724    return unicode_encode_ucs1(p, size, errors, 256);
3725}
3726
3727PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3728{
3729    if (!PyUnicode_Check(unicode)) {
3730	PyErr_BadArgument();
3731	return NULL;
3732    }
3733    return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3734				  PyUnicode_GET_SIZE(unicode),
3735				  NULL);
3736}
3737
3738/* --- 7-bit ASCII Codec -------------------------------------------------- */
3739
3740PyObject *PyUnicode_DecodeASCII(const char *s,
3741				Py_ssize_t size,
3742				const char *errors)
3743{
3744    const char *starts = s;
3745    PyUnicodeObject *v;
3746    Py_UNICODE *p;
3747    Py_ssize_t startinpos;
3748    Py_ssize_t endinpos;
3749    Py_ssize_t outpos;
3750    const char *e;
3751    PyObject *errorHandler = NULL;
3752    PyObject *exc = NULL;
3753
3754    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3755    if (size == 1 && *(unsigned char*)s < 128) {
3756	Py_UNICODE r = *(unsigned char*)s;
3757	return PyUnicode_FromUnicode(&r, 1);
3758    }
3759
3760    v = _PyUnicode_New(size);
3761    if (v == NULL)
3762	goto onError;
3763    if (size == 0)
3764	return (PyObject *)v;
3765    p = PyUnicode_AS_UNICODE(v);
3766    e = s + size;
3767    while (s < e) {
3768	register unsigned char c = (unsigned char)*s;
3769	if (c < 128) {
3770	    *p++ = c;
3771	    ++s;
3772	}
3773	else {
3774	    startinpos = s-starts;
3775	    endinpos = startinpos + 1;
3776	    outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3777	    if (unicode_decode_call_errorhandler(
3778		 errors, &errorHandler,
3779		 "ascii", "ordinal not in range(128)",
3780		 &starts, &e, &startinpos, &endinpos, &exc, &s,
3781		 (PyObject **)&v, &outpos, &p))
3782		goto onError;
3783	}
3784    }
3785    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
3786	if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3787	    goto onError;
3788    Py_XDECREF(errorHandler);
3789    Py_XDECREF(exc);
3790    return (PyObject *)v;
3791
3792 onError:
3793    Py_XDECREF(v);
3794    Py_XDECREF(errorHandler);
3795    Py_XDECREF(exc);
3796    return NULL;
3797}
3798
3799PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
3800				Py_ssize_t size,
3801				const char *errors)
3802{
3803    return unicode_encode_ucs1(p, size, errors, 128);
3804}
3805
3806PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3807{
3808    if (!PyUnicode_Check(unicode)) {
3809	PyErr_BadArgument();
3810	return NULL;
3811    }
3812    return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3813				 PyUnicode_GET_SIZE(unicode),
3814				 NULL);
3815}
3816
3817#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3818
3819/* --- MBCS codecs for Windows -------------------------------------------- */
3820
3821#if SIZEOF_INT < SIZEOF_SSIZE_T
3822#define NEED_RETRY
3823#endif
3824
3825/* XXX This code is limited to "true" double-byte encodings, as
3826   a) it assumes an incomplete character consists of a single byte, and
3827   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3828      encodings, see IsDBCSLeadByteEx documentation. */
3829
/* Return non-zero when the byte at s[offset] is a DBCS lead byte that
   starts a character, judged by IsDBCSLeadByte() and by the position
   CharPrev() reports for the preceding character; return 0 otherwise. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
}
3840
3841/*
3842 * Decode MBCS string into unicode object. If 'final' is set, converts
3843 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3844 */
3845static int decode_mbcs(PyUnicodeObject **v,
3846			const char *s, /* MBCS string */
3847			int size, /* sizeof MBCS string */
3848			int final)
3849{
3850    Py_UNICODE *p;
3851    Py_ssize_t n = 0;
3852    int usize = 0;
3853
3854    assert(size >= 0);
3855
3856    /* Skip trailing lead-byte unless 'final' is set */
3857    if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3858	--size;
3859
3860    /* First get the size of the result */
3861    if (size > 0) {
3862	usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3863	if (usize == 0) {
3864	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
3865	    return -1;
3866	}
3867    }
3868
3869    if (*v == NULL) {
3870	/* Create unicode object */
3871	*v = _PyUnicode_New(usize);
3872	if (*v == NULL)
3873	    return -1;
3874    }
3875    else {
3876	/* Extend unicode object */
3877	n = PyUnicode_GET_SIZE(*v);
3878	if (_PyUnicode_Resize(v, n + usize) < 0)
3879	    return -1;
3880    }
3881
3882    /* Do the conversion */
3883    if (size > 0) {
3884	p = PyUnicode_AS_UNICODE(*v) + n;
3885	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3886	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
3887	    return -1;
3888	}
3889    }
3890
3891    return size;
3892}
3893
3894PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3895					Py_ssize_t size,
3896					const char *errors,
3897					Py_ssize_t *consumed)
3898{
3899    PyUnicodeObject *v = NULL;
3900    int done;
3901
3902    if (consumed)
3903	*consumed = 0;
3904
3905#ifdef NEED_RETRY
3906  retry:
3907    if (size > INT_MAX)
3908	done = decode_mbcs(&v, s, INT_MAX, 0);
3909    else
3910#endif
3911	done = decode_mbcs(&v, s, (int)size, !consumed);
3912
3913    if (done < 0) {
3914        Py_XDECREF(v);
3915	return NULL;
3916    }
3917
3918    if (consumed)
3919	*consumed += done;
3920
3921#ifdef NEED_RETRY
3922    if (size > INT_MAX) {
3923	s += done;
3924	size -= done;
3925	goto retry;
3926    }
3927#endif
3928
3929    return (PyObject *)v;
3930}
3931
3932PyObject *PyUnicode_DecodeMBCS(const char *s,
3933				Py_ssize_t size,
3934				const char *errors)
3935{
3936    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3937}
3938
3939/*
3940 * Convert unicode into string object (MBCS).
3941 * Returns 0 if succeed, -1 otherwise.
3942 */
3943static int encode_mbcs(PyObject **repr,
3944			const Py_UNICODE *p, /* unicode */
3945			int size) /* size of unicode */
3946{
3947    int mbcssize = 0;
3948    Py_ssize_t n = 0;
3949
3950    assert(size >= 0);
3951
3952    /* First get the size of the result */
3953    if (size > 0) {
3954	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3955	if (mbcssize == 0) {
3956	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
3957	    return -1;
3958	}
3959    }
3960
3961    if (*repr == NULL) {
3962	/* Create string object */
3963	*repr = PyBytes_FromStringAndSize(NULL, mbcssize);
3964	if (*repr == NULL)
3965	    return -1;
3966    }
3967    else {
3968	/* Extend string object */
3969	n = PyBytes_Size(*repr);
3970	if (_PyBytes_Resize(repr, n + mbcssize) < 0)
3971	    return -1;
3972    }
3973
3974    /* Do the conversion */
3975    if (size > 0) {
3976	char *s = PyBytes_AS_STRING(*repr) + n;
3977	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3978	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
3979	    return -1;
3980	}
3981    }
3982
3983    return 0;
3984}
3985
3986PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
3987				Py_ssize_t size,
3988				const char *errors)
3989{
3990    PyObject *repr = NULL;
3991    int ret;
3992
3993#ifdef NEED_RETRY
3994 retry:
3995    if (size > INT_MAX)
3996	ret = encode_mbcs(&repr, p, INT_MAX);
3997    else
3998#endif
3999	ret = encode_mbcs(&repr, p, (int)size);
4000
4001    if (ret < 0) {
4002	Py_XDECREF(repr);
4003	return NULL;
4004    }
4005
4006#ifdef NEED_RETRY
4007    if (size > INT_MAX) {
4008	p += INT_MAX;
4009	size -= INT_MAX;
4010	goto retry;
4011    }
4012#endif
4013
4014    return repr;
4015}
4016
4017PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4018{
4019    if (!PyUnicode_Check(unicode)) {
4020        PyErr_BadArgument();
4021        return NULL;
4022    }
4023    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
4024				PyUnicode_GET_SIZE(unicode),
4025				NULL);
4026}
4027
4028#undef NEED_RETRY
4029
4030#endif /* MS_WINDOWS */
4031
4032/* --- Character Mapping Codec -------------------------------------------- */
4033
4034PyObject *PyUnicode_DecodeCharmap(const char *s,
4035				  Py_ssize_t size,
4036				  PyObject *mapping,
4037				  const char *errors)
4038{
4039    const char *starts = s;
4040    Py_ssize_t startinpos;
4041    Py_ssize_t endinpos;
4042    Py_ssize_t outpos;
4043    const char *e;
4044    PyUnicodeObject *v;
4045    Py_UNICODE *p;
4046    Py_ssize_t extrachars = 0;
4047    PyObject *errorHandler = NULL;
4048    PyObject *exc = NULL;
4049    Py_UNICODE *mapstring = NULL;
4050    Py_ssize_t maplen = 0;
4051
4052    /* Default to Latin-1 */
4053    if (mapping == NULL)
4054	return PyUnicode_DecodeLatin1(s, size, errors);
4055
4056    v = _PyUnicode_New(size);
4057    if (v == NULL)
4058	goto onError;
4059    if (size == 0)
4060	return (PyObject *)v;
4061    p = PyUnicode_AS_UNICODE(v);
4062    e = s + size;
4063    if (PyUnicode_CheckExact(mapping)) {
4064	mapstring = PyUnicode_AS_UNICODE(mapping);
4065	maplen = PyUnicode_GET_SIZE(mapping);
4066	while (s < e) {
4067	    unsigned char ch = *s;
4068	    Py_UNICODE x = 0xfffe; /* illegal value */
4069
4070	    if (ch < maplen)
4071		x = mapstring[ch];
4072
4073	    if (x == 0xfffe) {
4074		/* undefined mapping */
4075		outpos = p-PyUnicode_AS_UNICODE(v);
4076		startinpos = s-starts;
4077		endinpos = startinpos+1;
4078		if (unicode_decode_call_errorhandler(
4079		     errors, &errorHandler,
4080		     "charmap", "character maps to <undefined>",
4081		     &starts, &e, &startinpos, &endinpos, &exc, &s,
4082		     (PyObject **)&v, &outpos, &p)) {
4083		    goto onError;
4084		}
4085		continue;
4086	    }
4087	    *p++ = x;
4088	    ++s;
4089	}
4090    }
4091    else {
4092	while (s < e) {
4093	    unsigned char ch = *s;
4094	    PyObject *w, *x;
4095
4096	    /* Get mapping (char ordinal -> integer, Unicode char or None) */
4097	    w = PyLong_FromLong((long)ch);
4098	    if (w == NULL)
4099		goto onError;
4100	    x = PyObject_GetItem(mapping, w);
4101	    Py_DECREF(w);
4102	    if (x == NULL) {
4103		if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4104		    /* No mapping found means: mapping is undefined. */
4105		    PyErr_Clear();
4106		    x = Py_None;
4107		    Py_INCREF(x);
4108		} else
4109		    goto onError;
4110	    }
4111
4112	    /* Apply mapping */
4113	    if (PyLong_Check(x)) {
4114		long value = PyLong_AS_LONG(x);
4115		if (value < 0 || value > 65535) {
4116		    PyErr_SetString(PyExc_TypeError,
4117				    "character mapping must be in range(65536)");
4118		    Py_DECREF(x);
4119		    goto onError;
4120		}
4121		*p++ = (Py_UNICODE)value;
4122	    }
4123	    else if (x == Py_None) {
4124		/* undefined mapping */
4125		outpos = p-PyUnicode_AS_UNICODE(v);
4126		startinpos = s-starts;
4127		endinpos = startinpos+1;
4128		if (unicode_decode_call_errorhandler(
4129		     errors, &errorHandler,
4130		     "charmap", "character maps to <undefined>",
4131		     &starts, &e, &startinpos, &endinpos, &exc, &s,
4132		     (PyObject **)&v, &outpos, &p)) {
4133		    Py_DECREF(x);
4134		    goto onError;
4135		}
4136		Py_DECREF(x);
4137		continue;
4138	    }
4139	    else if (PyUnicode_Check(x)) {
4140		Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
4141
4142		if (targetsize == 1)
4143		    /* 1-1 mapping */
4144		    *p++ = *PyUnicode_AS_UNICODE(x);
4145
4146		else if (targetsize > 1) {
4147		    /* 1-n mapping */
4148		    if (targetsize > extrachars) {
4149			/* resize first */
4150			Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4151			Py_ssize_t needed = (targetsize - extrachars) + \
4152				     (targetsize << 2);
4153			extrachars += needed;
4154			/* XXX overflow detection missing */
4155			if (_PyUnicode_Resize(&v,
4156					     PyUnicode_GET_SIZE(v) + needed) < 0) {
4157			    Py_DECREF(x);
4158			    goto onError;
4159			}
4160			p = PyUnicode_AS_UNICODE(v) + oldpos;
4161		    }
4162		    Py_UNICODE_COPY(p,
4163				    PyUnicode_AS_UNICODE(x),
4164				    targetsize);
4165		    p += targetsize;
4166		    extrachars -= targetsize;
4167		}
4168		/* 1-0 mapping: skip the character */
4169	    }
4170	    else {
4171		/* wrong return value */
4172		PyErr_SetString(PyExc_TypeError,
4173		      "character mapping must return integer, None or unicode");
4174		Py_DECREF(x);
4175		goto onError;
4176	    }
4177	    Py_DECREF(x);
4178	    ++s;
4179	}
4180    }
4181    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
4182	if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4183	    goto onError;
4184    Py_XDECREF(errorHandler);
4185    Py_XDECREF(exc);
4186    return (PyObject *)v;
4187
4188 onError:
4189    Py_XDECREF(errorHandler);
4190    Py_XDECREF(exc);
4191    Py_XDECREF(v);
4192    return NULL;
4193}
4194
4195/* Charmap encoding: the lookup table */
4196
4197struct encoding_map{
4198  PyObject_HEAD
4199  unsigned char level1[32];
4200  int count2, count3;
4201  unsigned char level23[1];
4202};
4203
4204static PyObject*
4205encoding_map_size(PyObject *obj, PyObject* args)
4206{
4207    struct encoding_map *map = (struct encoding_map*)obj;
4208    return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4209                          128*map->count3);
4210}
4211
4212static PyMethodDef encoding_map_methods[] = {
4213	{"size", encoding_map_size, METH_NOARGS,
4214         PyDoc_STR("Return the size (in bytes) of this object") },
4215        { 0 }
4216};
4217
4218static void
4219encoding_map_dealloc(PyObject* o)
4220{
4221	PyObject_FREE(o);
4222}
4223
4224static PyTypeObject EncodingMapType = {
4225	PyVarObject_HEAD_INIT(NULL, 0)
4226        "EncodingMap",          /*tp_name*/
4227        sizeof(struct encoding_map),   /*tp_basicsize*/
4228        0,                      /*tp_itemsize*/
4229        /* methods */
4230        encoding_map_dealloc,   /*tp_dealloc*/
4231        0,                      /*tp_print*/
4232        0,                      /*tp_getattr*/
4233        0,                      /*tp_setattr*/
4234        0,                      /*tp_compare*/
4235        0,                      /*tp_repr*/
4236        0,                      /*tp_as_number*/
4237        0,                      /*tp_as_sequence*/
4238        0,                      /*tp_as_mapping*/
4239        0,                      /*tp_hash*/
4240        0,                      /*tp_call*/
4241        0,                      /*tp_str*/
4242        0,                      /*tp_getattro*/
4243        0,                      /*tp_setattro*/
4244        0,                      /*tp_as_buffer*/
4245        Py_TPFLAGS_DEFAULT,     /*tp_flags*/
4246        0,                      /*tp_doc*/
4247        0,                      /*tp_traverse*/
4248        0,                      /*tp_clear*/
4249        0,                      /*tp_richcompare*/
4250        0,                      /*tp_weaklistoffset*/
4251        0,                      /*tp_iter*/
4252        0,                      /*tp_iternext*/
4253        encoding_map_methods,   /*tp_methods*/
4254        0,                      /*tp_members*/
4255        0,                      /*tp_getset*/
4256        0,                      /*tp_base*/
4257        0,                      /*tp_dict*/
4258        0,                      /*tp_descr_get*/
4259        0,                      /*tp_descr_set*/
4260        0,                      /*tp_dictoffset*/
4261        0,                      /*tp_init*/
4262        0,                      /*tp_alloc*/
4263        0,                      /*tp_new*/
4264        0,                      /*tp_free*/
4265        0,                      /*tp_is_gc*/
4266};
4267
4268PyObject*
4269PyUnicode_BuildEncodingMap(PyObject* string)
4270{
4271    Py_UNICODE *decode;
4272    PyObject *result;
4273    struct encoding_map *mresult;
4274    int i;
4275    int need_dict = 0;
4276    unsigned char level1[32];
4277    unsigned char level2[512];
4278    unsigned char *mlevel1, *mlevel2, *mlevel3;
4279    int count2 = 0, count3 = 0;
4280
4281    if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4282        PyErr_BadArgument();
4283        return NULL;
4284    }
4285    decode = PyUnicode_AS_UNICODE(string);
4286    memset(level1, 0xFF, sizeof level1);
4287    memset(level2, 0xFF, sizeof level2);
4288
4289    /* If there isn't a one-to-one mapping of NULL to \0,
4290       or if there are non-BMP characters, we need to use
4291       a mapping dictionary. */
4292    if (decode[0] != 0)
4293        need_dict = 1;
4294    for (i = 1; i < 256; i++) {
4295        int l1, l2;
4296        if (decode[i] == 0
4297            #ifdef Py_UNICODE_WIDE
4298            || decode[i] > 0xFFFF
4299            #endif
4300        ) {
4301            need_dict = 1;
4302            break;
4303        }
4304        if (decode[i] == 0xFFFE)
4305            /* unmapped character */
4306            continue;
4307        l1 = decode[i] >> 11;
4308        l2 = decode[i] >> 7;
4309        if (level1[l1] == 0xFF)
4310            level1[l1] = count2++;
4311        if (level2[l2] == 0xFF)
4312            level2[l2] = count3++;
4313    }
4314
4315    if (count2 >= 0xFF || count3 >= 0xFF)
4316        need_dict = 1;
4317
4318    if (need_dict) {
4319        PyObject *result = PyDict_New();
4320        PyObject *key, *value;
4321        if (!result)
4322            return NULL;
4323        for (i = 0; i < 256; i++) {
4324            key = value = NULL;
4325            key = PyLong_FromLong(decode[i]);
4326            value = PyLong_FromLong(i);
4327            if (!key || !value)
4328                goto failed1;
4329            if (PyDict_SetItem(result, key, value) == -1)
4330                goto failed1;
4331            Py_DECREF(key);
4332            Py_DECREF(value);
4333        }
4334        return result;
4335      failed1:
4336        Py_XDECREF(key);
4337        Py_XDECREF(value);
4338        Py_DECREF(result);
4339        return NULL;
4340    }
4341
4342    /* Create a three-level trie */
4343    result = PyObject_MALLOC(sizeof(struct encoding_map) +
4344                             16*count2 + 128*count3 - 1);
4345    if (!result)
4346        return PyErr_NoMemory();
4347    PyObject_Init(result, &EncodingMapType);
4348    mresult = (struct encoding_map*)result;
4349    mresult->count2 = count2;
4350    mresult->count3 = count3;
4351    mlevel1 = mresult->level1;
4352    mlevel2 = mresult->level23;
4353    mlevel3 = mresult->level23 + 16*count2;
4354    memcpy(mlevel1, level1, 32);
4355    memset(mlevel2, 0xFF, 16*count2);
4356    memset(mlevel3, 0, 128*count3);
4357    count3 = 0;
4358    for (i = 1; i < 256; i++) {
4359        int o1, o2, o3, i2, i3;
4360        if (decode[i] == 0xFFFE)
4361            /* unmapped character */
4362            continue;
4363        o1 = decode[i]>>11;
4364        o2 = (decode[i]>>7) & 0xF;
4365        i2 = 16*mlevel1[o1] + o2;
4366        if (mlevel2[i2] == 0xFF)
4367            mlevel2[i2] = count3++;
4368        o3 = decode[i] & 0x7F;
4369        i3 = 128*mlevel2[i2] + o3;
4370        mlevel3[i3] = i;
4371    }
4372    return result;
4373}
4374
4375static int
4376encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4377{
4378    struct encoding_map *map = (struct encoding_map*)mapping;
4379    int l1 = c>>11;
4380    int l2 = (c>>7) & 0xF;
4381    int l3 = c & 0x7F;
4382    int i;
4383
4384#ifdef Py_UNICODE_WIDE
4385    if (c > 0xFFFF) {
4386	return -1;
4387    }
4388#endif
4389    if (c == 0)
4390        return 0;
4391    /* level 1*/
4392    i = map->level1[l1];
4393    if (i == 0xFF) {
4394        return -1;
4395    }
4396    /* level 2*/
4397    i = map->level23[16*i+l2];
4398    if (i == 0xFF) {
4399        return -1;
4400    }
4401    /* level 3 */
4402    i = map->level23[16*map->count2 + 128*i + l3];
4403    if (i == 0) {
4404        return -1;
4405    }
4406    return i;
4407}
4408
4409/* Lookup the character ch in the mapping. If the character
4410   can't be found, Py_None is returned (or NULL, if another
4411   error occurred). */
4412static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
4413{
4414    PyObject *w = PyLong_FromLong((long)c);
4415    PyObject *x;
4416
4417    if (w == NULL)
4418	 return NULL;
4419    x = PyObject_GetItem(mapping, w);
4420    Py_DECREF(w);
4421    if (x == NULL) {
4422	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4423	    /* No mapping found means: mapping is undefined. */
4424	    PyErr_Clear();
4425	    x = Py_None;
4426	    Py_INCREF(x);
4427	    return x;
4428	} else
4429	    return NULL;
4430    }
4431    else if (x == Py_None)
4432	return x;
4433    else if (PyLong_Check(x)) {
4434	long value = PyLong_AS_LONG(x);
4435	if (value < 0 || value > 255) {
4436	    PyErr_SetString(PyExc_TypeError,
4437			     "character mapping must be in range(256)");
4438	    Py_DECREF(x);
4439	    return NULL;
4440	}
4441	return x;
4442    }
4443    else if (PyBytes_Check(x))
4444	return x;
4445    else {
4446	/* wrong return value */
4447	PyErr_Format(PyExc_TypeError,
4448                "character mapping must return integer, bytes or None, not %.400s",
4449                x->ob_type->tp_name);
4450	Py_DECREF(x);
4451	return NULL;
4452    }
4453}
4454
4455static int
4456charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4457{
4458	Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
4459	/* exponentially overallocate to minimize reallocations */
4460	if (requiredsize < 2*outsize)
4461	    requiredsize = 2*outsize;
4462	if (_PyBytes_Resize(outobj, requiredsize))
4463	    return -1;
4464	return 0;
4465}
4466
/* Outcome of trying to emit one character in the charmap encoder. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred (lookup or resize failed) */
} charmapencode_result;
4470/* lookup the character, put the result in the output string and adjust
4471   various state variables. Resize the output bytes object if not enough
4472   space is available. Return a new reference to the object that
4473   was put in the output buffer, or Py_None, if the mapping was undefined
4474   (in which case no character was written) or NULL, if a
4475   reallocation error occurred. The caller must decref the result */
4476static
4477charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
4478    PyObject **outobj, Py_ssize_t *outpos)
4479{
4480    PyObject *rep;
4481    char *outstart;
4482    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
4483
4484    if (Py_TYPE(mapping) == &EncodingMapType) {
4485        int res = encoding_map_lookup(c, mapping);
4486	Py_ssize_t requiredsize = *outpos+1;
4487        if (res == -1)
4488            return enc_FAILED;
4489	if (outsize<requiredsize)
4490	    if (charmapencode_resize(outobj, outpos, requiredsize))
4491		return enc_EXCEPTION;
4492        outstart = PyBytes_AS_STRING(*outobj);
4493	outstart[(*outpos)++] = (char)res;
4494	return enc_SUCCESS;
4495    }
4496
4497    rep = charmapencode_lookup(c, mapping);
4498    if (rep==NULL)
4499	return enc_EXCEPTION;
4500    else if (rep==Py_None) {
4501	Py_DECREF(rep);
4502	return enc_FAILED;
4503    } else {
4504	if (PyLong_Check(rep)) {
4505	    Py_ssize_t requiredsize = *outpos+1;
4506	    if (outsize<requiredsize)
4507		if (charmapencode_resize(outobj, outpos, requiredsize)) {
4508		    Py_DECREF(rep);
4509		    return enc_EXCEPTION;
4510		}
4511            outstart = PyBytes_AS_STRING(*outobj);
4512	    outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
4513	}
4514	else {
4515	    const char *repchars = PyBytes_AS_STRING(rep);
4516	    Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
4517	    Py_ssize_t requiredsize = *outpos+repsize;
4518	    if (outsize<requiredsize)
4519		if (charmapencode_resize(outobj, outpos, requiredsize)) {
4520		    Py_DECREF(rep);
4521		    return enc_EXCEPTION;
4522		}
4523            outstart = PyBytes_AS_STRING(*outobj);
4524	    memcpy(outstart + *outpos, repchars, repsize);
4525	    *outpos += repsize;
4526	}
4527    }
4528    Py_DECREF(rep);
4529    return enc_SUCCESS;
4530}
4531
4532/* handle an error in PyUnicode_EncodeCharmap
4533   Return 0 on success, -1 on error */
4534static
4535int charmap_encoding_error(
4536    const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
4537    PyObject **exceptionObject,
4538    int *known_errorHandler, PyObject **errorHandler, const char *errors,
4539    PyObject **res, Py_ssize_t *respos)
4540{
4541    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4542    Py_ssize_t repsize;
4543    Py_ssize_t newpos;
4544    Py_UNICODE *uni2;
4545    /* startpos for collecting unencodable chars */
4546    Py_ssize_t collstartpos = *inpos;
4547    Py_ssize_t collendpos = *inpos+1;
4548    Py_ssize_t collpos;
4549    char *encoding = "charmap";
4550    char *reason = "character maps to <undefined>";
4551    charmapencode_result x;
4552
4553    /* find all unencodable characters */
4554    while (collendpos < size) {
4555        PyObject *rep;
4556        if (Py_TYPE(mapping) == &EncodingMapType) {
4557	    int res = encoding_map_lookup(p[collendpos], mapping);
4558	    if (res != -1)
4559		break;
4560	    ++collendpos;
4561	    continue;
4562	}
4563
4564	rep = charmapencode_lookup(p[collendpos], mapping);
4565	if (rep==NULL)
4566	    return -1;
4567	else if (rep!=Py_None) {
4568	    Py_DECREF(rep);
4569	    break;
4570	}
4571	Py_DECREF(rep);
4572	++collendpos;
4573    }
4574    /* cache callback name lookup
4575     * (if not done yet, i.e. it's the first error) */
4576    if (*known_errorHandler==-1) {
4577	if ((errors==NULL) || (!strcmp(errors, "strict")))
4578	    *known_errorHandler = 1;
4579	else if (!strcmp(errors, "replace"))
4580	    *known_errorHandler = 2;
4581	else if (!strcmp(errors, "ignore"))
4582	    *known_errorHandler = 3;
4583	else if (!strcmp(errors, "xmlcharrefreplace"))
4584	    *known_errorHandler = 4;
4585	else
4586	    *known_errorHandler = 0;
4587    }
4588    switch (*known_errorHandler) {
4589	case 1: /* strict */
4590	    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4591	    return -1;
4592	case 2: /* replace */
4593	    for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4594		x = charmapencode_output('?', mapping, res, respos);
4595		if (x==enc_EXCEPTION) {
4596		    return -1;
4597		}
4598		else if (x==enc_FAILED) {
4599		    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4600		    return -1;
4601		}
4602	    }
4603	    /* fall through */
4604	case 3: /* ignore */
4605	    *inpos = collendpos;
4606	    break;
4607	case 4: /* xmlcharrefreplace */
4608	    /* generate replacement (temporarily (mis)uses p) */
4609	    for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4610		char buffer[2+29+1+1];
4611		char *cp;
4612		sprintf(buffer, "&#%d;", (int)p[collpos]);
4613		for (cp = buffer; *cp; ++cp) {
4614		    x = charmapencode_output(*cp, mapping, res, respos);
4615		    if (x==enc_EXCEPTION)
4616			return -1;
4617		    else if (x==enc_FAILED) {
4618			raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4619			return -1;
4620		    }
4621		}
4622	    }
4623	    *inpos = collendpos;
4624	    break;
4625	default:
4626	    repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
4627		encoding, reason, p, size, exceptionObject,
4628		collstartpos, collendpos, &newpos);
4629	    if (repunicode == NULL)
4630		return -1;
4631	    /* generate replacement  */
4632	    repsize = PyUnicode_GET_SIZE(repunicode);
4633	    for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4634		x = charmapencode_output(*uni2, mapping, res, respos);
4635		if (x==enc_EXCEPTION) {
4636		    return -1;
4637		}
4638		else if (x==enc_FAILED) {
4639		    Py_DECREF(repunicode);
4640		    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4641		    return -1;
4642		}
4643	    }
4644	    *inpos = newpos;
4645	    Py_DECREF(repunicode);
4646    }
4647    return 0;
4648}
4649
4650PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
4651				  Py_ssize_t size,
4652				  PyObject *mapping,
4653				  const char *errors)
4654{
4655    /* output object */
4656    PyObject *res = NULL;
4657    /* current input position */
4658    Py_ssize_t inpos = 0;
4659    /* current output position */
4660    Py_ssize_t respos = 0;
4661    PyObject *errorHandler = NULL;
4662    PyObject *exc = NULL;
4663    /* the following variable is used for caching string comparisons
4664     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4665     * 3=ignore, 4=xmlcharrefreplace */
4666    int known_errorHandler = -1;
4667
4668    /* Default to Latin-1 */
4669    if (mapping == NULL)
4670	return PyUnicode_EncodeLatin1(p, size, errors);
4671
4672    /* allocate enough for a simple encoding without
4673       replacements, if we need more, we'll resize */
4674    res = PyBytes_FromStringAndSize(NULL, size);
4675    if (res == NULL)
4676        goto onError;
4677    if (size == 0)
4678	return res;
4679
4680    while (inpos<size) {
4681	/* try to encode it */
4682	charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4683	if (x==enc_EXCEPTION) /* error */
4684	    goto onError;
4685	if (x==enc_FAILED) { /* unencodable character */
4686	    if (charmap_encoding_error(p, size, &inpos, mapping,
4687		&exc,
4688		&known_errorHandler, &errorHandler, errors,
4689		&res, &respos)) {
4690		goto onError;
4691	    }
4692	}
4693	else
4694	    /* done with this character => adjust input position */
4695	    ++inpos;
4696    }
4697
4698    /* Resize if we allocated to much */
4699    if (respos<PyBytes_GET_SIZE(res))
4700	_PyBytes_Resize(&res, respos);
4701
4702    Py_XDECREF(exc);
4703    Py_XDECREF(errorHandler);
4704    return res;
4705
4706    onError:
4707    Py_XDECREF(res);
4708    Py_XDECREF(exc);
4709    Py_XDECREF(errorHandler);
4710    return NULL;
4711}
4712
4713PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4714				    PyObject *mapping)
4715{
4716    if (!PyUnicode_Check(unicode) || mapping == NULL) {
4717	PyErr_BadArgument();
4718	return NULL;
4719    }
4720    return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4721				   PyUnicode_GET_SIZE(unicode),
4722				   mapping,
4723				   NULL);
4724}
4725
4726/* create or adjust a UnicodeTranslateError */
4727static void make_translate_exception(PyObject **exceptionObject,
4728    const Py_UNICODE *unicode, Py_ssize_t size,
4729    Py_ssize_t startpos, Py_ssize_t endpos,
4730    const char *reason)
4731{
4732    if (*exceptionObject == NULL) {
4733    	*exceptionObject = PyUnicodeTranslateError_Create(
4734	    unicode, size, startpos, endpos, reason);
4735    }
4736    else {
4737	if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4738	    goto onError;
4739	if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4740	    goto onError;
4741	if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4742	    goto onError;
4743	return;
4744	onError:
4745	Py_DECREF(*exceptionObject);
4746	*exceptionObject = NULL;
4747    }
4748}
4749
4750/* raises a UnicodeTranslateError */
4751static void raise_translate_exception(PyObject **exceptionObject,
4752    const Py_UNICODE *unicode, Py_ssize_t size,
4753    Py_ssize_t startpos, Py_ssize_t endpos,
4754    const char *reason)
4755{
4756    make_translate_exception(exceptionObject,
4757	unicode, size, startpos, endpos, reason);
4758    if (*exceptionObject != NULL)
4759	PyCodec_StrictErrors(*exceptionObject);
4760}
4761
4762/* error handling callback helper:
4763   build arguments, call the callback and check the arguments,
4764   put the result into newpos and return the replacement string, which
4765   has to be freed by the caller */
4766static PyObject *unicode_translate_call_errorhandler(const char *errors,
4767    PyObject **errorHandler,
4768    const char *reason,
4769    const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4770    Py_ssize_t startpos, Py_ssize_t endpos,
4771    Py_ssize_t *newpos)
4772{
4773    static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
4774
4775    Py_ssize_t i_newpos;
4776    PyObject *restuple;
4777    PyObject *resunicode;
4778
4779    if (*errorHandler == NULL) {
4780	*errorHandler = PyCodec_LookupError(errors);
4781        if (*errorHandler == NULL)
4782	    return NULL;
4783    }
4784
4785    make_translate_exception(exceptionObject,
4786	unicode, size, startpos, endpos, reason);
4787    if (*exceptionObject == NULL)
4788	return NULL;
4789
4790    restuple = PyObject_CallFunctionObjArgs(
4791	*errorHandler, *exceptionObject, NULL);
4792    if (restuple == NULL)
4793	return NULL;
4794    if (!PyTuple_Check(restuple)) {
4795	PyErr_Format(PyExc_TypeError, &argparse[4]);
4796	Py_DECREF(restuple);
4797	return NULL;
4798    }
4799    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
4800	&resunicode, &i_newpos)) {
4801	Py_DECREF(restuple);
4802	return NULL;
4803    }
4804    if (i_newpos<0)
4805	*newpos = size+i_newpos;
4806    else
4807        *newpos = i_newpos;
4808    if (*newpos<0 || *newpos>size) {
4809	PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4810	Py_DECREF(restuple);
4811	return NULL;
4812    }
4813    Py_INCREF(resunicode);
4814    Py_DECREF(restuple);
4815    return resunicode;
4816}
4817
4818/* Lookup the character ch in the mapping and put the result in result,
4819   which must be decrefed by the caller.
4820   Return 0 on success, -1 on error */
4821static
4822int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4823{
4824    PyObject *w = PyLong_FromLong((long)c);
4825    PyObject *x;
4826
4827    if (w == NULL)
4828	 return -1;
4829    x = PyObject_GetItem(mapping, w);
4830    Py_DECREF(w);
4831    if (x == NULL) {
4832	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4833	    /* No mapping found means: use 1:1 mapping. */
4834	    PyErr_Clear();
4835	    *result = NULL;
4836	    return 0;
4837	} else
4838	    return -1;
4839    }
4840    else if (x == Py_None) {
4841	*result = x;
4842	return 0;
4843    }
4844    else if (PyLong_Check(x)) {
4845	long value = PyLong_AS_LONG(x);
4846	long max = PyUnicode_GetMax();
4847	if (value < 0 || value > max) {
4848	    PyErr_Format(PyExc_TypeError,
4849                         "character mapping must be in range(0x%x)", max+1);
4850	    Py_DECREF(x);
4851	    return -1;
4852	}
4853	*result = x;
4854	return 0;
4855    }
4856    else if (PyUnicode_Check(x)) {
4857	*result = x;
4858	return 0;
4859    }
4860    else {
4861	/* wrong return value */
4862	PyErr_SetString(PyExc_TypeError,
4863	      "character mapping must return integer, None or unicode");
4864	Py_DECREF(x);
4865	return -1;
4866    }
4867}
4868/* ensure that *outobj is at least requiredsize characters long,
4869if not reallocate and adjust various state variables.
4870Return 0 on success, -1 on error */
4871static
4872int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
4873    Py_ssize_t requiredsize)
4874{
4875    Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
4876    if (requiredsize > oldsize) {
4877	/* remember old output position */
4878	Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4879	/* exponentially overallocate to minimize reallocations */
4880	if (requiredsize < 2 * oldsize)
4881	    requiredsize = 2 * oldsize;
4882	if (_PyUnicode_Resize(outobj, requiredsize) < 0)
4883	    return -1;
4884	*outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
4885    }
4886    return 0;
4887}
4888/* lookup the character, put the result in the output string and adjust
4889   various state variables. Return a new reference to the object that
4890   was put in the output buffer in *result, or Py_None, if the mapping was
4891   undefined (in which case no character was written).
4892   The called must decref result.
4893   Return 0 on success, -1 on error. */
4894static
4895int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
4896    Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4897    PyObject **res)
4898{
4899    if (charmaptranslate_lookup(*curinp, mapping, res))
4900	return -1;
4901    if (*res==NULL) {
4902	/* not found => default to 1:1 mapping */
4903	*(*outp)++ = *curinp;
4904    }
4905    else if (*res==Py_None)
4906	;
4907    else if (PyLong_Check(*res)) {
4908	/* no overflow check, because we know that the space is enough */
4909	*(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
4910    }
4911    else if (PyUnicode_Check(*res)) {
4912	Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4913	if (repsize==1) {
4914	    /* no overflow check, because we know that the space is enough */
4915	    *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4916	}
4917	else if (repsize!=0) {
4918	    /* more than one character */
4919	    Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4920		(insize - (curinp-startinp)) +
4921		repsize - 1;
4922	    if (charmaptranslate_makespace(outobj, outp, requiredsize))
4923		return -1;
4924	    memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4925	    *outp += repsize;
4926	}
4927    }
4928    else
4929	return -1;
4930    return 0;
4931}
4932
4933PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
4934				     Py_ssize_t size,
4935				     PyObject *mapping,
4936				     const char *errors)
4937{
4938    /* output object */
4939    PyObject *res = NULL;
4940    /* pointers to the beginning and end+1 of input */
4941    const Py_UNICODE *startp = p;
4942    const Py_UNICODE *endp = p + size;
4943    /* pointer into the output */
4944    Py_UNICODE *str;
4945    /* current output position */
4946    Py_ssize_t respos = 0;
4947    char *reason = "character maps to <undefined>";
4948    PyObject *errorHandler = NULL;
4949    PyObject *exc = NULL;
4950    /* the following variable is used for caching string comparisons
4951     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4952     * 3=ignore, 4=xmlcharrefreplace */
4953    int known_errorHandler = -1;
4954
4955    if (mapping == NULL) {
4956	PyErr_BadArgument();
4957	return NULL;
4958    }
4959
4960    /* allocate enough for a simple 1:1 translation without
4961       replacements, if we need more, we'll resize */
4962    res = PyUnicode_FromUnicode(NULL, size);
4963    if (res == NULL)
4964	goto onError;
4965    if (size == 0)
4966	return res;
4967    str = PyUnicode_AS_UNICODE(res);
4968
4969    while (p<endp) {
4970	/* try to encode it */
4971	PyObject *x = NULL;
4972	if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
4973	    Py_XDECREF(x);
4974	    goto onError;
4975	}
4976	Py_XDECREF(x);
4977	if (x!=Py_None) /* it worked => adjust input pointer */
4978	    ++p;
4979	else { /* untranslatable character */
4980	    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4981	    Py_ssize_t repsize;
4982	    Py_ssize_t newpos;
4983	    Py_UNICODE *uni2;
4984	    /* startpos for collecting untranslatable chars */
4985	    const Py_UNICODE *collstart = p;
4986	    const Py_UNICODE *collend = p+1;
4987	    const Py_UNICODE *coll;
4988
4989	    /* find all untranslatable characters */
4990	    while (collend < endp) {
4991		if (charmaptranslate_lookup(*collend, mapping, &x))
4992		    goto onError;
4993		Py_XDECREF(x);
4994		if (x!=Py_None)
4995		    break;
4996		++collend;
4997	    }
4998	    /* cache callback name lookup
4999	     * (if not done yet, i.e. it's the first error) */
5000	    if (known_errorHandler==-1) {
5001		if ((errors==NULL) || (!strcmp(errors, "strict")))
5002		    known_errorHandler = 1;
5003		else if (!strcmp(errors, "replace"))
5004		    known_errorHandler = 2;
5005		else if (!strcmp(errors, "ignore"))
5006		    known_errorHandler = 3;
5007		else if (!strcmp(errors, "xmlcharrefreplace"))
5008		    known_errorHandler = 4;
5009		else
5010		    known_errorHandler = 0;
5011	    }
5012	    switch (known_errorHandler) {
5013		case 1: /* strict */
5014		    raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
5015		    goto onError;
5016		case 2: /* replace */
5017		    /* No need to check for space, this is a 1:1 replacement */
5018		    for (coll = collstart; coll<collend; ++coll)
5019			*str++ = '?';
5020		    /* fall through */
5021		case 3: /* ignore */
5022		    p = collend;
5023		    break;
5024		case 4: /* xmlcharrefreplace */
5025		    /* generate replacement (temporarily (mis)uses p) */
5026		    for (p = collstart; p < collend; ++p) {
5027			char buffer[2+29+1+1];
5028			char *cp;
5029			sprintf(buffer, "&#%d;", (int)*p);
5030			if (charmaptranslate_makespace(&res, &str,
5031			    (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5032			    goto onError;
5033			for (cp = buffer; *cp; ++cp)
5034			    *str++ = *cp;
5035		    }
5036		    p = collend;
5037		    break;
5038		default:
5039		    repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5040			reason, startp, size, &exc,
5041			collstart-startp, collend-startp, &newpos);
5042		    if (repunicode == NULL)
5043			goto onError;
5044		    /* generate replacement  */
5045		    repsize = PyUnicode_GET_SIZE(repunicode);
5046		    if (charmaptranslate_makespace(&res, &str,
5047			(str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5048			Py_DECREF(repunicode);
5049			goto onError;
5050		    }
5051		    for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5052			*str++ = *uni2;
5053		    p = startp + newpos;
5054		    Py_DECREF(repunicode);
5055	    }
5056	}
5057    }
5058    /* Resize if we allocated to much */
5059    respos = str-PyUnicode_AS_UNICODE(res);
5060    if (respos<PyUnicode_GET_SIZE(res)) {
5061	if (_PyUnicode_Resize(&res, respos) < 0)
5062	    goto onError;
5063    }
5064    Py_XDECREF(exc);
5065    Py_XDECREF(errorHandler);
5066    return res;
5067
5068    onError:
5069    Py_XDECREF(res);
5070    Py_XDECREF(exc);
5071    Py_XDECREF(errorHandler);
5072    return NULL;
5073}
5074
5075PyObject *PyUnicode_Translate(PyObject *str,
5076			      PyObject *mapping,
5077			      const char *errors)
5078{
5079    PyObject *result;
5080
5081    str = PyUnicode_FromObject(str);
5082    if (str == NULL)
5083	goto onError;
5084    result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5085					PyUnicode_GET_SIZE(str),
5086					mapping,
5087					errors);
5088    Py_DECREF(str);
5089    return result;
5090
5091 onError:
5092    Py_XDECREF(str);
5093    return NULL;
5094}
5095
5096/* --- Decimal Encoder ---------------------------------------------------- */
5097
5098int PyUnicode_EncodeDecimal(Py_UNICODE *s,
5099			    Py_ssize_t length,
5100			    char *output,
5101			    const char *errors)
5102{
5103    Py_UNICODE *p, *end;
5104    PyObject *errorHandler = NULL;
5105    PyObject *exc = NULL;
5106    const char *encoding = "decimal";
5107    const char *reason = "invalid decimal Unicode string";
5108    /* the following variable is used for caching string comparisons
5109     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5110    int known_errorHandler = -1;
5111
5112    if (output == NULL) {
5113	PyErr_BadArgument();
5114	return -1;
5115    }
5116
5117    p = s;
5118    end = s + length;
5119    while (p < end) {
5120	register Py_UNICODE ch = *p;
5121	int decimal;
5122	PyObject *repunicode;
5123	Py_ssize_t repsize;
5124	Py_ssize_t newpos;
5125	Py_UNICODE *uni2;
5126	Py_UNICODE *collstart;
5127	Py_UNICODE *collend;
5128
5129	if (Py_UNICODE_ISSPACE(ch)) {
5130	    *output++ = ' ';
5131	    ++p;
5132	    continue;
5133	}
5134	decimal = Py_UNICODE_TODECIMAL(ch);
5135	if (decimal >= 0) {
5136	    *output++ = '0' + decimal;
5137	    ++p;
5138	    continue;
5139	}
5140	if (0 < ch && ch < 256) {
5141	    *output++ = (char)ch;
5142	    ++p;
5143	    continue;
5144	}
5145	/* All other characters are considered unencodable */
5146	collstart = p;
5147	collend = p+1;
5148	while (collend < end) {
5149	    if ((0 < *collend && *collend < 256) ||
5150	        !Py_UNICODE_ISSPACE(*collend) ||
5151	        Py_UNICODE_TODECIMAL(*collend))
5152		break;
5153	}
5154	/* cache callback name lookup
5155	 * (if not done yet, i.e. it's the first error) */
5156	if (known_errorHandler==-1) {
5157	    if ((errors==NULL) || (!strcmp(errors, "strict")))
5158		known_errorHandler = 1;
5159	    else if (!strcmp(errors, "replace"))
5160		known_errorHandler = 2;
5161	    else if (!strcmp(errors, "ignore"))
5162		known_errorHandler = 3;
5163	    else if (!strcmp(errors, "xmlcharrefreplace"))
5164		known_errorHandler = 4;
5165	    else
5166		known_errorHandler = 0;
5167	}
5168	switch (known_errorHandler) {
5169	    case 1: /* strict */
5170		raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5171		goto onError;
5172	    case 2: /* replace */
5173		for (p = collstart; p < collend; ++p)
5174		    *output++ = '?';
5175		/* fall through */
5176	    case 3: /* ignore */
5177		p = collend;
5178		break;
5179	    case 4: /* xmlcharrefreplace */
5180		/* generate replacement (temporarily (mis)uses p) */
5181		for (p = collstart; p < collend; ++p)
5182		    output += sprintf(output, "&#%d;", (int)*p);
5183		p = collend;
5184		break;
5185	    default:
5186		repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5187		    encoding, reason, s, length, &exc,
5188		    collstart-s, collend-s, &newpos);
5189		if (repunicode == NULL)
5190		    goto onError;
5191		/* generate replacement  */
5192		repsize = PyUnicode_GET_SIZE(repunicode);
5193		for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5194		    Py_UNICODE ch = *uni2;
5195		    if (Py_UNICODE_ISSPACE(ch))
5196			*output++ = ' ';
5197		    else {
5198			decimal = Py_UNICODE_TODECIMAL(ch);
5199			if (decimal >= 0)
5200			    *output++ = '0' + decimal;
5201			else if (0 < ch && ch < 256)
5202			    *output++ = (char)ch;
5203			else {
5204			    Py_DECREF(repunicode);
5205			    raise_encode_exception(&exc, encoding,
5206				s, length, collstart-s, collend-s, reason);
5207			    goto onError;
5208			}
5209		    }
5210		}
5211		p = s + newpos;
5212		Py_DECREF(repunicode);
5213	}
5214    }
5215    /* 0-terminate the output string */
5216    *output++ = '\0';
5217    Py_XDECREF(exc);
5218    Py_XDECREF(errorHandler);
5219    return 0;
5220
5221 onError:
5222    Py_XDECREF(exc);
5223    Py_XDECREF(errorHandler);
5224    return -1;
5225}
5226
5227/* --- Helpers ------------------------------------------------------------ */
5228
5229#include "stringlib/unicodedefs.h"
5230#include "stringlib/fastsearch.h"
5231#include "stringlib/count.h"
5232/* Include _ParseTupleFinds from find.h */
5233#define FROM_UNICODE
5234#include "stringlib/find.h"
5235#include "stringlib/partition.h"
5236
5237#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
5238#include "stringlib/localeutil.h"
5239
/* helper macro to fixup start/end slice values
 *
 * Operates on the variables named `start` and `end` in the invoking
 * scope, using (obj)->length as the sequence length:
 *   - a negative start has the length added, then is clamped to 0;
 *   - an end greater than the length is clamped to the length;
 *   - a negative end has the length added, then is clamped to 0.
 * start is NOT clamped to the length, so start may end up greater than
 * end (or than the length) after the macro runs.
 *
 * The expansion is a bare sequence of `if` statements (not wrapped in
 * do { } while (0)), so it must be used as a statement of its own and
 * not as the unbraced body of an if/else/loop. */
#define FIX_START_END(obj)                      \
    if (start < 0)                              \
        start += (obj)->length;                 \
    if (start < 0)                              \
        start = 0;                              \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0)                                \
        end += (obj)->length;                   \
    if (end < 0)                                \
        end = 0;
5252
5253Py_ssize_t PyUnicode_Count(PyObject *str,
5254                           PyObject *substr,
5255                           Py_ssize_t start,
5256                           Py_ssize_t end)
5257{
5258    Py_ssize_t result;
5259    PyUnicodeObject* str_obj;
5260    PyUnicodeObject* sub_obj;
5261
5262    str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5263    if (!str_obj)
5264	return -1;
5265    sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5266    if (!sub_obj) {
5267	Py_DECREF(str_obj);
5268	return -1;
5269    }
5270
5271    FIX_START_END(str_obj);
5272
5273    result = stringlib_count(
5274        str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5275        );
5276
5277    Py_DECREF(sub_obj);
5278    Py_DECREF(str_obj);
5279
5280    return result;
5281}
5282
5283Py_ssize_t PyUnicode_Find(PyObject *str,
5284                          PyObject *sub,
5285                          Py_ssize_t start,
5286                          Py_ssize_t end,
5287                          int direction)
5288{
5289    Py_ssize_t result;
5290
5291    str = PyUnicode_FromObject(str);
5292    if (!str)
5293	return -2;
5294    sub = PyUnicode_FromObject(sub);
5295    if (!sub) {
5296	Py_DECREF(str);
5297	return -2;
5298    }
5299
5300    if (direction > 0)
5301        result = stringlib_find_slice(
5302            PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5303            PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5304            start, end
5305            );
5306    else
5307        result = stringlib_rfind_slice(
5308            PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5309            PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5310            start, end
5311            );
5312
5313    Py_DECREF(str);
5314    Py_DECREF(sub);
5315
5316    return result;
5317}
5318
5319static
5320int tailmatch(PyUnicodeObject *self,
5321	      PyUnicodeObject *substring,
5322	      Py_ssize_t start,
5323	      Py_ssize_t end,
5324	      int direction)
5325{
5326    if (substring->length == 0)
5327        return 1;
5328
5329    FIX_START_END(self);
5330
5331    end -= substring->length;
5332    if (end < start)
5333	return 0;
5334
5335    if (direction > 0) {
5336	if (Py_UNICODE_MATCH(self, end, substring))
5337	    return 1;
5338    } else {
5339        if (Py_UNICODE_MATCH(self, start, substring))
5340	    return 1;
5341    }
5342
5343    return 0;
5344}
5345
5346Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
5347			PyObject *substr,
5348			Py_ssize_t start,
5349			Py_ssize_t end,
5350			int direction)
5351{
5352    Py_ssize_t result;
5353
5354    str = PyUnicode_FromObject(str);
5355    if (str == NULL)
5356	return -1;
5357    substr = PyUnicode_FromObject(substr);
5358    if (substr == NULL) {
5359	Py_DECREF(str);
5360	return -1;
5361    }
5362
5363    result = tailmatch((PyUnicodeObject *)str,
5364		       (PyUnicodeObject *)substr,
5365		       start, end, direction);
5366    Py_DECREF(str);
5367    Py_DECREF(substr);
5368    return result;
5369}
5370
5371/* Apply fixfct filter to the Unicode object self and return a
5372   reference to the modified object */
5373
5374static
5375PyObject *fixup(PyUnicodeObject *self,
5376		int (*fixfct)(PyUnicodeObject *s))
5377{
5378
5379    PyUnicodeObject *u;
5380
5381    u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5382    if (u == NULL)
5383	return NULL;
5384
5385    Py_UNICODE_COPY(u->str, self->str, self->length);
5386
5387    if (!fixfct(u) && PyUnicode_CheckExact(self)) {
5388	/* fixfct should return TRUE if it modified the buffer. If
5389	   FALSE, return a reference to the original buffer instead
5390	   (to save space, not time) */
5391	Py_INCREF(self);
5392	Py_DECREF(u);
5393	return (PyObject*) self;
5394    }
5395    return (PyObject*) u;
5396}
5397
5398static
5399int fixupper(PyUnicodeObject *self)
5400{
5401    Py_ssize_t len = self->length;
5402    Py_UNICODE *s = self->str;
5403    int status = 0;
5404
5405    while (len-- > 0) {
5406	register Py_UNICODE ch;
5407
5408	ch = Py_UNICODE_TOUPPER(*s);
5409	if (ch != *s) {
5410            status = 1;
5411	    *s = ch;
5412	}
5413        s++;
5414    }
5415
5416    return status;
5417}
5418
5419static
5420int fixlower(PyUnicodeObject *self)
5421{
5422    Py_ssize_t len = self->length;
5423    Py_UNICODE *s = self->str;
5424    int status = 0;
5425
5426    while (len-- > 0) {
5427	register Py_UNICODE ch;
5428
5429	ch = Py_UNICODE_TOLOWER(*s);
5430	if (ch != *s) {
5431            status = 1;
5432	    *s = ch;
5433	}
5434        s++;
5435    }
5436
5437    return status;
5438}
5439
5440static
5441int fixswapcase(PyUnicodeObject *self)
5442{
5443    Py_ssize_t len = self->length;
5444    Py_UNICODE *s = self->str;
5445    int status = 0;
5446
5447    while (len-- > 0) {
5448        if (Py_UNICODE_ISUPPER(*s)) {
5449            *s = Py_UNICODE_TOLOWER(*s);
5450            status = 1;
5451        } else if (Py_UNICODE_ISLOWER(*s)) {
5452            *s = Py_UNICODE_TOUPPER(*s);
5453            status = 1;
5454        }
5455        s++;
5456    }
5457
5458    return status;
5459}
5460
5461static
5462int fixcapitalize(PyUnicodeObject *self)
5463{
5464    Py_ssize_t len = self->length;
5465    Py_UNICODE *s = self->str;
5466    int status = 0;
5467
5468    if (len == 0)
5469	return 0;
5470    if (Py_UNICODE_ISLOWER(*s)) {
5471	*s = Py_UNICODE_TOUPPER(*s);
5472	status = 1;
5473    }
5474    s++;
5475    while (--len > 0) {
5476        if (Py_UNICODE_ISUPPER(*s)) {
5477            *s = Py_UNICODE_TOLOWER(*s);
5478            status = 1;
5479        }
5480        s++;
5481    }
5482    return status;
5483}
5484
5485static
5486int fixtitle(PyUnicodeObject *self)
5487{
5488    register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5489    register Py_UNICODE *e;
5490    int previous_is_cased;
5491
5492    /* Shortcut for single character strings */
5493    if (PyUnicode_GET_SIZE(self) == 1) {
5494	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5495	if (*p != ch) {
5496	    *p = ch;
5497	    return 1;
5498	}
5499	else
5500	    return 0;
5501    }
5502
5503    e = p + PyUnicode_GET_SIZE(self);
5504    previous_is_cased = 0;
5505    for (; p < e; p++) {
5506	register const Py_UNICODE ch = *p;
5507
5508	if (previous_is_cased)
5509	    *p = Py_UNICODE_TOLOWER(ch);
5510	else
5511	    *p = Py_UNICODE_TOTITLE(ch);
5512
5513	if (Py_UNICODE_ISLOWER(ch) ||
5514	    Py_UNICODE_ISUPPER(ch) ||
5515	    Py_UNICODE_ISTITLE(ch))
5516	    previous_is_cased = 1;
5517	else
5518	    previous_is_cased = 0;
5519    }
5520    return 1;
5521}
5522
5523PyObject *
5524PyUnicode_Join(PyObject *separator, PyObject *seq)
5525{
5526    PyObject *internal_separator = NULL;
5527    const Py_UNICODE blank = ' ';
5528    const Py_UNICODE *sep = &blank;
5529    Py_ssize_t seplen = 1;
5530    PyUnicodeObject *res = NULL; /* the result */
5531    Py_ssize_t res_alloc = 100;  /* # allocated bytes for string in res */
5532    Py_ssize_t res_used;         /* # used bytes */
5533    Py_UNICODE *res_p;       /* pointer to free byte in res's string area */
5534    PyObject *fseq;          /* PySequence_Fast(seq) */
5535    Py_ssize_t seqlen;              /* len(fseq) -- number of items in sequence */
5536    PyObject *item;
5537    Py_ssize_t i;
5538
5539    fseq = PySequence_Fast(seq, "");
5540    if (fseq == NULL) {
5541    	return NULL;
5542    }
5543
5544    /* Grrrr.  A codec may be invoked to convert str objects to
5545     * Unicode, and so it's possible to call back into Python code
5546     * during PyUnicode_FromObject(), and so it's possible for a sick
5547     * codec to change the size of fseq (if seq is a list).  Therefore
5548     * we have to keep refetching the size -- can't assume seqlen
5549     * is invariant.
5550     */
5551    seqlen = PySequence_Fast_GET_SIZE(fseq);
5552    /* If empty sequence, return u"". */
5553    if (seqlen == 0) {
5554    	res = _PyUnicode_New(0);  /* empty sequence; return u"" */
5555    	goto Done;
5556    }
5557    /* If singleton sequence with an exact Unicode, return that. */
5558    if (seqlen == 1) {
5559	item = PySequence_Fast_GET_ITEM(fseq, 0);
5560	if (PyUnicode_CheckExact(item)) {
5561	    Py_INCREF(item);
5562	    res = (PyUnicodeObject *)item;
5563	    goto Done;
5564	}
5565    }
5566
5567    /* At least two items to join, or one that isn't exact Unicode. */
5568    if (seqlen > 1) {
5569        /* Set up sep and seplen -- they're needed. */
5570    	if (separator == NULL) {
5571	    sep = &blank;
5572	    seplen = 1;
5573        }
5574    	else {
5575	    internal_separator = PyUnicode_FromObject(separator);
5576	    if (internal_separator == NULL)
5577	        goto onError;
5578	    sep = PyUnicode_AS_UNICODE(internal_separator);
5579	    seplen = PyUnicode_GET_SIZE(internal_separator);
5580	    /* In case PyUnicode_FromObject() mutated seq. */
5581	    seqlen = PySequence_Fast_GET_SIZE(fseq);
5582        }
5583    }
5584
5585    /* Get space. */
5586    res = _PyUnicode_New(res_alloc);
5587    if (res == NULL)
5588        goto onError;
5589    res_p = PyUnicode_AS_UNICODE(res);
5590    res_used = 0;
5591
5592    for (i = 0; i < seqlen; ++i) {
5593	Py_ssize_t itemlen;
5594	Py_ssize_t new_res_used;
5595
5596	item = PySequence_Fast_GET_ITEM(fseq, i);
5597	/* Convert item to Unicode. */
5598	if (!PyUnicode_Check(item)) {
5599	    PyErr_Format(PyExc_TypeError,
5600			 "sequence item %zd: expected str instance,"
5601			 " %.80s found",
5602			 i, Py_TYPE(item)->tp_name);
5603	    goto onError;
5604	}
5605	item = PyUnicode_FromObject(item);
5606	if (item == NULL)
5607	    goto onError;
5608	/* We own a reference to item from here on. */
5609
5610	/* In case PyUnicode_FromObject() mutated seq. */
5611	seqlen = PySequence_Fast_GET_SIZE(fseq);
5612
5613        /* Make sure we have enough space for the separator and the item. */
5614	itemlen = PyUnicode_GET_SIZE(item);
5615	new_res_used = res_used + itemlen;
5616	if (new_res_used < 0)
5617	    goto Overflow;
5618	if (i < seqlen - 1) {
5619	    new_res_used += seplen;
5620	    if (new_res_used < 0)
5621		goto Overflow;
5622	}
5623	if (new_res_used > res_alloc) {
5624	    /* double allocated size until it's big enough */
5625	    do {
5626	        res_alloc += res_alloc;
5627	        if (res_alloc <= 0)
5628	            goto Overflow;
5629	    } while (new_res_used > res_alloc);
5630	    if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5631		Py_DECREF(item);
5632		goto onError;
5633	    }
5634            res_p = PyUnicode_AS_UNICODE(res) + res_used;
5635	}
5636
5637	/* Copy item, and maybe the separator. */
5638	Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5639	res_p += itemlen;
5640	if (i < seqlen - 1) {
5641	    Py_UNICODE_COPY(res_p, sep, seplen);
5642	    res_p += seplen;
5643	}
5644	Py_DECREF(item);
5645	res_used = new_res_used;
5646    }
5647
5648    /* Shrink res to match the used area; this probably can't fail,
5649     * but it's cheap to check.
5650     */
5651    if (_PyUnicode_Resize(&res, res_used) < 0)
5652	goto onError;
5653
5654 Done:
5655    Py_XDECREF(internal_separator);
5656    Py_DECREF(fseq);
5657    return (PyObject *)res;
5658
5659 Overflow:
5660    PyErr_SetString(PyExc_OverflowError,
5661                    "join() result is too long for a Python string");
5662    Py_DECREF(item);
5663    /* fall through */
5664
5665 onError:
5666    Py_XDECREF(internal_separator);
5667    Py_DECREF(fseq);
5668    Py_XDECREF(res);
5669    return NULL;
5670}
5671
5672static
5673PyUnicodeObject *pad(PyUnicodeObject *self,
5674		     Py_ssize_t left,
5675		     Py_ssize_t right,
5676		     Py_UNICODE fill)
5677{
5678    PyUnicodeObject *u;
5679
5680    if (left < 0)
5681        left = 0;
5682    if (right < 0)
5683        right = 0;
5684
5685    if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
5686        Py_INCREF(self);
5687        return self;
5688    }
5689
5690    u = _PyUnicode_New(left + self->length + right);
5691    if (u) {
5692        if (left)
5693            Py_UNICODE_FILL(u->str, fill, left);
5694        Py_UNICODE_COPY(u->str + left, self->str, self->length);
5695        if (right)
5696            Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5697    }
5698
5699    return u;
5700}
5701
/* Helper for the split functions below: build a new unicode object from
   data[left:right] and append it to `list`.

   The macro relies on names from the invoking scope: a PyObject pointer
   variable `str` (used as scratch storage), the target `list`, and an
   `onError` label that is jumped to when either the object creation or
   the append fails.  The temporary reference held in `str` is released
   on both the failure and the success path.

   The expansion is several statements ending in an if/else, so it has to
   be used as a statement of its own (inside braces when it forms the
   body of a conditional). */
#define SPLIT_APPEND(data, left, right)					\
	str = PyUnicode_FromUnicode((data) + (left), (right) - (left));	\
	if (!str)							\
	    goto onError;						\
	if (PyList_Append(list, str)) {					\
	    Py_DECREF(str);						\
	    goto onError;						\
	}								\
        else								\
            Py_DECREF(str);
5712
5713static
5714PyObject *split_whitespace(PyUnicodeObject *self,
5715			   PyObject *list,
5716			   Py_ssize_t maxcount)
5717{
5718    register Py_ssize_t i;
5719    register Py_ssize_t j;
5720    Py_ssize_t len = self->length;
5721    PyObject *str;
5722    register const Py_UNICODE *buf = self->str;
5723
5724    for (i = j = 0; i < len; ) {
5725	/* find a token */
5726	while (i < len && Py_UNICODE_ISSPACE(buf[i]))
5727	    i++;
5728	j = i;
5729	while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
5730	    i++;
5731	if (j < i) {
5732	    if (maxcount-- <= 0)
5733		break;
5734	    SPLIT_APPEND(buf, j, i);
5735	    while (i < len && Py_UNICODE_ISSPACE(buf[i]))
5736		i++;
5737	    j = i;
5738	}
5739    }
5740    if (j < len) {
5741	SPLIT_APPEND(buf, j, len);
5742    }
5743    return list;
5744
5745 onError:
5746    Py_DECREF(list);
5747    return NULL;
5748}
5749
5750PyObject *PyUnicode_Splitlines(PyObject *string,
5751			       int keepends)
5752{
5753    register Py_ssize_t i;
5754    register Py_ssize_t j;
5755    Py_ssize_t len;
5756    PyObject *list;
5757    PyObject *str;
5758    Py_UNICODE *data;
5759
5760    string = PyUnicode_FromObject(string);
5761    if (string == NULL)
5762	return NULL;
5763    data = PyUnicode_AS_UNICODE(string);
5764    len = PyUnicode_GET_SIZE(string);
5765
5766    list = PyList_New(0);
5767    if (!list)
5768        goto onError;
5769
5770    for (i = j = 0; i < len; ) {
5771	Py_ssize_t eol;
5772
5773	/* Find a line and append it */
5774	while (i < len && !BLOOM_LINEBREAK(data[i]))
5775	    i++;
5776
5777	/* Skip the line break reading CRLF as one line break */
5778	eol = i;
5779	if (i < len) {
5780	    if (data[i] == '\r' && i + 1 < len &&
5781		data[i+1] == '\n')
5782		i += 2;
5783	    else
5784		i++;
5785	    if (keepends)
5786		eol = i;
5787	}
5788	SPLIT_APPEND(data, j, eol);
5789	j = i;
5790    }
5791    if (j < len) {
5792	SPLIT_APPEND(data, j, len);
5793    }
5794
5795    Py_DECREF(string);
5796    return list;
5797
5798 onError:
5799    Py_XDECREF(list);
5800    Py_DECREF(string);
5801    return NULL;
5802}
5803
5804static
5805PyObject *split_char(PyUnicodeObject *self,
5806		     PyObject *list,
5807		     Py_UNICODE ch,
5808		     Py_ssize_t maxcount)
5809{
5810    register Py_ssize_t i;
5811    register Py_ssize_t j;
5812    Py_ssize_t len = self->length;
5813    PyObject *str;
5814    register const Py_UNICODE *buf = self->str;
5815
5816    for (i = j = 0; i < len; ) {
5817	if (buf[i] == ch) {
5818	    if (maxcount-- <= 0)
5819		break;
5820	    SPLIT_APPEND(buf, j, i);
5821	    i = j = i + 1;
5822	} else
5823	    i++;
5824    }
5825    if (j <= len) {
5826	SPLIT_APPEND(buf, j, len);
5827    }
5828    return list;
5829
5830 onError:
5831    Py_DECREF(list);
5832    return NULL;
5833}
5834
5835static
5836PyObject *split_substring(PyUnicodeObject *self,
5837			  PyObject *list,
5838			  PyUnicodeObject *substring,
5839			  Py_ssize_t maxcount)
5840{
5841    register Py_ssize_t i;
5842    register Py_ssize_t j;
5843    Py_ssize_t len = self->length;
5844    Py_ssize_t sublen = substring->length;
5845    PyObject *str;
5846
5847    for (i = j = 0; i <= len - sublen; ) {
5848	if (Py_UNICODE_MATCH(self, i, substring)) {
5849	    if (maxcount-- <= 0)
5850		break;
5851	    SPLIT_APPEND(self->str, j, i);
5852	    i = j = i + sublen;
5853	} else
5854	    i++;
5855    }
5856    if (j <= len) {
5857	SPLIT_APPEND(self->str, j, len);
5858    }
5859    return list;
5860
5861 onError:
5862    Py_DECREF(list);
5863    return NULL;
5864}
5865
5866static
5867PyObject *rsplit_whitespace(PyUnicodeObject *self,
5868			    PyObject *list,
5869			    Py_ssize_t maxcount)
5870{
5871    register Py_ssize_t i;
5872    register Py_ssize_t j;
5873    Py_ssize_t len = self->length;
5874    PyObject *str;
5875    register const Py_UNICODE *buf = self->str;
5876
5877    for (i = j = len - 1; i >= 0; ) {
5878	/* find a token */
5879	while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
5880	    i--;
5881	j = i;
5882	while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
5883	    i--;
5884	if (j > i) {
5885	    if (maxcount-- <= 0)
5886		break;
5887	    SPLIT_APPEND(buf, i + 1, j + 1);
5888	    while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
5889		i--;
5890	    j = i;
5891	}
5892    }
5893    if (j >= 0) {
5894	SPLIT_APPEND(buf, 0, j + 1);
5895    }
5896    if (PyList_Reverse(list) < 0)
5897        goto onError;
5898    return list;
5899
5900 onError:
5901    Py_DECREF(list);
5902    return NULL;
5903}
5904
5905static
5906PyObject *rsplit_char(PyUnicodeObject *self,
5907		      PyObject *list,
5908		      Py_UNICODE ch,
5909		      Py_ssize_t maxcount)
5910{
5911    register Py_ssize_t i;
5912    register Py_ssize_t j;
5913    Py_ssize_t len = self->length;
5914    PyObject *str;
5915    register const Py_UNICODE *buf = self->str;
5916
5917    for (i = j = len - 1; i >= 0; ) {
5918	if (buf[i] == ch) {
5919	    if (maxcount-- <= 0)
5920		break;
5921	    SPLIT_APPEND(buf, i + 1, j + 1);
5922	    j = i = i - 1;
5923	} else
5924	    i--;
5925    }
5926    if (j >= -1) {
5927	SPLIT_APPEND(buf, 0, j + 1);
5928    }
5929    if (PyList_Reverse(list) < 0)
5930        goto onError;
5931    return list;
5932
5933 onError:
5934    Py_DECREF(list);
5935    return NULL;
5936}
5937
5938static
5939PyObject *rsplit_substring(PyUnicodeObject *self,
5940			   PyObject *list,
5941			   PyUnicodeObject *substring,
5942			   Py_ssize_t maxcount)
5943{
5944    register Py_ssize_t i;
5945    register Py_ssize_t j;
5946    Py_ssize_t len = self->length;
5947    Py_ssize_t sublen = substring->length;
5948    PyObject *str;
5949
5950    for (i = len - sublen, j = len; i >= 0; ) {
5951	if (Py_UNICODE_MATCH(self, i, substring)) {
5952	    if (maxcount-- <= 0)
5953		break;
5954	    SPLIT_APPEND(self->str, i + sublen, j);
5955	    j = i;
5956	    i -= sublen;
5957	} else
5958	    i--;
5959    }
5960    if (j >= 0) {
5961	SPLIT_APPEND(self->str, 0, j);
5962    }
5963    if (PyList_Reverse(list) < 0)
5964        goto onError;
5965    return list;
5966
5967 onError:
5968    Py_DECREF(list);
5969    return NULL;
5970}
5971
5972#undef SPLIT_APPEND
5973
5974static
5975PyObject *split(PyUnicodeObject *self,
5976		PyUnicodeObject *substring,
5977		Py_ssize_t maxcount)
5978{
5979    PyObject *list;
5980
5981    if (maxcount < 0)
5982        maxcount = PY_SSIZE_T_MAX;
5983
5984    list = PyList_New(0);
5985    if (!list)
5986        return NULL;
5987
5988    if (substring == NULL)
5989	return split_whitespace(self,list,maxcount);
5990
5991    else if (substring->length == 1)
5992	return split_char(self,list,substring->str[0],maxcount);
5993
5994    else if (substring->length == 0) {
5995	Py_DECREF(list);
5996	PyErr_SetString(PyExc_ValueError, "empty separator");
5997	return NULL;
5998    }
5999    else
6000	return split_substring(self,list,substring,maxcount);
6001}
6002
6003static
6004PyObject *rsplit(PyUnicodeObject *self,
6005		 PyUnicodeObject *substring,
6006		 Py_ssize_t maxcount)
6007{
6008    PyObject *list;
6009
6010    if (maxcount < 0)
6011        maxcount = PY_SSIZE_T_MAX;
6012
6013    list = PyList_New(0);
6014    if (!list)
6015        return NULL;
6016
6017    if (substring == NULL)
6018	return rsplit_whitespace(self,list,maxcount);
6019
6020    else if (substring->length == 1)
6021	return rsplit_char(self,list,substring->str[0],maxcount);
6022
6023    else if (substring->length == 0) {
6024	Py_DECREF(list);
6025	PyErr_SetString(PyExc_ValueError, "empty separator");
6026	return NULL;
6027    }
6028    else
6029	return rsplit_substring(self,list,substring,maxcount);
6030}
6031
6032static
6033PyObject *replace(PyUnicodeObject *self,
6034		  PyUnicodeObject *str1,
6035		  PyUnicodeObject *str2,
6036		  Py_ssize_t maxcount)
6037{
6038    PyUnicodeObject *u;
6039
6040    if (maxcount < 0)
6041	maxcount = PY_SSIZE_T_MAX;
6042
6043    if (str1->length == str2->length) {
6044        /* same length */
6045        Py_ssize_t i;
6046        if (str1->length == 1) {
6047            /* replace characters */
6048            Py_UNICODE u1, u2;
6049            if (!findchar(self->str, self->length, str1->str[0]))
6050                goto nothing;
6051            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6052            if (!u)
6053                return NULL;
6054            Py_UNICODE_COPY(u->str, self->str, self->length);
6055            u1 = str1->str[0];
6056            u2 = str2->str[0];
6057            for (i = 0; i < u->length; i++)
6058                if (u->str[i] == u1) {
6059                    if (--maxcount < 0)
6060                        break;
6061                    u->str[i] = u2;
6062                }
6063        } else {
6064            i = fastsearch(
6065                self->str, self->length, str1->str, str1->length, FAST_SEARCH
6066                );
6067            if (i < 0)
6068                goto nothing;
6069            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6070            if (!u)
6071                return NULL;
6072            Py_UNICODE_COPY(u->str, self->str, self->length);
6073            while (i <= self->length - str1->length)
6074                if (Py_UNICODE_MATCH(self, i, str1)) {
6075                    if (--maxcount < 0)
6076                        break;
6077                    Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6078                    i += str1->length;
6079                } else
6080                    i++;
6081        }
6082    } else {
6083
6084        Py_ssize_t n, i, j, e;
6085        Py_ssize_t product, new_size, delta;
6086        Py_UNICODE *p;
6087
6088        /* replace strings */
6089        n = stringlib_count(self->str, self->length, str1->str, str1->length);
6090        if (n > maxcount)
6091            n = maxcount;
6092        if (n == 0)
6093            goto nothing;
6094        /* new_size = self->length + n * (str2->length - str1->length)); */
6095        delta = (str2->length - str1->length);
6096        if (delta == 0) {
6097            new_size = self->length;
6098        } else {
6099            product = n * (str2->length - str1->length);
6100            if ((product / (str2->length - str1->length)) != n) {
6101                PyErr_SetString(PyExc_OverflowError,
6102                                "replace string is too long");
6103                return NULL;
6104            }
6105            new_size = self->length + product;
6106            if (new_size < 0) {
6107                PyErr_SetString(PyExc_OverflowError,
6108                                "replace string is too long");
6109                return NULL;
6110            }
6111        }
6112        u = _PyUnicode_New(new_size);
6113        if (!u)
6114            return NULL;
6115        i = 0;
6116        p = u->str;
6117        e = self->length - str1->length;
6118        if (str1->length > 0) {
6119            while (n-- > 0) {
6120                /* look for next match */
6121                j = i;
6122                while (j <= e) {
6123                    if (Py_UNICODE_MATCH(self, j, str1))
6124                        break;
6125                    j++;
6126                }
6127		if (j > i) {
6128                    if (j > e)
6129                        break;
6130                    /* copy unchanged part [i:j] */
6131                    Py_UNICODE_COPY(p, self->str+i, j-i);
6132                    p += j - i;
6133                }
6134                /* copy substitution string */
6135                if (str2->length > 0) {
6136                    Py_UNICODE_COPY(p, str2->str, str2->length);
6137                    p += str2->length;
6138                }
6139                i = j + str1->length;
6140            }
6141            if (i < self->length)
6142                /* copy tail [i:] */
6143                Py_UNICODE_COPY(p, self->str+i, self->length-i);
6144        } else {
6145            /* interleave */
6146            while (n > 0) {
6147                Py_UNICODE_COPY(p, str2->str, str2->length);
6148                p += str2->length;
6149                if (--n <= 0)
6150                    break;
6151                *p++ = self->str[i++];
6152            }
6153            Py_UNICODE_COPY(p, self->str+i, self->length-i);
6154        }
6155    }
6156    return (PyObject *) u;
6157
6158nothing:
6159    /* nothing to replace; return original string (when possible) */
6160    if (PyUnicode_CheckExact(self)) {
6161        Py_INCREF(self);
6162        return (PyObject *) self;
6163    }
6164    return PyUnicode_FromUnicode(self->str, self->length);
6165}
6166
6167/* --- Unicode Object Methods --------------------------------------------- */
6168
PyDoc_STRVAR(title__doc__,
"S.title() -> unicode\n\
\n\
Return a titlecased version of S, i.e. words start with title case\n\
characters, all remaining cased characters have lower case.");

/* Method implementation: applies the fixtitle filter to self through
   fixup() and returns fixup()'s result. */
static PyObject*
unicode_title(PyUnicodeObject *self)
{
    return fixup(self, fixtitle);
}
6180
PyDoc_STRVAR(capitalize__doc__,
"S.capitalize() -> unicode\n\
\n\
Return a capitalized version of S, i.e. make the first character\n\
have upper case.");

/* Method implementation: applies the fixcapitalize filter to self
   through fixup() and returns fixup()'s result. */
static PyObject*
unicode_capitalize(PyUnicodeObject *self)
{
    return fixup(self, fixcapitalize);
}
6192
#if 0
/* NOTE: this method is compiled out by the surrounding #if 0. */
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Splits self on whitespace, capitalizes every word via
   fixup(..., fixcapitalize) and joins the words again with
   PyUnicode_Join(NULL, list), i.e. with the default single-blank
   separator.  Returns NULL if splitting or capitalizing fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *list;
    PyObject *item;
    Py_ssize_t i;

    /* Split into words */
    list = split(self, NULL, -1);
    if (!list)
        return NULL;

    /* Capitalize each word */
    for (i = 0; i < PyList_GET_SIZE(list); i++) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
		     fixcapitalize);
        if (item == NULL)
            goto onError;
        /* drop the old word, then store the capitalized one in its slot */
        Py_DECREF(PyList_GET_ITEM(list, i));
        PyList_SET_ITEM(list, i, item);
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, list);

    /* reached on success too; item is NULL here when fixup() failed */
onError:
    Py_DECREF(list);
    return (PyObject *)item;
}
#endif
6230
6231/* Argument converter.  Coerces to a single unicode character */
6232
6233static int
6234convert_uc(PyObject *obj, void *addr)
6235{
6236	Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6237	PyObject *uniobj;
6238	Py_UNICODE *unistr;
6239
6240	uniobj = PyUnicode_FromObject(obj);
6241	if (uniobj == NULL) {
6242		PyErr_SetString(PyExc_TypeError,
6243			"The fill character cannot be converted to Unicode");
6244		return 0;
6245	}
6246	if (PyUnicode_GET_SIZE(uniobj) != 1) {
6247		PyErr_SetString(PyExc_TypeError,
6248			"The fill character must be exactly one character long");
6249		Py_DECREF(uniobj);
6250		return 0;
6251	}
6252	unistr = PyUnicode_AS_UNICODE(uniobj);
6253	*fillcharloc = unistr[0];
6254	Py_DECREF(uniobj);
6255	return 1;
6256}
6257
6258PyDoc_STRVAR(center__doc__,
6259"S.center(width[, fillchar]) -> unicode\n\
6260\n\
6261Return S centered in a Unicode string of length width. Padding is\n\
6262done using the specified fill character (default is a space)");
6263
6264static PyObject *
6265unicode_center(PyUnicodeObject *self, PyObject *args)
6266{
6267    Py_ssize_t marg, left;
6268    Py_ssize_t width;
6269    Py_UNICODE fillchar = ' ';
6270
6271    if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
6272        return NULL;
6273
6274    if (self->length >= width && PyUnicode_CheckExact(self)) {
6275        Py_INCREF(self);
6276        return (PyObject*) self;
6277    }
6278
6279    marg = width - self->length;
6280    left = marg / 2 + (marg & width & 1);
6281
6282    return (PyObject*) pad(self, left, marg - left, fillchar);
6283}
6284
6285#if 0
6286
6287/* This code should go into some future Unicode collation support
6288   module. The basic comparison should compare ordinals on a naive
6289   basis (this is what Java does and thus JPython too). */
6290
6291/* speedy UTF-16 code point order comparison */
6292/* gleaned from: */
6293/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6294
/* Per-range adjustment table, indexed by the top bits of a code unit
   (c >> 11).  Entry 27 (units 0xD800-0xDFFF) adds 0x2000; entries 28-31
   (units 0xE000-0xFFFF) subtract 0x800; all other entries are zero.
   NOTE(review): this block sits inside "#if 0" and is never compiled. */
static short utf16Fixup[32] =
{
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
};

/* Compare two unicode objects code unit by code unit after applying the
   utf16Fixup adjustment to each unit.  Returns -1, 0 or 1.
   NOTE(review): the table has 32 entries, so the c >> 11 index assumes
   16-bit code units -- confirm before ever re-enabling this code. */
static int
unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
{
    Py_ssize_t len1, len2;

    Py_UNICODE *s1 = str1->str;
    Py_UNICODE *s2 = str2->str;

    len1 = str1->length;
    len2 = str2->length;

    while (len1 > 0 && len2 > 0) {
        Py_UNICODE c1, c2;

        c1 = *s1++;
        c2 = *s2++;

        /* (1<<11) * 26 == 0xD000: only units above that value can hit a
           non-zero table entry */
	if (c1 > (1<<11) * 26)
	    c1 += utf16Fixup[c1>>11];
	if (c2 > (1<<11) * 26)
            c2 += utf16Fixup[c2>>11];
        /* now c1 and c2 are in UTF-32-compatible order */

        if (c1 != c2)
            return (c1 < c2) ? -1 : 1;

        len1--; len2--;
    }

    /* Common prefix is equal: the shorter string sorts first */
    return (len1 < len2) ? -1 : (len1 != len2);
}
6334
6335#else
6336
6337static int
6338unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6339{
6340    register Py_ssize_t len1, len2;
6341
6342    Py_UNICODE *s1 = str1->str;
6343    Py_UNICODE *s2 = str2->str;
6344
6345    len1 = str1->length;
6346    len2 = str2->length;
6347
6348    while (len1 > 0 && len2 > 0) {
6349        Py_UNICODE c1, c2;
6350
6351        c1 = *s1++;
6352        c2 = *s2++;
6353
6354        if (c1 != c2)
6355            return (c1 < c2) ? -1 : 1;
6356
6357        len1--; len2--;
6358    }
6359
6360    return (len1 < len2) ? -1 : (len1 != len2);
6361}
6362
6363#endif
6364
6365int PyUnicode_Compare(PyObject *left,
6366		      PyObject *right)
6367{
6368    if (PyUnicode_Check(left) && PyUnicode_Check(right))
6369        return unicode_compare((PyUnicodeObject *)left,
6370                               (PyUnicodeObject *)right);
6371    PyErr_Format(PyExc_TypeError,
6372                 "Can't compare %.100s and %.100s",
6373                 left->ob_type->tp_name,
6374                 right->ob_type->tp_name);
6375    return -1;
6376}
6377
6378int
6379PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6380{
6381    int i;
6382    Py_UNICODE *id;
6383    assert(PyUnicode_Check(uni));
6384    id = PyUnicode_AS_UNICODE(uni);
6385    /* Compare Unicode string and source character set string */
6386    for (i = 0; id[i] && str[i]; i++)
6387	if (id[i] != str[i])
6388	    return ((int)id[i] < (int)str[i]) ? -1 : 1;
6389    if (id[i])
6390	return 1; /* uni is longer */
6391    if (str[i])
6392	return -1; /* str is longer */
6393    return 0;
6394}
6395
6396PyObject *PyUnicode_RichCompare(PyObject *left,
6397                                PyObject *right,
6398                                int op)
6399{
6400    int result;
6401
6402    result = PyUnicode_Compare(left, right);
6403    if (result == -1 && PyErr_Occurred())
6404        goto onError;
6405
6406    /* Convert the return value to a Boolean */
6407    switch (op) {
6408    case Py_EQ:
6409        result = (result == 0);
6410        break;
6411    case Py_NE:
6412        result = (result != 0);
6413        break;
6414    case Py_LE:
6415        result = (result <= 0);
6416        break;
6417    case Py_GE:
6418        result = (result >= 0);
6419        break;
6420    case Py_LT:
6421        result = (result == -1);
6422        break;
6423    case Py_GT:
6424        result = (result == 1);
6425        break;
6426    }
6427    return PyBool_FromLong(result);
6428
6429 onError:
6430
6431    /* Standard case
6432
6433       Type errors mean that PyUnicode_FromObject() could not convert
6434       one of the arguments (usually the right hand side) to Unicode,
6435       ie. we can't handle the comparison request. However, it is
6436       possible that the other object knows a comparison method, which
6437       is why we return Py_NotImplemented to give the other object a
6438       chance.
6439
6440    */
6441    if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6442        PyErr_Clear();
6443        Py_INCREF(Py_NotImplemented);
6444        return Py_NotImplemented;
6445    }
6446    if (op != Py_EQ && op != Py_NE)
6447        return NULL;
6448
6449    /* Equality comparison.
6450
6451       This is a special case: we silence any PyExc_UnicodeDecodeError
6452       and instead turn it into a PyErr_UnicodeWarning.
6453
6454    */
6455    if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6456        return NULL;
6457    PyErr_Clear();
6458    if (PyErr_WarnEx(PyExc_UnicodeWarning,
6459                     (op == Py_EQ) ?
6460                     "Unicode equal comparison "
6461                     "failed to convert both arguments to Unicode - "
6462                     "interpreting them as being unequal"
6463                     :
6464                     "Unicode unequal comparison "
6465                     "failed to convert both arguments to Unicode - "
6466                     "interpreting them as being unequal",
6467                     1) < 0)
6468        return NULL;
6469    result = (op == Py_NE);
6470    return PyBool_FromLong(result);
6471}
6472
6473int PyUnicode_Contains(PyObject *container,
6474		       PyObject *element)
6475{
6476    PyObject *str, *sub;
6477    int result;
6478
6479    /* Coerce the two arguments */
6480    sub = PyUnicode_FromObject(element);
6481    if (!sub) {
6482	PyErr_Format(PyExc_TypeError,
6483	    "'in <string>' requires string as left operand, not %s",
6484	    element->ob_type->tp_name);
6485        return -1;
6486    }
6487
6488    str = PyUnicode_FromObject(container);
6489    if (!str) {
6490        Py_DECREF(sub);
6491        return -1;
6492    }
6493
6494    result = stringlib_contains_obj(str, sub);
6495
6496    Py_DECREF(str);
6497    Py_DECREF(sub);
6498
6499    return result;
6500}
6501
6502/* Concat to string or Unicode object giving a new Unicode object. */
6503
6504PyObject *PyUnicode_Concat(PyObject *left,
6505			   PyObject *right)
6506{
6507    PyUnicodeObject *u = NULL, *v = NULL, *w;
6508
6509    /* Coerce the two arguments */
6510    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6511    if (u == NULL)
6512	goto onError;
6513    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6514    if (v == NULL)
6515	goto onError;
6516
6517    /* Shortcuts */
6518    if (v == unicode_empty) {
6519	Py_DECREF(v);
6520	return (PyObject *)u;
6521    }
6522    if (u == unicode_empty) {
6523	Py_DECREF(u);
6524	return (PyObject *)v;
6525    }
6526
6527    /* Concat the two Unicode strings */
6528    w = _PyUnicode_New(u->length + v->length);
6529    if (w == NULL)
6530	goto onError;
6531    Py_UNICODE_COPY(w->str, u->str, u->length);
6532    Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6533
6534    Py_DECREF(u);
6535    Py_DECREF(v);
6536    return (PyObject *)w;
6537
6538onError:
6539    Py_XDECREF(u);
6540    Py_XDECREF(v);
6541    return NULL;
6542}
6543
6544void
6545PyUnicode_Append(PyObject **pleft, PyObject *right)
6546{
6547	PyObject *new;
6548	if (*pleft == NULL)
6549		return;
6550	if (right == NULL || !PyUnicode_Check(*pleft)) {
6551		Py_DECREF(*pleft);
6552		*pleft = NULL;
6553		return;
6554	}
6555	new = PyUnicode_Concat(*pleft, right);
6556	Py_DECREF(*pleft);
6557	*pleft = new;
6558}
6559
6560void
6561PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6562{
6563	PyUnicode_Append(pleft, right);
6564	Py_XDECREF(right);
6565}
6566
6567PyDoc_STRVAR(count__doc__,
6568"S.count(sub[, start[, end]]) -> int\n\
6569\n\
6570Return the number of non-overlapping occurrences of substring sub in\n\
6571Unicode string S[start:end].  Optional arguments start and end are\n\
6572interpreted as in slice notation.");
6573
6574static PyObject *
6575unicode_count(PyUnicodeObject *self, PyObject *args)
6576{
6577    PyUnicodeObject *substring;
6578    Py_ssize_t start = 0;
6579    Py_ssize_t end = PY_SSIZE_T_MAX;
6580    PyObject *result;
6581
6582    if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6583		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6584        return NULL;
6585
6586    substring = (PyUnicodeObject *)PyUnicode_FromObject(
6587        (PyObject *)substring);
6588    if (substring == NULL)
6589	return NULL;
6590
6591    FIX_START_END(self);
6592
6593    result = PyLong_FromSsize_t(
6594        stringlib_count(self->str + start, end - start,
6595                        substring->str, substring->length)
6596        );
6597
6598    Py_DECREF(substring);
6599
6600    return result;
6601}
6602
6603PyDoc_STRVAR(encode__doc__,
6604"S.encode([encoding[,errors]]) -> string or unicode\n\
6605\n\
6606Encodes S using the codec registered for encoding. encoding defaults\n\
6607to the default encoding. errors may be given to set a different error\n\
6608handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6609a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6610'xmlcharrefreplace' as well as any other name registered with\n\
6611codecs.register_error that can handle UnicodeEncodeErrors.");
6612
6613static PyObject *
6614unicode_encode(PyUnicodeObject *self, PyObject *args)
6615{
6616    char *encoding = NULL;
6617    char *errors = NULL;
6618    PyObject *v;
6619
6620    if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6621        return NULL;
6622    v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
6623    if (v == NULL)
6624        goto onError;
6625    if (!PyBytes_Check(v)) {
6626        PyErr_Format(PyExc_TypeError,
6627                     "encoder did not return a bytes object "
6628                     "(type=%.400s)",
6629                     Py_TYPE(v)->tp_name);
6630        Py_DECREF(v);
6631        return NULL;
6632    }
6633    return v;
6634
6635 onError:
6636    return NULL;
6637}
6638
6639PyDoc_STRVAR(expandtabs__doc__,
6640"S.expandtabs([tabsize]) -> unicode\n\
6641\n\
6642Return a copy of S where all tab characters are expanded using spaces.\n\
6643If tabsize is not given, a tab size of 8 characters is assumed.");
6644
6645static PyObject*
6646unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6647{
6648    Py_UNICODE *e;
6649    Py_UNICODE *p;
6650    Py_UNICODE *q;
6651    Py_UNICODE *qe;
6652    Py_ssize_t i, j, incr;
6653    PyUnicodeObject *u;
6654    int tabsize = 8;
6655
6656    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6657	return NULL;
6658
6659    /* First pass: determine size of output string */
6660    i = 0; /* chars up to and including most recent \n or \r */
6661    j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6662    e = self->str + self->length; /* end of input */
6663    for (p = self->str; p < e; p++)
6664        if (*p == '\t') {
6665	    if (tabsize > 0) {
6666		incr = tabsize - (j % tabsize); /* cannot overflow */
6667		if (j > PY_SSIZE_T_MAX - incr)
6668		    goto overflow1;
6669		j += incr;
6670            }
6671	}
6672        else {
6673	    if (j > PY_SSIZE_T_MAX - 1)
6674		goto overflow1;
6675            j++;
6676            if (*p == '\n' || *p == '\r') {
6677		if (i > PY_SSIZE_T_MAX - j)
6678		    goto overflow1;
6679                i += j;
6680                j = 0;
6681            }
6682        }
6683
6684    if (i > PY_SSIZE_T_MAX - j)
6685	goto overflow1;
6686
6687    /* Second pass: create output string and fill it */
6688    u = _PyUnicode_New(i + j);
6689    if (!u)
6690        return NULL;
6691
6692    j = 0; /* same as in first pass */
6693    q = u->str; /* next output char */
6694    qe = u->str + u->length; /* end of output */
6695
6696    for (p = self->str; p < e; p++)
6697        if (*p == '\t') {
6698	    if (tabsize > 0) {
6699		i = tabsize - (j % tabsize);
6700		j += i;
6701		while (i--) {
6702		    if (q >= qe)
6703			goto overflow2;
6704		    *q++ = ' ';
6705                }
6706	    }
6707	}
6708	else {
6709	    if (q >= qe)
6710		goto overflow2;
6711	    *q++ = *p;
6712            j++;
6713            if (*p == '\n' || *p == '\r')
6714                j = 0;
6715        }
6716
6717    return (PyObject*) u;
6718
6719  overflow2:
6720    Py_DECREF(u);
6721  overflow1:
6722    PyErr_SetString(PyExc_OverflowError, "new string is too long");
6723    return NULL;
6724}
6725
6726PyDoc_STRVAR(find__doc__,
6727"S.find(sub [,start [,end]]) -> int\n\
6728\n\
6729Return the lowest index in S where substring sub is found,\n\
6730such that sub is contained within s[start:end].  Optional\n\
6731arguments start and end are interpreted as in slice notation.\n\
6732\n\
6733Return -1 on failure.");
6734
6735static PyObject *
6736unicode_find(PyUnicodeObject *self, PyObject *args)
6737{
6738    PyObject *substring;
6739    Py_ssize_t start;
6740    Py_ssize_t end;
6741    Py_ssize_t result;
6742
6743    if (!_ParseTupleFinds(args, &substring, &start, &end))
6744        return NULL;
6745
6746    result = stringlib_find_slice(
6747        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6748        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6749        start, end
6750        );
6751
6752    Py_DECREF(substring);
6753
6754    return PyLong_FromSsize_t(result);
6755}
6756
6757static PyObject *
6758unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
6759{
6760    if (index < 0 || index >= self->length) {
6761        PyErr_SetString(PyExc_IndexError, "string index out of range");
6762        return NULL;
6763    }
6764
6765    return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6766}
6767
6768/* Believe it or not, this produces the same value for ASCII strings
6769   as string_hash(). */
6770static long
6771unicode_hash(PyUnicodeObject *self)
6772{
6773    Py_ssize_t len;
6774    Py_UNICODE *p;
6775    long x;
6776
6777    if (self->hash != -1)
6778        return self->hash;
6779    len = Py_SIZE(self);
6780    p = self->str;
6781    x = *p << 7;
6782    while (--len >= 0)
6783        x = (1000003*x) ^ *p++;
6784    x ^= Py_SIZE(self);
6785    if (x == -1)
6786        x = -2;
6787    self->hash = x;
6788    return x;
6789}
6790
6791PyDoc_STRVAR(index__doc__,
6792"S.index(sub [,start [,end]]) -> int\n\
6793\n\
6794Like S.find() but raise ValueError when the substring is not found.");
6795
6796static PyObject *
6797unicode_index(PyUnicodeObject *self, PyObject *args)
6798{
6799    Py_ssize_t result;
6800    PyObject *substring;
6801    Py_ssize_t start;
6802    Py_ssize_t end;
6803
6804    if (!_ParseTupleFinds(args, &substring, &start, &end))
6805        return NULL;
6806
6807    result = stringlib_find_slice(
6808        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6809        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6810        start, end
6811        );
6812
6813    Py_DECREF(substring);
6814
6815    if (result < 0) {
6816        PyErr_SetString(PyExc_ValueError, "substring not found");
6817        return NULL;
6818    }
6819
6820    return PyLong_FromSsize_t(result);
6821}
6822
6823PyDoc_STRVAR(islower__doc__,
6824"S.islower() -> bool\n\
6825\n\
6826Return True if all cased characters in S are lowercase and there is\n\
6827at least one cased character in S, False otherwise.");
6828
6829static PyObject*
6830unicode_islower(PyUnicodeObject *self)
6831{
6832    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6833    register const Py_UNICODE *e;
6834    int cased;
6835
6836    /* Shortcut for single character strings */
6837    if (PyUnicode_GET_SIZE(self) == 1)
6838	return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
6839
6840    /* Special case for empty strings */
6841    if (PyUnicode_GET_SIZE(self) == 0)
6842	return PyBool_FromLong(0);
6843
6844    e = p + PyUnicode_GET_SIZE(self);
6845    cased = 0;
6846    for (; p < e; p++) {
6847	register const Py_UNICODE ch = *p;
6848
6849	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6850	    return PyBool_FromLong(0);
6851	else if (!cased && Py_UNICODE_ISLOWER(ch))
6852	    cased = 1;
6853    }
6854    return PyBool_FromLong(cased);
6855}
6856
6857PyDoc_STRVAR(isupper__doc__,
6858"S.isupper() -> bool\n\
6859\n\
6860Return True if all cased characters in S are uppercase and there is\n\
6861at least one cased character in S, False otherwise.");
6862
6863static PyObject*
6864unicode_isupper(PyUnicodeObject *self)
6865{
6866    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6867    register const Py_UNICODE *e;
6868    int cased;
6869
6870    /* Shortcut for single character strings */
6871    if (PyUnicode_GET_SIZE(self) == 1)
6872	return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
6873
6874    /* Special case for empty strings */
6875    if (PyUnicode_GET_SIZE(self) == 0)
6876	return PyBool_FromLong(0);
6877
6878    e = p + PyUnicode_GET_SIZE(self);
6879    cased = 0;
6880    for (; p < e; p++) {
6881	register const Py_UNICODE ch = *p;
6882
6883	if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6884	    return PyBool_FromLong(0);
6885	else if (!cased && Py_UNICODE_ISUPPER(ch))
6886	    cased = 1;
6887    }
6888    return PyBool_FromLong(cased);
6889}
6890
6891PyDoc_STRVAR(istitle__doc__,
6892"S.istitle() -> bool\n\
6893\n\
6894Return True if S is a titlecased string and there is at least one\n\
6895character in S, i.e. upper- and titlecase characters may only\n\
6896follow uncased characters and lowercase characters only cased ones.\n\
6897Return False otherwise.");
6898
6899static PyObject*
6900unicode_istitle(PyUnicodeObject *self)
6901{
6902    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6903    register const Py_UNICODE *e;
6904    int cased, previous_is_cased;
6905
6906    /* Shortcut for single character strings */
6907    if (PyUnicode_GET_SIZE(self) == 1)
6908	return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6909			       (Py_UNICODE_ISUPPER(*p) != 0));
6910
6911    /* Special case for empty strings */
6912    if (PyUnicode_GET_SIZE(self) == 0)
6913	return PyBool_FromLong(0);
6914
6915    e = p + PyUnicode_GET_SIZE(self);
6916    cased = 0;
6917    previous_is_cased = 0;
6918    for (; p < e; p++) {
6919	register const Py_UNICODE ch = *p;
6920
6921	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6922	    if (previous_is_cased)
6923		return PyBool_FromLong(0);
6924	    previous_is_cased = 1;
6925	    cased = 1;
6926	}
6927	else if (Py_UNICODE_ISLOWER(ch)) {
6928	    if (!previous_is_cased)
6929		return PyBool_FromLong(0);
6930	    previous_is_cased = 1;
6931	    cased = 1;
6932	}
6933	else
6934	    previous_is_cased = 0;
6935    }
6936    return PyBool_FromLong(cased);
6937}
6938
6939PyDoc_STRVAR(isspace__doc__,
6940"S.isspace() -> bool\n\
6941\n\
6942Return True if all characters in S are whitespace\n\
6943and there is at least one character in S, False otherwise.");
6944
6945static PyObject*
6946unicode_isspace(PyUnicodeObject *self)
6947{
6948    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6949    register const Py_UNICODE *e;
6950
6951    /* Shortcut for single character strings */
6952    if (PyUnicode_GET_SIZE(self) == 1 &&
6953	Py_UNICODE_ISSPACE(*p))
6954	return PyBool_FromLong(1);
6955
6956    /* Special case for empty strings */
6957    if (PyUnicode_GET_SIZE(self) == 0)
6958	return PyBool_FromLong(0);
6959
6960    e = p + PyUnicode_GET_SIZE(self);
6961    for (; p < e; p++) {
6962	if (!Py_UNICODE_ISSPACE(*p))
6963	    return PyBool_FromLong(0);
6964    }
6965    return PyBool_FromLong(1);
6966}
6967
6968PyDoc_STRVAR(isalpha__doc__,
6969"S.isalpha() -> bool\n\
6970\n\
6971Return True if all characters in S are alphabetic\n\
6972and there is at least one character in S, False otherwise.");
6973
6974static PyObject*
6975unicode_isalpha(PyUnicodeObject *self)
6976{
6977    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6978    register const Py_UNICODE *e;
6979
6980    /* Shortcut for single character strings */
6981    if (PyUnicode_GET_SIZE(self) == 1 &&
6982	Py_UNICODE_ISALPHA(*p))
6983	return PyBool_FromLong(1);
6984
6985    /* Special case for empty strings */
6986    if (PyUnicode_GET_SIZE(self) == 0)
6987	return PyBool_FromLong(0);
6988
6989    e = p + PyUnicode_GET_SIZE(self);
6990    for (; p < e; p++) {
6991	if (!Py_UNICODE_ISALPHA(*p))
6992	    return PyBool_FromLong(0);
6993    }
6994    return PyBool_FromLong(1);
6995}
6996
6997PyDoc_STRVAR(isalnum__doc__,
6998"S.isalnum() -> bool\n\
6999\n\
7000Return True if all characters in S are alphanumeric\n\
7001and there is at least one character in S, False otherwise.");
7002
7003static PyObject*
7004unicode_isalnum(PyUnicodeObject *self)
7005{
7006    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7007    register const Py_UNICODE *e;
7008
7009    /* Shortcut for single character strings */
7010    if (PyUnicode_GET_SIZE(self) == 1 &&
7011	Py_UNICODE_ISALNUM(*p))
7012	return PyBool_FromLong(1);
7013
7014    /* Special case for empty strings */
7015    if (PyUnicode_GET_SIZE(self) == 0)
7016	return PyBool_FromLong(0);
7017
7018    e = p + PyUnicode_GET_SIZE(self);
7019    for (; p < e; p++) {
7020	if (!Py_UNICODE_ISALNUM(*p))
7021	    return PyBool_FromLong(0);
7022    }
7023    return PyBool_FromLong(1);
7024}
7025
7026PyDoc_STRVAR(isdecimal__doc__,
7027"S.isdecimal() -> bool\n\
7028\n\
7029Return True if there are only decimal characters in S,\n\
7030False otherwise.");
7031
7032static PyObject*
7033unicode_isdecimal(PyUnicodeObject *self)
7034{
7035    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7036    register const Py_UNICODE *e;
7037
7038    /* Shortcut for single character strings */
7039    if (PyUnicode_GET_SIZE(self) == 1 &&
7040	Py_UNICODE_ISDECIMAL(*p))
7041	return PyBool_FromLong(1);
7042
7043    /* Special case for empty strings */
7044    if (PyUnicode_GET_SIZE(self) == 0)
7045	return PyBool_FromLong(0);
7046
7047    e = p + PyUnicode_GET_SIZE(self);
7048    for (; p < e; p++) {
7049	if (!Py_UNICODE_ISDECIMAL(*p))
7050	    return PyBool_FromLong(0);
7051    }
7052    return PyBool_FromLong(1);
7053}
7054
7055PyDoc_STRVAR(isdigit__doc__,
7056"S.isdigit() -> bool\n\
7057\n\
7058Return True if all characters in S are digits\n\
7059and there is at least one character in S, False otherwise.");
7060
7061static PyObject*
7062unicode_isdigit(PyUnicodeObject *self)
7063{
7064    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7065    register const Py_UNICODE *e;
7066
7067    /* Shortcut for single character strings */
7068    if (PyUnicode_GET_SIZE(self) == 1 &&
7069	Py_UNICODE_ISDIGIT(*p))
7070	return PyBool_FromLong(1);
7071
7072    /* Special case for empty strings */
7073    if (PyUnicode_GET_SIZE(self) == 0)
7074	return PyBool_FromLong(0);
7075
7076    e = p + PyUnicode_GET_SIZE(self);
7077    for (; p < e; p++) {
7078	if (!Py_UNICODE_ISDIGIT(*p))
7079	    return PyBool_FromLong(0);
7080    }
7081    return PyBool_FromLong(1);
7082}
7083
7084PyDoc_STRVAR(isnumeric__doc__,
7085"S.isnumeric() -> bool\n\
7086\n\
7087Return True if there are only numeric characters in S,\n\
7088False otherwise.");
7089
7090static PyObject*
7091unicode_isnumeric(PyUnicodeObject *self)
7092{
7093    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7094    register const Py_UNICODE *e;
7095
7096    /* Shortcut for single character strings */
7097    if (PyUnicode_GET_SIZE(self) == 1 &&
7098	Py_UNICODE_ISNUMERIC(*p))
7099	return PyBool_FromLong(1);
7100
7101    /* Special case for empty strings */
7102    if (PyUnicode_GET_SIZE(self) == 0)
7103	return PyBool_FromLong(0);
7104
7105    e = p + PyUnicode_GET_SIZE(self);
7106    for (; p < e; p++) {
7107	if (!Py_UNICODE_ISNUMERIC(*p))
7108	    return PyBool_FromLong(0);
7109    }
7110    return PyBool_FromLong(1);
7111}
7112
7113int
7114PyUnicode_IsIdentifier(PyObject *self)
7115{
7116    register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7117    register const Py_UNICODE *e;
7118
7119    /* Special case for empty strings */
7120    if (PyUnicode_GET_SIZE(self) == 0)
7121	return 0;
7122
7123    /* PEP 3131 says that the first character must be in
7124       XID_Start and subsequent characters in XID_Continue,
7125       and for the ASCII range, the 2.x rules apply (i.e
7126       start with letters and underscore, continue with
7127       letters, digits, underscore). However, given the current
7128       definition of XID_Start and XID_Continue, it is sufficient
7129       to check just for these, except that _ must be allowed
7130       as starting an identifier.  */
7131    if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7132        return 0;
7133
7134    e = p + PyUnicode_GET_SIZE(self);
7135    for (p++; p < e; p++) {
7136	if (!_PyUnicode_IsXidContinue(*p))
7137	    return 0;
7138    }
7139    return 1;
7140}
7141
7142PyDoc_STRVAR(isidentifier__doc__,
7143"S.isidentifier() -> bool\n\
7144\n\
7145Return True if S is a valid identifier according\n\
7146to the language definition.");
7147
7148static PyObject*
7149unicode_isidentifier(PyObject *self)
7150{
7151    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7152}
7153
7154PyDoc_STRVAR(join__doc__,
7155"S.join(sequence) -> unicode\n\
7156\n\
7157Return a string which is the concatenation of the strings in the\n\
7158sequence.  The separator between elements is S.");
7159
7160static PyObject*
7161unicode_join(PyObject *self, PyObject *data)
7162{
7163    return PyUnicode_Join(self, data);
7164}
7165
7166static Py_ssize_t
7167unicode_length(PyUnicodeObject *self)
7168{
7169    return self->length;
7170}
7171
7172PyDoc_STRVAR(ljust__doc__,
7173"S.ljust(width[, fillchar]) -> int\n\
7174\n\
7175Return S left justified in a Unicode string of length width. Padding is\n\
7176done using the specified fill character (default is a space).");
7177
7178static PyObject *
7179unicode_ljust(PyUnicodeObject *self, PyObject *args)
7180{
7181    Py_ssize_t width;
7182    Py_UNICODE fillchar = ' ';
7183
7184    if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
7185        return NULL;
7186
7187    if (self->length >= width && PyUnicode_CheckExact(self)) {
7188        Py_INCREF(self);
7189        return (PyObject*) self;
7190    }
7191
7192    return (PyObject*) pad(self, 0, width - self->length, fillchar);
7193}
7194
7195PyDoc_STRVAR(lower__doc__,
7196"S.lower() -> unicode\n\
7197\n\
7198Return a copy of the string S converted to lowercase.");
7199
7200static PyObject*
7201unicode_lower(PyUnicodeObject *self)
7202{
7203    return fixup(self, fixlower);
7204}
7205
/* Strip modes shared by the strip helpers below.  The values double as
   indices into stripformat[], so the two must stay in step. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above: PyArg_ParseTuple format strings, one per
   strip mode, each consisting of "|O:" followed by the method name */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: skips the 3-character "|O:" prefix of
   the format string */
#define STRIPNAME(i) (stripformat[i]+3)
7214
7215/* externally visible for str.strip(unicode) */
7216PyObject *
7217_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7218{
7219	Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7220	Py_ssize_t len = PyUnicode_GET_SIZE(self);
7221	Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7222	Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7223	Py_ssize_t i, j;
7224
7225        BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7226
7227	i = 0;
7228	if (striptype != RIGHTSTRIP) {
7229            while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7230                i++;
7231            }
7232	}
7233
7234	j = len;
7235	if (striptype != LEFTSTRIP) {
7236            do {
7237                j--;
7238            } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7239            j++;
7240	}
7241
7242	if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7243            Py_INCREF(self);
7244            return (PyObject*)self;
7245	}
7246	else
7247            return PyUnicode_FromUnicode(s+i, j-i);
7248}
7249
7250
7251static PyObject *
7252do_strip(PyUnicodeObject *self, int striptype)
7253{
7254	Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7255	Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
7256
7257	i = 0;
7258	if (striptype != RIGHTSTRIP) {
7259		while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7260			i++;
7261		}
7262	}
7263
7264	j = len;
7265	if (striptype != LEFTSTRIP) {
7266		do {
7267			j--;
7268		} while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7269		j++;
7270	}
7271
7272	if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7273		Py_INCREF(self);
7274		return (PyObject*)self;
7275	}
7276	else
7277		return PyUnicode_FromUnicode(s+i, j-i);
7278}
7279
7280
7281static PyObject *
7282do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7283{
7284	PyObject *sep = NULL;
7285
7286	if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7287		return NULL;
7288
7289	if (sep != NULL && sep != Py_None) {
7290		if (PyUnicode_Check(sep))
7291			return _PyUnicode_XStrip(self, striptype, sep);
7292		else {
7293			PyErr_Format(PyExc_TypeError,
7294				     "%s arg must be None, unicode or str",
7295				     STRIPNAME(striptype));
7296			return NULL;
7297		}
7298	}
7299
7300	return do_strip(self, striptype);
7301}
7302
7303
7304PyDoc_STRVAR(strip__doc__,
7305"S.strip([chars]) -> unicode\n\
7306\n\
7307Return a copy of the string S with leading and trailing\n\
7308whitespace removed.\n\
7309If chars is given and not None, remove characters in chars instead.\n\
7310If chars is a str, it will be converted to unicode before stripping");
7311
7312static PyObject *
7313unicode_strip(PyUnicodeObject *self, PyObject *args)
7314{
7315	if (PyTuple_GET_SIZE(args) == 0)
7316		return do_strip(self, BOTHSTRIP); /* Common case */
7317	else
7318		return do_argstrip(self, BOTHSTRIP, args);
7319}
7320
7321
7322PyDoc_STRVAR(lstrip__doc__,
7323"S.lstrip([chars]) -> unicode\n\
7324\n\
7325Return a copy of the string S with leading whitespace removed.\n\
7326If chars is given and not None, remove characters in chars instead.\n\
7327If chars is a str, it will be converted to unicode before stripping");
7328
7329static PyObject *
7330unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7331{
7332	if (PyTuple_GET_SIZE(args) == 0)
7333		return do_strip(self, LEFTSTRIP); /* Common case */
7334	else
7335		return do_argstrip(self, LEFTSTRIP, args);
7336}
7337
7338
7339PyDoc_STRVAR(rstrip__doc__,
7340"S.rstrip([chars]) -> unicode\n\
7341\n\
7342Return a copy of the string S with trailing whitespace removed.\n\
7343If chars is given and not None, remove characters in chars instead.\n\
7344If chars is a str, it will be converted to unicode before stripping");
7345
7346static PyObject *
7347unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7348{
7349	if (PyTuple_GET_SIZE(args) == 0)
7350		return do_strip(self, RIGHTSTRIP); /* Common case */
7351	else
7352		return do_argstrip(self, RIGHTSTRIP, args);
7353}
7354
7355
7356static PyObject*
7357unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
7358{
7359    PyUnicodeObject *u;
7360    Py_UNICODE *p;
7361    Py_ssize_t nchars;
7362    size_t nbytes;
7363
7364    if (len < 0)
7365        len = 0;
7366
7367    if (len == 1 && PyUnicode_CheckExact(str)) {
7368        /* no repeat, return original string */
7369        Py_INCREF(str);
7370        return (PyObject*) str;
7371    }
7372
7373    /* ensure # of chars needed doesn't overflow int and # of bytes
7374     * needed doesn't overflow size_t
7375     */
7376    nchars = len * str->length;
7377    if (len && nchars / len != str->length) {
7378        PyErr_SetString(PyExc_OverflowError,
7379                        "repeated string is too long");
7380        return NULL;
7381    }
7382    nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7383    if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7384        PyErr_SetString(PyExc_OverflowError,
7385                        "repeated string is too long");
7386        return NULL;
7387    }
7388    u = _PyUnicode_New(nchars);
7389    if (!u)
7390        return NULL;
7391
7392    p = u->str;
7393
7394    if (str->length == 1 && len > 0) {
7395        Py_UNICODE_FILL(p, str->str[0], len);
7396    } else {
7397	Py_ssize_t done = 0; /* number of characters copied this far */
7398	if (done < nchars) {
7399            Py_UNICODE_COPY(p, str->str, str->length);
7400            done = str->length;
7401	}
7402	while (done < nchars) {
7403            Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
7404            Py_UNICODE_COPY(p+done, p, n);
7405            done += n;
7406	}
7407    }
7408
7409    return (PyObject*) u;
7410}
7411
7412PyObject *PyUnicode_Replace(PyObject *obj,
7413			    PyObject *subobj,
7414			    PyObject *replobj,
7415			    Py_ssize_t maxcount)
7416{
7417    PyObject *self;
7418    PyObject *str1;
7419    PyObject *str2;
7420    PyObject *result;
7421
7422    self = PyUnicode_FromObject(obj);
7423    if (self == NULL)
7424	return NULL;
7425    str1 = PyUnicode_FromObject(subobj);
7426    if (str1 == NULL) {
7427	Py_DECREF(self);
7428	return NULL;
7429    }
7430    str2 = PyUnicode_FromObject(replobj);
7431    if (str2 == NULL) {
7432	Py_DECREF(self);
7433	Py_DECREF(str1);
7434	return NULL;
7435    }
7436    result = replace((PyUnicodeObject *)self,
7437		     (PyUnicodeObject *)str1,
7438		     (PyUnicodeObject *)str2,
7439		     maxcount);
7440    Py_DECREF(self);
7441    Py_DECREF(str1);
7442    Py_DECREF(str2);
7443    return result;
7444}
7445
7446PyDoc_STRVAR(replace__doc__,
7447"S.replace (old, new[, maxsplit]) -> unicode\n\
7448\n\
7449Return a copy of S with all occurrences of substring\n\
7450old replaced by new.  If the optional argument maxsplit is\n\
7451given, only the first maxsplit occurrences are replaced.");
7452
7453static PyObject*
7454unicode_replace(PyUnicodeObject *self, PyObject *args)
7455{
7456    PyUnicodeObject *str1;
7457    PyUnicodeObject *str2;
7458    Py_ssize_t maxcount = -1;
7459    PyObject *result;
7460
7461    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
7462        return NULL;
7463    str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7464    if (str1 == NULL)
7465	return NULL;
7466    str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
7467    if (str2 == NULL) {
7468	Py_DECREF(str1);
7469	return NULL;
7470    }
7471
7472    result = replace(self, str1, str2, maxcount);
7473
7474    Py_DECREF(str1);
7475    Py_DECREF(str2);
7476    return result;
7477}
7478
7479static
7480PyObject *unicode_repr(PyObject *unicode)
7481{
7482    PyObject *repr;
7483    Py_UNICODE *p;
7484    Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7485    Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7486
7487    /* XXX(nnorwitz): rather than over-allocating, it would be
7488       better to choose a different scheme.  Perhaps scan the
7489       first N-chars of the string and allocate based on that size.
7490    */
7491    /* Initial allocation is based on the longest-possible unichr
7492       escape.
7493
7494       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7495       unichr, so in this case it's the longest unichr escape. In
7496       narrow (UTF-16) builds this is five chars per source unichr
7497       since there are two unichrs in the surrogate pair, so in narrow
7498       (UTF-16) builds it's not the longest unichr escape.
7499
7500       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7501       so in the narrow (UTF-16) build case it's the longest unichr
7502       escape.
7503    */
7504
7505    repr = PyUnicode_FromUnicode(NULL,
7506        2 /* quotes */
7507#ifdef Py_UNICODE_WIDE
7508        + 10*size
7509#else
7510        + 6*size
7511#endif
7512        + 1);
7513    if (repr == NULL)
7514        return NULL;
7515
7516    p = PyUnicode_AS_UNICODE(repr);
7517
7518    /* Add quote */
7519    *p++ = (findchar(s, size, '\'') &&
7520            !findchar(s, size, '"')) ? '"' : '\'';
7521    while (size-- > 0) {
7522        Py_UNICODE ch = *s++;
7523
7524        /* Escape quotes and backslashes */
7525        if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
7526            *p++ = '\\';
7527            *p++ = ch;
7528            continue;
7529        }
7530
7531#ifdef Py_UNICODE_WIDE
7532        /* Map 21-bit characters to '\U00xxxxxx' */
7533        else if (ch >= 0x10000) {
7534            *p++ = '\\';
7535            *p++ = 'U';
7536            *p++ = hexdigits[(ch >> 28) & 0x0000000F];
7537            *p++ = hexdigits[(ch >> 24) & 0x0000000F];
7538            *p++ = hexdigits[(ch >> 20) & 0x0000000F];
7539            *p++ = hexdigits[(ch >> 16) & 0x0000000F];
7540            *p++ = hexdigits[(ch >> 12) & 0x0000000F];
7541            *p++ = hexdigits[(ch >> 8) & 0x0000000F];
7542            *p++ = hexdigits[(ch >> 4) & 0x0000000F];
7543            *p++ = hexdigits[ch & 0x0000000F];
7544	    continue;
7545        }
7546#else
7547	/* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
7548	else if (ch >= 0xD800 && ch < 0xDC00) {
7549	    Py_UNICODE ch2;
7550	    Py_UCS4 ucs;
7551
7552	    ch2 = *s++;
7553	    size--;
7554	    if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7555		ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7556		*p++ = '\\';
7557		*p++ = 'U';
7558		*p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7559		*p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7560		*p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7561		*p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7562		*p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7563		*p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7564		*p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7565		*p++ = hexdigits[ucs & 0x0000000F];
7566		continue;
7567	    }
7568	    /* Fall through: isolated surrogates are copied as-is */
7569	    s--;
7570	    size++;
7571	}
7572#endif
7573
7574        /* Map 16-bit characters to '\uxxxx' */
7575        if (ch >= 256) {
7576            *p++ = '\\';
7577            *p++ = 'u';
7578            *p++ = hexdigits[(ch >> 12) & 0x000F];
7579            *p++ = hexdigits[(ch >> 8) & 0x000F];
7580            *p++ = hexdigits[(ch >> 4) & 0x000F];
7581            *p++ = hexdigits[ch & 0x000F];
7582        }
7583
7584        /* Map special whitespace to '\t', \n', '\r' */
7585        else if (ch == '\t') {
7586            *p++ = '\\';
7587            *p++ = 't';
7588        }
7589        else if (ch == '\n') {
7590            *p++ = '\\';
7591            *p++ = 'n';
7592        }
7593        else if (ch == '\r') {
7594            *p++ = '\\';
7595            *p++ = 'r';
7596        }
7597
7598        /* Map non-printable US ASCII to '\xhh' */
7599        else if (ch < ' ' || ch >= 0x7F) {
7600            *p++ = '\\';
7601            *p++ = 'x';
7602            *p++ = hexdigits[(ch >> 4) & 0x000F];
7603            *p++ = hexdigits[ch & 0x000F];
7604        }
7605
7606        /* Copy everything else as-is */
7607        else
7608            *p++ = (char) ch;
7609    }
7610    /* Add quote */
7611    *p++ = PyUnicode_AS_UNICODE(repr)[0];
7612
7613    *p = '\0';
7614    _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
7615    return repr;
7616}
7617
7618PyDoc_STRVAR(rfind__doc__,
7619"S.rfind(sub [,start [,end]]) -> int\n\
7620\n\
7621Return the highest index in S where substring sub is found,\n\
7622such that sub is contained within s[start:end].  Optional\n\
7623arguments start and end are interpreted as in slice notation.\n\
7624\n\
7625Return -1 on failure.");
7626
7627static PyObject *
7628unicode_rfind(PyUnicodeObject *self, PyObject *args)
7629{
7630    PyObject *substring;
7631    Py_ssize_t start;
7632    Py_ssize_t end;
7633    Py_ssize_t result;
7634
7635    if (!_ParseTupleFinds(args, &substring, &start, &end))
7636	    return NULL;
7637
7638    result = stringlib_rfind_slice(
7639        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7640        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7641        start, end
7642        );
7643
7644    Py_DECREF(substring);
7645
7646    return PyLong_FromSsize_t(result);
7647}
7648
7649PyDoc_STRVAR(rindex__doc__,
7650"S.rindex(sub [,start [,end]]) -> int\n\
7651\n\
7652Like S.rfind() but raise ValueError when the substring is not found.");
7653
7654static PyObject *
7655unicode_rindex(PyUnicodeObject *self, PyObject *args)
7656{
7657    PyObject *substring;
7658    Py_ssize_t start;
7659    Py_ssize_t end;
7660    Py_ssize_t result;
7661
7662    if (!_ParseTupleFinds(args, &substring, &start, &end))
7663	    return NULL;
7664
7665    result = stringlib_rfind_slice(
7666        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7667        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7668        start, end
7669        );
7670
7671    Py_DECREF(substring);
7672
7673    if (result < 0) {
7674        PyErr_SetString(PyExc_ValueError, "substring not found");
7675        return NULL;
7676    }
7677    return PyLong_FromSsize_t(result);
7678}
7679
7680PyDoc_STRVAR(rjust__doc__,
7681"S.rjust(width[, fillchar]) -> unicode\n\
7682\n\
7683Return S right justified in a Unicode string of length width. Padding is\n\
7684done using the specified fill character (default is a space).");
7685
7686static PyObject *
7687unicode_rjust(PyUnicodeObject *self, PyObject *args)
7688{
7689    Py_ssize_t width;
7690    Py_UNICODE fillchar = ' ';
7691
7692    if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
7693        return NULL;
7694
7695    if (self->length >= width && PyUnicode_CheckExact(self)) {
7696        Py_INCREF(self);
7697        return (PyObject*) self;
7698    }
7699
7700    return (PyObject*) pad(self, width - self->length, 0, fillchar);
7701}
7702
7703PyObject *PyUnicode_Split(PyObject *s,
7704			  PyObject *sep,
7705			  Py_ssize_t maxsplit)
7706{
7707    PyObject *result;
7708
7709    s = PyUnicode_FromObject(s);
7710    if (s == NULL)
7711	return NULL;
7712    if (sep != NULL) {
7713	sep = PyUnicode_FromObject(sep);
7714	if (sep == NULL) {
7715	    Py_DECREF(s);
7716	    return NULL;
7717	}
7718    }
7719
7720    result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7721
7722    Py_DECREF(s);
7723    Py_XDECREF(sep);
7724    return result;
7725}
7726
7727PyDoc_STRVAR(split__doc__,
7728"S.split([sep [,maxsplit]]) -> list of strings\n\
7729\n\
7730Return a list of the words in S, using sep as the\n\
7731delimiter string.  If maxsplit is given, at most maxsplit\n\
7732splits are done. If sep is not specified or is None, any\n\
7733whitespace string is a separator and empty strings are\n\
7734removed from the result.");
7735
7736static PyObject*
7737unicode_split(PyUnicodeObject *self, PyObject *args)
7738{
7739    PyObject *substring = Py_None;
7740    Py_ssize_t maxcount = -1;
7741
7742    if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
7743        return NULL;
7744
7745    if (substring == Py_None)
7746	return split(self, NULL, maxcount);
7747    else if (PyUnicode_Check(substring))
7748	return split(self, (PyUnicodeObject *)substring, maxcount);
7749    else
7750	return PyUnicode_Split((PyObject *)self, substring, maxcount);
7751}
7752
7753PyObject *
7754PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7755{
7756    PyObject* str_obj;
7757    PyObject* sep_obj;
7758    PyObject* out;
7759
7760    str_obj = PyUnicode_FromObject(str_in);
7761    if (!str_obj)
7762	return NULL;
7763    sep_obj = PyUnicode_FromObject(sep_in);
7764    if (!sep_obj) {
7765        Py_DECREF(str_obj);
7766        return NULL;
7767    }
7768
7769    out = stringlib_partition(
7770        str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7771        sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7772        );
7773
7774    Py_DECREF(sep_obj);
7775    Py_DECREF(str_obj);
7776
7777    return out;
7778}
7779
7780
7781PyObject *
7782PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7783{
7784    PyObject* str_obj;
7785    PyObject* sep_obj;
7786    PyObject* out;
7787
7788    str_obj = PyUnicode_FromObject(str_in);
7789    if (!str_obj)
7790	return NULL;
7791    sep_obj = PyUnicode_FromObject(sep_in);
7792    if (!sep_obj) {
7793        Py_DECREF(str_obj);
7794        return NULL;
7795    }
7796
7797    out = stringlib_rpartition(
7798        str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7799        sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7800        );
7801
7802    Py_DECREF(sep_obj);
7803    Py_DECREF(str_obj);
7804
7805    return out;
7806}
7807
7808PyDoc_STRVAR(partition__doc__,
7809"S.partition(sep) -> (head, sep, tail)\n\
7810\n\
7811Searches for the separator sep in S, and returns the part before it,\n\
7812the separator itself, and the part after it.  If the separator is not\n\
7813found, returns S and two empty strings.");
7814
7815static PyObject*
7816unicode_partition(PyUnicodeObject *self, PyObject *separator)
7817{
7818    return PyUnicode_Partition((PyObject *)self, separator);
7819}
7820
7821PyDoc_STRVAR(rpartition__doc__,
7822"S.rpartition(sep) -> (tail, sep, head)\n\
7823\n\
7824Searches for the separator sep in S, starting at the end of S, and returns\n\
7825the part before it, the separator itself, and the part after it.  If the\n\
7826separator is not found, returns two empty strings and S.");
7827
7828static PyObject*
7829unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7830{
7831    return PyUnicode_RPartition((PyObject *)self, separator);
7832}
7833
7834PyObject *PyUnicode_RSplit(PyObject *s,
7835			   PyObject *sep,
7836			   Py_ssize_t maxsplit)
7837{
7838    PyObject *result;
7839
7840    s = PyUnicode_FromObject(s);
7841    if (s == NULL)
7842	return NULL;
7843    if (sep != NULL) {
7844	sep = PyUnicode_FromObject(sep);
7845	if (sep == NULL) {
7846	    Py_DECREF(s);
7847	    return NULL;
7848	}
7849    }
7850
7851    result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7852
7853    Py_DECREF(s);
7854    Py_XDECREF(sep);
7855    return result;
7856}
7857
7858PyDoc_STRVAR(rsplit__doc__,
7859"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7860\n\
7861Return a list of the words in S, using sep as the\n\
7862delimiter string, starting at the end of the string and\n\
7863working to the front.  If maxsplit is given, at most maxsplit\n\
7864splits are done. If sep is not specified, any whitespace string\n\
7865is a separator.");
7866
7867static PyObject*
7868unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7869{
7870    PyObject *substring = Py_None;
7871    Py_ssize_t maxcount = -1;
7872
7873    if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
7874        return NULL;
7875
7876    if (substring == Py_None)
7877	return rsplit(self, NULL, maxcount);
7878    else if (PyUnicode_Check(substring))
7879	return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7880    else
7881	return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7882}
7883
7884PyDoc_STRVAR(splitlines__doc__,
7885"S.splitlines([keepends]]) -> list of strings\n\
7886\n\
7887Return a list of the lines in S, breaking at line boundaries.\n\
7888Line breaks are not included in the resulting list unless keepends\n\
7889is given and true.");
7890
7891static PyObject*
7892unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7893{
7894    int keepends = 0;
7895
7896    if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
7897        return NULL;
7898
7899    return PyUnicode_Splitlines((PyObject *)self, keepends);
7900}
7901
7902static
7903PyObject *unicode_str(PyObject *self)
7904{
7905    if (PyUnicode_CheckExact(self)) {
7906        Py_INCREF(self);
7907        return self;
7908    } else
7909        /* Subtype -- return genuine unicode string with the same value. */
7910        return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7911                                     PyUnicode_GET_SIZE(self));
7912}
7913
7914PyDoc_STRVAR(swapcase__doc__,
7915"S.swapcase() -> unicode\n\
7916\n\
7917Return a copy of S with uppercase characters converted to lowercase\n\
7918and vice versa.");
7919
7920static PyObject*
7921unicode_swapcase(PyUnicodeObject *self)
7922{
7923    return fixup(self, fixswapcase);
7924}
7925
7926PyDoc_STRVAR(maketrans__doc__,
7927"str.maketrans(x[, y[, z]]) -> dict (static method)\n\
7928\n\
7929Return a translation table usable for str.translate().\n\
7930If there is only one argument, it must be a dictionary mapping Unicode\n\
7931ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
7932Character keys will then be converted to ordinals.\n\
7933If there are two arguments, they must be strings of equal length, and\n\
7934in the resulting dictionary, each character in x will be mapped to the\n\
7935character at the same position in y. If there is a third argument, it\n\
7936must be a string, whose characters will be mapped to None in the result.");
7937
7938static PyObject*
7939unicode_maketrans(PyUnicodeObject *null, PyObject *args)
7940{
7941    PyObject *x, *y = NULL, *z = NULL;
7942    PyObject *new = NULL, *key, *value;
7943    Py_ssize_t i = 0;
7944    int res;
7945
7946    if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
7947        return NULL;
7948    new = PyDict_New();
7949    if (!new)
7950        return NULL;
7951    if (y != NULL) {
7952        /* x must be a string too, of equal length */
7953        Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
7954        if (!PyUnicode_Check(x)) {
7955            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
7956                            "be a string if there is a second argument");
7957            goto err;
7958        }
7959        if (PyUnicode_GET_SIZE(x) != ylen) {
7960            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
7961                            "arguments must have equal length");
7962            goto err;
7963        }
7964        /* create entries for translating chars in x to those in y */
7965        for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
7966            key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
7967            value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
7968            if (!key || !value)
7969                goto err;
7970            res = PyDict_SetItem(new, key, value);
7971            Py_DECREF(key);
7972            Py_DECREF(value);
7973            if (res < 0)
7974                goto err;
7975        }
7976        /* create entries for deleting chars in z */
7977        if (z != NULL) {
7978            for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
7979                key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
7980                if (!key)
7981                    goto err;
7982                res = PyDict_SetItem(new, key, Py_None);
7983                Py_DECREF(key);
7984                if (res < 0)
7985                    goto err;
7986            }
7987        }
7988    } else {
7989        /* x must be a dict */
7990        if (!PyDict_Check(x)) {
7991            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
7992                            "to maketrans it must be a dict");
7993            goto err;
7994        }
7995        /* copy entries into the new dict, converting string keys to int keys */
7996        while (PyDict_Next(x, &i, &key, &value)) {
7997            if (PyUnicode_Check(key)) {
7998                /* convert string keys to integer keys */
7999                PyObject *newkey;
8000                if (PyUnicode_GET_SIZE(key) != 1) {
8001                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
8002                                    "table must be of length 1");
8003                    goto err;
8004                }
8005                newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
8006                if (!newkey)
8007                    goto err;
8008                res = PyDict_SetItem(new, newkey, value);
8009                Py_DECREF(newkey);
8010                if (res < 0)
8011                    goto err;
8012            } else if (PyLong_Check(key)) {
8013                /* just keep integer keys */
8014                if (PyDict_SetItem(new, key, value) < 0)
8015                    goto err;
8016            } else {
8017                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8018                                "be strings or integers");
8019                goto err;
8020            }
8021        }
8022    }
8023    return new;
8024  err:
8025    Py_DECREF(new);
8026    return NULL;
8027}
8028
8029PyDoc_STRVAR(translate__doc__,
8030"S.translate(table) -> unicode\n\
8031\n\
8032Return a copy of the string S, where all characters have been mapped\n\
8033through the given translation table, which must be a mapping of\n\
8034Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
8035Unmapped characters are left untouched. Characters mapped to None\n\
8036are deleted.");
8037
8038static PyObject*
8039unicode_translate(PyUnicodeObject *self, PyObject *table)
8040{
8041    return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
8042}
8043
8044PyDoc_STRVAR(upper__doc__,
8045"S.upper() -> unicode\n\
8046\n\
8047Return a copy of S converted to uppercase.");
8048
8049static PyObject*
8050unicode_upper(PyUnicodeObject *self)
8051{
8052    return fixup(self, fixupper);
8053}
8054
8055PyDoc_STRVAR(zfill__doc__,
8056"S.zfill(width) -> unicode\n\
8057\n\
8058Pad a numeric string x with zeros on the left, to fill a field\n\
8059of the specified width. The string x is never truncated.");
8060
8061static PyObject *
8062unicode_zfill(PyUnicodeObject *self, PyObject *args)
8063{
8064    Py_ssize_t fill;
8065    PyUnicodeObject *u;
8066
8067    Py_ssize_t width;
8068    if (!PyArg_ParseTuple(args, "n:zfill", &width))
8069        return NULL;
8070
8071    if (self->length >= width) {
8072        if (PyUnicode_CheckExact(self)) {
8073            Py_INCREF(self);
8074            return (PyObject*) self;
8075        }
8076        else
8077            return PyUnicode_FromUnicode(
8078                PyUnicode_AS_UNICODE(self),
8079                PyUnicode_GET_SIZE(self)
8080            );
8081    }
8082
8083    fill = width - self->length;
8084
8085    u = pad(self, fill, 0, '0');
8086
8087    if (u == NULL)
8088        return NULL;
8089
8090    if (u->str[fill] == '+' || u->str[fill] == '-') {
8091        /* move sign to beginning of string */
8092        u->str[0] = u->str[fill];
8093        u->str[fill] = '0';
8094    }
8095
8096    return (PyObject*) u;
8097}
8098
#if 0
/* Disabled debugging helper: report the value of numfree as an int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *count = PyLong_FromLong(numfree);

    return count;
}
#endif
8106
8107PyDoc_STRVAR(startswith__doc__,
8108"S.startswith(prefix[, start[, end]]) -> bool\n\
8109\n\
8110Return True if S starts with the specified prefix, False otherwise.\n\
8111With optional start, test S beginning at that position.\n\
8112With optional end, stop comparing S at that position.\n\
8113prefix can also be a tuple of strings to try.");
8114
8115static PyObject *
8116unicode_startswith(PyUnicodeObject *self,
8117		   PyObject *args)
8118{
8119    PyObject *subobj;
8120    PyUnicodeObject *substring;
8121    Py_ssize_t start = 0;
8122    Py_ssize_t end = PY_SSIZE_T_MAX;
8123    int result;
8124
8125    if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
8126		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8127	return NULL;
8128    if (PyTuple_Check(subobj)) {
8129        Py_ssize_t i;
8130        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8131            substring = (PyUnicodeObject *)PyUnicode_FromObject(
8132                            PyTuple_GET_ITEM(subobj, i));
8133            if (substring == NULL)
8134                return NULL;
8135            result = tailmatch(self, substring, start, end, -1);
8136            Py_DECREF(substring);
8137            if (result) {
8138                Py_RETURN_TRUE;
8139            }
8140        }
8141        /* nothing matched */
8142        Py_RETURN_FALSE;
8143    }
8144    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
8145    if (substring == NULL)
8146         return NULL;
8147    result = tailmatch(self, substring, start, end, -1);
8148    Py_DECREF(substring);
8149    return PyBool_FromLong(result);
8150}
8151
8152
8153PyDoc_STRVAR(endswith__doc__,
8154"S.endswith(suffix[, start[, end]]) -> bool\n\
8155\n\
8156Return True if S ends with the specified suffix, False otherwise.\n\
8157With optional start, test S beginning at that position.\n\
8158With optional end, stop comparing S at that position.\n\
8159suffix can also be a tuple of strings to try.");
8160
8161static PyObject *
8162unicode_endswith(PyUnicodeObject *self,
8163		 PyObject *args)
8164{
8165    PyObject *subobj;
8166    PyUnicodeObject *substring;
8167    Py_ssize_t start = 0;
8168    Py_ssize_t end = PY_SSIZE_T_MAX;
8169    int result;
8170
8171    if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
8172        _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8173	return NULL;
8174    if (PyTuple_Check(subobj)) {
8175        Py_ssize_t i;
8176        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8177            substring = (PyUnicodeObject *)PyUnicode_FromObject(
8178                            PyTuple_GET_ITEM(subobj, i));
8179            if (substring == NULL)
8180            return NULL;
8181            result = tailmatch(self, substring, start, end, +1);
8182            Py_DECREF(substring);
8183            if (result) {
8184                Py_RETURN_TRUE;
8185            }
8186        }
8187        Py_RETURN_FALSE;
8188    }
8189    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
8190    if (substring == NULL)
8191    return NULL;
8192
8193    result = tailmatch(self, substring, start, end, +1);
8194    Py_DECREF(substring);
8195    return PyBool_FromLong(result);
8196}
8197
8198#include "stringlib/string_format.h"
8199
8200PyDoc_STRVAR(format__doc__,
8201"S.format(*args, **kwargs) -> unicode\n\
8202\n\
8203");
8204
8205PyDoc_STRVAR(p_format__doc__,
8206"S.__format__(format_spec) -> unicode\n\
8207\n\
8208");
8209
8210static PyObject *
8211unicode_getnewargs(PyUnicodeObject *v)
8212{
8213	return Py_BuildValue("(u#)", v->str, v->length);
8214}
8215
8216
8217static PyMethodDef unicode_methods[] = {
8218
8219    /* Order is according to common usage: often used methods should
8220       appear first, since lookup is done sequentially. */
8221
8222    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
8223    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8224    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
8225    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
8226    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8227    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8228    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8229    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8230    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8231    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8232    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
8233    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
8234    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8235    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8236    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
8237    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
8238    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8239    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8240    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
8241    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
8242    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
8243    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
8244    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
8245    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8246    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8247    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8248    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8249    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8250    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8251    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8252    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8253    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8254    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8255    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8256    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8257    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8258    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
8259    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
8260    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
8261    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
8262    {"__format__", (PyCFunction) unicode_unicode__format__, METH_VARARGS, p_format__doc__},
8263    {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8264    {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
8265    {"maketrans", (PyCFunction) unicode_maketrans,
8266     METH_VARARGS | METH_STATIC, maketrans__doc__},
8267#if 0
8268    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
8269#endif
8270
8271#if 0
8272    /* This one is just used for debugging the implementation. */
8273    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
8274#endif
8275
8276    {"__getnewargs__",	(PyCFunction)unicode_getnewargs, METH_NOARGS},
8277    {NULL, NULL}
8278};
8279
8280static PyObject *
8281unicode_mod(PyObject *v, PyObject *w)
8282{
8283       if (!PyUnicode_Check(v)) {
8284               Py_INCREF(Py_NotImplemented);
8285               return Py_NotImplemented;
8286       }
8287       return PyUnicode_Format(v, w);
8288}
8289
8290static PyNumberMethods unicode_as_number = {
8291	0,				/*nb_add*/
8292	0,				/*nb_subtract*/
8293	0,				/*nb_multiply*/
8294	unicode_mod,			/*nb_remainder*/
8295};
8296
8297static PySequenceMethods unicode_as_sequence = {
8298    (lenfunc) unicode_length, 		/* sq_length */
8299    PyUnicode_Concat,		 	/* sq_concat */
8300    (ssizeargfunc) unicode_repeat, 	/* sq_repeat */
8301    (ssizeargfunc) unicode_getitem, 	/* sq_item */
8302    0,				 	/* sq_slice */
8303    0, 					/* sq_ass_item */
8304    0, 					/* sq_ass_slice */
8305    PyUnicode_Contains, 		/* sq_contains */
8306};
8307
8308static PyObject*
8309unicode_subscript(PyUnicodeObject* self, PyObject* item)
8310{
8311    if (PyIndex_Check(item)) {
8312        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
8313        if (i == -1 && PyErr_Occurred())
8314            return NULL;
8315        if (i < 0)
8316            i += PyUnicode_GET_SIZE(self);
8317        return unicode_getitem(self, i);
8318    } else if (PySlice_Check(item)) {
8319        Py_ssize_t start, stop, step, slicelength, cur, i;
8320        Py_UNICODE* source_buf;
8321        Py_UNICODE* result_buf;
8322        PyObject* result;
8323
8324        if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
8325				 &start, &stop, &step, &slicelength) < 0) {
8326            return NULL;
8327        }
8328
8329        if (slicelength <= 0) {
8330            return PyUnicode_FromUnicode(NULL, 0);
8331        } else if (start == 0 && step == 1 && slicelength == self->length &&
8332                   PyUnicode_CheckExact(self)) {
8333            Py_INCREF(self);
8334            return (PyObject *)self;
8335        } else if (step == 1) {
8336            return PyUnicode_FromUnicode(self->str + start, slicelength);
8337        } else {
8338            source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
8339            result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8340                                                       sizeof(Py_UNICODE));
8341
8342	    if (result_buf == NULL)
8343		    return PyErr_NoMemory();
8344
8345            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8346                result_buf[i] = source_buf[cur];
8347            }
8348
8349            result = PyUnicode_FromUnicode(result_buf, slicelength);
8350            PyObject_FREE(result_buf);
8351            return result;
8352        }
8353    } else {
8354        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8355        return NULL;
8356    }
8357}
8358
8359static PyMappingMethods unicode_as_mapping = {
8360    (lenfunc)unicode_length,		/* mp_length */
8361    (binaryfunc)unicode_subscript,	/* mp_subscript */
8362    (objobjargproc)0,			/* mp_ass_subscript */
8363};
8364
8365
8366/* Helpers for PyUnicode_Format() */
8367
8368static PyObject *
8369getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
8370{
8371    Py_ssize_t argidx = *p_argidx;
8372    if (argidx < arglen) {
8373	(*p_argidx)++;
8374	if (arglen < 0)
8375	    return args;
8376	else
8377	    return PyTuple_GetItem(args, argidx);
8378    }
8379    PyErr_SetString(PyExc_TypeError,
8380		    "not enough arguments for format string");
8381    return NULL;
8382}
8383
8384static Py_ssize_t
8385strtounicode(Py_UNICODE *buffer, const char *charbuffer)
8386{
8387    register Py_ssize_t i;
8388    Py_ssize_t len = strlen(charbuffer);
8389    for (i = len - 1; i >= 0; i--)
8390	buffer[i] = (Py_UNICODE) charbuffer[i];
8391
8392    return len;
8393}
8394
8395static int
8396doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8397{
8398    Py_ssize_t result;
8399
8400    PyOS_ascii_formatd((char *)buffer, len, format, x);
8401    result = strtounicode(buffer, (char *)buffer);
8402    return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8403}
8404
#if 0
/* Disabled: integer counterpart of doubletounicode().  Renders x with
   PyOS_snprintf() as bytes at the start of buffer, widens the text in
   place and returns its length in characters. */
static int
longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
{
    char *bytes = (char *)buffer;
    Py_ssize_t nchars;

    PyOS_snprintf(bytes, len, format, x);
    nchars = strtounicode(buffer, bytes);
    return Py_SAFE_DOWNCAST(nchars, Py_ssize_t, int);
}
#endif
8416
8417/* XXX To save some code duplication, formatfloat/long/int could have been
8418   shared with stringobject.c, converting from 8-bit to Unicode after the
8419   formatting is done. */
8420
8421static int
8422formatfloat(Py_UNICODE *buf,
8423	    size_t buflen,
8424	    int flags,
8425	    int prec,
8426	    int type,
8427	    PyObject *v)
8428{
8429    /* fmt = '%#.' + `prec` + `type`
8430       worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
8431    char fmt[20];
8432    double x;
8433
8434    x = PyFloat_AsDouble(v);
8435    if (x == -1.0 && PyErr_Occurred())
8436	return -1;
8437    if (prec < 0)
8438	prec = 6;
8439    if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8440	type = 'g';
8441    /* Worst case length calc to ensure no buffer overrun:
8442
8443       'g' formats:
8444	 fmt = %#.<prec>g
8445	 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8446	    for any double rep.)
8447	 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8448
8449       'f' formats:
8450	 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8451	 len = 1 + 50 + 1 + prec = 52 + prec
8452
8453       If prec=0 the effective precision is 1 (the leading digit is
8454       always given), therefore increase the length by one.
8455
8456    */
8457    if (((type == 'g' || type == 'G') &&
8458          buflen <= (size_t)10 + (size_t)prec) ||
8459	(type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
8460	PyErr_SetString(PyExc_OverflowError,
8461			"formatted float is too long (precision too large?)");
8462	return -1;
8463    }
8464    PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8465		  (flags&F_ALT) ? "#" : "",
8466		  prec, type);
8467    return doubletounicode(buf, buflen, fmt, x);
8468}
8469
8470static PyObject*
8471formatlong(PyObject *val, int flags, int prec, int type)
8472{
8473	char *buf;
8474	int len;
8475	PyObject *str; /* temporary string object. */
8476	PyObject *result;
8477
8478	str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
8479	if (!str)
8480		return NULL;
8481	result = PyUnicode_FromStringAndSize(buf, len);
8482	Py_DECREF(str);
8483	return result;
8484}
8485
#if 0
/* Disabled: format v as a C long into buf using a printf-style format
   built on the fly.  Returns the number of characters written, or -1
   with an exception set. */
static int
formatint(Py_UNICODE *buf,
          size_t buflen,
          int flags,
          int prec,
          int type,
          PyObject *v)
{
    /* fmt = '%#.' + `prec` + 'l' + `type`
     * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
     *                     + 1 + 1
     *                   = 24
     */
    char fmt[64]; /* plenty big enough! */
    const char *sign = "";
    int based;      /* non-zero for the octal/hex conversions */
    long x;

    x = PyLong_AsLong(v);
    if (x == -1 && PyErr_Occurred())
        return -1;

    based = (type == 'x' || type == 'X' || type == 'o');
    if (x < 0) {
        if (type == 'u')
            type = 'd';
        else if (based)
            sign = "-";     /* emitted by hand; the magnitude is formatted */
    }
    if (prec < 0)
        prec = 1;

    /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
     * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
     */
    if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
        PyErr_SetString(PyExc_OverflowError,
                "formatted integer is too long (precision too large?)");
        return -1;
    }

    if ((flags & F_ALT) && based) {
        /* Under %#o, %#x and %#X the C library is not relied on for the
         * base marker:
         * - for %#o, a different base marker than C's is wanted
         * - the C standard leaves off '0x'/'0X' when converting 0,
         *   which is inconsistent with other %#x/%#X conversions and
         *   with Python's hex() function
         * - some platforms violate the standard and convert 0 with the
         *   '0x' or '0X' (Metrowerks, Compaq Tru64)
         * - some platforms give '0x' under %#X but convert 0 per the
         *   standard (OS/2 EMX)
         *
         * Consistency is achieved by inserting our own '0x'/'0X'/'0o'
         * prefix and substituting the plain conversion for the '#' one.
         *
         * This is the same approach as formatint() in stringobject.c.
         */
        PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
                      sign, type, prec, type);
    }
    else {
        PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
                      sign, (flags&F_ALT) ? "#" : "",
                      prec, type);
    }

    return longtounicode(buf, buflen, fmt, sign[0] ? -x : x);
}
#endif
8563
8564static int
8565formatchar(Py_UNICODE *buf,
8566           size_t buflen,
8567           PyObject *v)
8568{
8569    /* presume that the buffer is at least 2 characters long */
8570    if (PyUnicode_Check(v)) {
8571	if (PyUnicode_GET_SIZE(v) != 1)
8572	    goto onError;
8573	buf[0] = PyUnicode_AS_UNICODE(v)[0];
8574    }
8575    else {
8576	/* Integer input truncated to a character */
8577        long x;
8578	x = PyLong_AsLong(v);
8579	if (x == -1 && PyErr_Occurred())
8580	    goto onError;
8581#ifdef Py_UNICODE_WIDE
8582	if (x < 0 || x > 0x10ffff) {
8583	    PyErr_SetString(PyExc_OverflowError,
8584			    "%c arg not in range(0x110000) "
8585			    "(wide Python build)");
8586	    return -1;
8587	}
8588#else
8589	if (x < 0 || x > 0xffff) {
8590	    PyErr_SetString(PyExc_OverflowError,
8591			    "%c arg not in range(0x10000) "
8592			    "(narrow Python build)");
8593	    return -1;
8594	}
8595#endif
8596	buf[0] = (Py_UNICODE) x;
8597    }
8598    buf[1] = '\0';
8599    return 1;
8600
8601 onError:
8602    PyErr_SetString(PyExc_TypeError,
8603		    "%c requires int or char");
8604    return -1;
8605}
8606
8607/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8608
8609   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
8610   chars are formatted. XXX This is a magic number. Each formatting
8611   routine does bounds checking to ensure no overflow, but a better
8612   solution may be to malloc a buffer of appropriate size for each
8613   format. For now, the current solution is sufficient.
8614*/
8615#define FORMATBUFLEN (size_t)120
8616
8617PyObject *PyUnicode_Format(PyObject *format,
8618			   PyObject *args)
8619{
8620    Py_UNICODE *fmt, *res;
8621    Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
8622    int args_owned = 0;
8623    PyUnicodeObject *result = NULL;
8624    PyObject *dict = NULL;
8625    PyObject *uformat;
8626
8627    if (format == NULL || args == NULL) {
8628	PyErr_BadInternalCall();
8629	return NULL;
8630    }
8631    uformat = PyUnicode_FromObject(format);
8632    if (uformat == NULL)
8633	return NULL;
8634    fmt = PyUnicode_AS_UNICODE(uformat);
8635    fmtcnt = PyUnicode_GET_SIZE(uformat);
8636
8637    reslen = rescnt = fmtcnt + 100;
8638    result = _PyUnicode_New(reslen);
8639    if (result == NULL)
8640	goto onError;
8641    res = PyUnicode_AS_UNICODE(result);
8642
8643    if (PyTuple_Check(args)) {
8644	arglen = PyTuple_Size(args);
8645	argidx = 0;
8646    }
8647    else {
8648	arglen = -1;
8649	argidx = -2;
8650    }
8651    if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
8652        !PyUnicode_Check(args))
8653	dict = args;
8654
8655    while (--fmtcnt >= 0) {
8656	if (*fmt != '%') {
8657	    if (--rescnt < 0) {
8658		rescnt = fmtcnt + 100;
8659		reslen += rescnt;
8660		if (_PyUnicode_Resize(&result, reslen) < 0)
8661		    goto onError;
8662		res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8663		--rescnt;
8664	    }
8665	    *res++ = *fmt++;
8666	}
8667	else {
8668	    /* Got a format specifier */
8669	    int flags = 0;
8670	    Py_ssize_t width = -1;
8671	    int prec = -1;
8672	    Py_UNICODE c = '\0';
8673	    Py_UNICODE fill;
8674	    int isnumok;
8675	    PyObject *v = NULL;
8676	    PyObject *temp = NULL;
8677	    Py_UNICODE *pbuf;
8678	    Py_UNICODE sign;
8679	    Py_ssize_t len;
8680	    Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
8681
8682	    fmt++;
8683	    if (*fmt == '(') {
8684		Py_UNICODE *keystart;
8685		Py_ssize_t keylen;
8686		PyObject *key;
8687		int pcount = 1;
8688
8689		if (dict == NULL) {
8690		    PyErr_SetString(PyExc_TypeError,
8691				    "format requires a mapping");
8692		    goto onError;
8693		}
8694		++fmt;
8695		--fmtcnt;
8696		keystart = fmt;
8697		/* Skip over balanced parentheses */
8698		while (pcount > 0 && --fmtcnt >= 0) {
8699		    if (*fmt == ')')
8700			--pcount;
8701		    else if (*fmt == '(')
8702			++pcount;
8703		    fmt++;
8704		}
8705		keylen = fmt - keystart - 1;
8706		if (fmtcnt < 0 || pcount > 0) {
8707		    PyErr_SetString(PyExc_ValueError,
8708				    "incomplete format key");
8709		    goto onError;
8710		}
8711#if 0
8712		/* keys are converted to strings using UTF-8 and
8713		   then looked up since Python uses strings to hold
8714		   variables names etc. in its namespaces and we
8715		   wouldn't want to break common idioms. */
8716		key = PyUnicode_EncodeUTF8(keystart,
8717					   keylen,
8718					   NULL);
8719#else
8720		key = PyUnicode_FromUnicode(keystart, keylen);
8721#endif
8722		if (key == NULL)
8723		    goto onError;
8724		if (args_owned) {
8725		    Py_DECREF(args);
8726		    args_owned = 0;
8727		}
8728		args = PyObject_GetItem(dict, key);
8729		Py_DECREF(key);
8730		if (args == NULL) {
8731		    goto onError;
8732		}
8733		args_owned = 1;
8734		arglen = -1;
8735		argidx = -2;
8736	    }
8737	    while (--fmtcnt >= 0) {
8738		switch (c = *fmt++) {
8739		case '-': flags |= F_LJUST; continue;
8740		case '+': flags |= F_SIGN; continue;
8741		case ' ': flags |= F_BLANK; continue;
8742		case '#': flags |= F_ALT; continue;
8743		case '0': flags |= F_ZERO; continue;
8744		}
8745		break;
8746	    }
8747	    if (c == '*') {
8748		v = getnextarg(args, arglen, &argidx);
8749		if (v == NULL)
8750		    goto onError;
8751		if (!PyLong_Check(v)) {
8752		    PyErr_SetString(PyExc_TypeError,
8753				    "* wants int");
8754		    goto onError;
8755		}
8756		width = PyLong_AsLong(v);
8757		if (width == -1 && PyErr_Occurred())
8758			goto onError;
8759		if (width < 0) {
8760		    flags |= F_LJUST;
8761		    width = -width;
8762		}
8763		if (--fmtcnt >= 0)
8764		    c = *fmt++;
8765	    }
8766	    else if (c >= '0' && c <= '9') {
8767		width = c - '0';
8768		while (--fmtcnt >= 0) {
8769		    c = *fmt++;
8770		    if (c < '0' || c > '9')
8771			break;
8772		    if ((width*10) / 10 != width) {
8773			PyErr_SetString(PyExc_ValueError,
8774					"width too big");
8775			goto onError;
8776		    }
8777		    width = width*10 + (c - '0');
8778		}
8779	    }
8780	    if (c == '.') {
8781		prec = 0;
8782		if (--fmtcnt >= 0)
8783		    c = *fmt++;
8784		if (c == '*') {
8785		    v = getnextarg(args, arglen, &argidx);
8786		    if (v == NULL)
8787			goto onError;
8788		    if (!PyLong_Check(v)) {
8789			PyErr_SetString(PyExc_TypeError,
8790					"* wants int");
8791			goto onError;
8792		    }
8793		    prec = PyLong_AsLong(v);
8794		    if (prec == -1 && PyErr_Occurred())
8795			goto onError;
8796		    if (prec < 0)
8797			prec = 0;
8798		    if (--fmtcnt >= 0)
8799			c = *fmt++;
8800		}
8801		else if (c >= '0' && c <= '9') {
8802		    prec = c - '0';
8803		    while (--fmtcnt >= 0) {
8804			c = Py_CHARMASK(*fmt++);
8805			if (c < '0' || c > '9')
8806			    break;
8807			if ((prec*10) / 10 != prec) {
8808			    PyErr_SetString(PyExc_ValueError,
8809					    "prec too big");
8810			    goto onError;
8811			}
8812			prec = prec*10 + (c - '0');
8813		    }
8814		}
8815	    } /* prec */
8816	    if (fmtcnt >= 0) {
8817		if (c == 'h' || c == 'l' || c == 'L') {
8818		    if (--fmtcnt >= 0)
8819			c = *fmt++;
8820		}
8821	    }
8822	    if (fmtcnt < 0) {
8823		PyErr_SetString(PyExc_ValueError,
8824				"incomplete format");
8825		goto onError;
8826	    }
8827	    if (c != '%') {
8828		v = getnextarg(args, arglen, &argidx);
8829		if (v == NULL)
8830		    goto onError;
8831	    }
8832	    sign = 0;
8833	    fill = ' ';
8834	    switch (c) {
8835
8836	    case '%':
8837		pbuf = formatbuf;
8838		/* presume that buffer length is at least 1 */
8839		pbuf[0] = '%';
8840		len = 1;
8841		break;
8842
8843	    case 's':
8844	    case 'r':
8845		if (PyUnicode_Check(v) && c == 's') {
8846		    temp = v;
8847		    Py_INCREF(temp);
8848		}
8849		else {
8850		    if (c == 's')
8851			temp = PyObject_Str(v);
8852		    else
8853			temp = PyObject_Repr(v);
8854		    if (temp == NULL)
8855			goto onError;
8856                    if (PyUnicode_Check(temp))
8857                        /* nothing to do */;
8858		    else {
8859			Py_DECREF(temp);
8860			PyErr_SetString(PyExc_TypeError,
8861					"%s argument has non-string str()");
8862			goto onError;
8863		    }
8864		}
8865		pbuf = PyUnicode_AS_UNICODE(temp);
8866		len = PyUnicode_GET_SIZE(temp);
8867		if (prec >= 0 && len > prec)
8868		    len = prec;
8869		break;
8870
8871	    case 'i':
8872	    case 'd':
8873	    case 'u':
8874	    case 'o':
8875	    case 'x':
8876	    case 'X':
8877		if (c == 'i')
8878		    c = 'd';
8879		isnumok = 0;
8880		if (PyNumber_Check(v)) {
8881			PyObject *iobj=NULL;
8882
8883			if (PyLong_Check(v)) {
8884				iobj = v;
8885				Py_INCREF(iobj);
8886			}
8887			else {
8888				iobj = PyNumber_Long(v);
8889			}
8890			if (iobj!=NULL) {
8891				if (PyLong_Check(iobj)) {
8892					isnumok = 1;
8893					temp = formatlong(iobj, flags, prec, c);
8894					Py_DECREF(iobj);
8895					if (!temp)
8896					    goto onError;
8897					pbuf = PyUnicode_AS_UNICODE(temp);
8898					len = PyUnicode_GET_SIZE(temp);
8899					sign = 1;
8900				}
8901				else {
8902					Py_DECREF(iobj);
8903				}
8904			}
8905		}
8906		if (!isnumok) {
8907			PyErr_Format(PyExc_TypeError,
8908			    "%%%c format: a number is required, "
8909                                     "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8910			goto onError;
8911		}
8912		if (flags & F_ZERO)
8913		    fill = '0';
8914		break;
8915
8916	    case 'e':
8917	    case 'E':
8918	    case 'f':
8919	    case 'F':
8920	    case 'g':
8921	    case 'G':
8922		if (c == 'F')
8923			c = 'f';
8924		pbuf = formatbuf;
8925		len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8926			flags, prec, c, v);
8927		if (len < 0)
8928		    goto onError;
8929		sign = 1;
8930		if (flags & F_ZERO)
8931		    fill = '0';
8932		break;
8933
8934	    case 'c':
8935		pbuf = formatbuf;
8936		len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8937		if (len < 0)
8938		    goto onError;
8939		break;
8940
8941	    default:
8942		PyErr_Format(PyExc_ValueError,
8943			     "unsupported format character '%c' (0x%x) "
8944			     "at index %zd",
8945			     (31<=c && c<=126) ? (char)c : '?',
8946                             (int)c,
8947			     (Py_ssize_t)(fmt - 1 -
8948					  PyUnicode_AS_UNICODE(uformat)));
8949		goto onError;
8950	    }
8951	    if (sign) {
8952		if (*pbuf == '-' || *pbuf == '+') {
8953		    sign = *pbuf++;
8954		    len--;
8955		}
8956		else if (flags & F_SIGN)
8957		    sign = '+';
8958		else if (flags & F_BLANK)
8959		    sign = ' ';
8960		else
8961		    sign = 0;
8962	    }
8963	    if (width < len)
8964		width = len;
8965	    if (rescnt - (sign != 0) < width) {
8966		reslen -= rescnt;
8967		rescnt = width + fmtcnt + 100;
8968		reslen += rescnt;
8969		if (reslen < 0) {
8970		    Py_XDECREF(temp);
8971		    PyErr_NoMemory();
8972		    goto onError;
8973		}
8974		if (_PyUnicode_Resize(&result, reslen) < 0) {
8975		    Py_XDECREF(temp);
8976		    goto onError;
8977		}
8978		res = PyUnicode_AS_UNICODE(result)
8979		    + reslen - rescnt;
8980	    }
8981	    if (sign) {
8982		if (fill != ' ')
8983		    *res++ = sign;
8984		rescnt--;
8985		if (width > len)
8986		    width--;
8987	    }
8988	    if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
8989		assert(pbuf[0] == '0');
8990		assert(pbuf[1] == c);
8991		if (fill != ' ') {
8992		    *res++ = *pbuf++;
8993		    *res++ = *pbuf++;
8994		}
8995		rescnt -= 2;
8996		width -= 2;
8997		if (width < 0)
8998		    width = 0;
8999		len -= 2;
9000	    }
9001	    if (width > len && !(flags & F_LJUST)) {
9002		do {
9003		    --rescnt;
9004		    *res++ = fill;
9005		} while (--width > len);
9006	    }
9007	    if (fill == ' ') {
9008		if (sign)
9009		    *res++ = sign;
9010		if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9011		    assert(pbuf[0] == '0');
9012		    assert(pbuf[1] == c);
9013		    *res++ = *pbuf++;
9014		    *res++ = *pbuf++;
9015		}
9016	    }
9017	    Py_UNICODE_COPY(res, pbuf, len);
9018	    res += len;
9019	    rescnt -= len;
9020	    while (--width >= len) {
9021		--rescnt;
9022		*res++ = ' ';
9023	    }
9024	    if (dict && (argidx < arglen) && c != '%') {
9025		PyErr_SetString(PyExc_TypeError,
9026				"not all arguments converted during string formatting");
9027                Py_XDECREF(temp);
9028		goto onError;
9029	    }
9030	    Py_XDECREF(temp);
9031	} /* '%' */
9032    } /* until end */
9033    if (argidx < arglen && !dict) {
9034	PyErr_SetString(PyExc_TypeError,
9035			"not all arguments converted during string formatting");
9036	goto onError;
9037    }
9038
9039    if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
9040	goto onError;
9041    if (args_owned) {
9042	Py_DECREF(args);
9043    }
9044    Py_DECREF(uformat);
9045    return (PyObject *)result;
9046
9047 onError:
9048    Py_XDECREF(result);
9049    Py_DECREF(uformat);
9050    if (args_owned) {
9051	Py_DECREF(args);
9052    }
9053    return NULL;
9054}
9055
9056static PyObject *
9057unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9058
9059static PyObject *
9060unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9061{
9062        PyObject *x = NULL;
9063	static char *kwlist[] = {"object", "encoding", "errors", 0};
9064	char *encoding = NULL;
9065	char *errors = NULL;
9066
9067	if (type != &PyUnicode_Type)
9068		return unicode_subtype_new(type, args, kwds);
9069	if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
9070					  kwlist, &x, &encoding, &errors))
9071	    return NULL;
9072	if (x == NULL)
9073		return (PyObject *)_PyUnicode_New(0);
9074	if (encoding == NULL && errors == NULL)
9075	    return PyObject_Str(x);
9076	else
9077	return PyUnicode_FromEncodedObject(x, encoding, errors);
9078}
9079
9080static PyObject *
9081unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9082{
9083	PyUnicodeObject *tmp, *pnew;
9084	Py_ssize_t n;
9085
9086	assert(PyType_IsSubtype(type, &PyUnicode_Type));
9087	tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9088	if (tmp == NULL)
9089		return NULL;
9090	assert(PyUnicode_Check(tmp));
9091	pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9092	if (pnew == NULL) {
9093		Py_DECREF(tmp);
9094		return NULL;
9095	}
9096	pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9097	if (pnew->str == NULL) {
9098		_Py_ForgetReference((PyObject *)pnew);
9099		PyObject_Del(pnew);
9100		Py_DECREF(tmp);
9101		return PyErr_NoMemory();
9102	}
9103	Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9104	pnew->length = n;
9105	pnew->hash = tmp->hash;
9106	Py_DECREF(tmp);
9107	return (PyObject *)pnew;
9108}
9109
9110PyDoc_STRVAR(unicode_doc,
9111"str(string [, encoding[, errors]]) -> object\n\
9112\n\
9113Create a new string object from the given encoded string.\n\
9114encoding defaults to the current default string encoding.\n\
9115errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
9116
9117static PyObject *unicode_iter(PyObject *seq);
9118
9119PyTypeObject PyUnicode_Type = {
9120    PyVarObject_HEAD_INIT(&PyType_Type, 0)
9121    "str", 				/* tp_name */
9122    sizeof(PyUnicodeObject), 		/* tp_size */
9123    0, 					/* tp_itemsize */
9124    /* Slots */
9125    (destructor)unicode_dealloc, 	/* tp_dealloc */
9126    0, 					/* tp_print */
9127    0,				 	/* tp_getattr */
9128    0, 					/* tp_setattr */
9129    0, 					/* tp_compare */
9130    unicode_repr, 			/* tp_repr */
9131    &unicode_as_number, 		/* tp_as_number */
9132    &unicode_as_sequence, 		/* tp_as_sequence */
9133    &unicode_as_mapping, 		/* tp_as_mapping */
9134    (hashfunc) unicode_hash, 		/* tp_hash*/
9135    0, 					/* tp_call*/
9136    (reprfunc) unicode_str,	 	/* tp_str */
9137    PyObject_GenericGetAttr, 		/* tp_getattro */
9138    0,			 		/* tp_setattro */
9139    0, 					/* tp_as_buffer */
9140    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
9141        Py_TPFLAGS_UNICODE_SUBCLASS,	/* tp_flags */
9142    unicode_doc,			/* tp_doc */
9143    0,					/* tp_traverse */
9144    0,					/* tp_clear */
9145    PyUnicode_RichCompare,		/* tp_richcompare */
9146    0,					/* tp_weaklistoffset */
9147    unicode_iter,			/* tp_iter */
9148    0,					/* tp_iternext */
9149    unicode_methods,			/* tp_methods */
9150    0,					/* tp_members */
9151    0,					/* tp_getset */
9152    &PyBaseObject_Type,			/* tp_base */
9153    0,					/* tp_dict */
9154    0,					/* tp_descr_get */
9155    0,					/* tp_descr_set */
9156    0,					/* tp_dictoffset */
9157    0,					/* tp_init */
9158    0,					/* tp_alloc */
9159    unicode_new,			/* tp_new */
9160    PyObject_Del,      		/* tp_free */
9161};
9162
9163/* Initialize the Unicode implementation */
9164
9165void _PyUnicode_Init(void)
9166{
9167    int i;
9168
9169    /* XXX - move this array to unicodectype.c ? */
9170    Py_UNICODE linebreak[] = {
9171        0x000A, /* LINE FEED */
9172        0x000D, /* CARRIAGE RETURN */
9173        0x001C, /* FILE SEPARATOR */
9174        0x001D, /* GROUP SEPARATOR */
9175        0x001E, /* RECORD SEPARATOR */
9176        0x0085, /* NEXT LINE */
9177        0x2028, /* LINE SEPARATOR */
9178        0x2029, /* PARAGRAPH SEPARATOR */
9179    };
9180
9181    /* Init the implementation */
9182    free_list = NULL;
9183    numfree = 0;
9184    unicode_empty = _PyUnicode_New(0);
9185    if (!unicode_empty)
9186	return;
9187
9188    for (i = 0; i < 256; i++)
9189	unicode_latin1[i] = NULL;
9190    if (PyType_Ready(&PyUnicode_Type) < 0)
9191	Py_FatalError("Can't initialize 'unicode'");
9192
9193    /* initialize the linebreak bloom filter */
9194    bloom_linebreak = make_bloom_mask(
9195        linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9196        );
9197
9198    PyType_Ready(&EncodingMapType);
9199}
9200
9201/* Finalize the Unicode implementation */
9202
9203int
9204PyUnicode_ClearFreeList(void)
9205{
9206    int freelist_size = numfree;
9207    PyUnicodeObject *u;
9208
9209    for (u = free_list; u != NULL;) {
9210	PyUnicodeObject *v = u;
9211	u = *(PyUnicodeObject **)u;
9212	if (v->str)
9213	    PyObject_DEL(v->str);
9214	Py_XDECREF(v->defenc);
9215	PyObject_Del(v);
9216	numfree--;
9217    }
9218    free_list = NULL;
9219    assert(numfree == 0);
9220    return freelist_size;
9221}
9222
9223void
9224_PyUnicode_Fini(void)
9225{
9226    int i;
9227
9228    Py_XDECREF(unicode_empty);
9229    unicode_empty = NULL;
9230
9231    for (i = 0; i < 256; i++) {
9232	if (unicode_latin1[i]) {
9233	    Py_DECREF(unicode_latin1[i]);
9234	    unicode_latin1[i] = NULL;
9235	}
9236    }
9237    (void)PyUnicode_ClearFreeList();
9238}
9239
9240void
9241PyUnicode_InternInPlace(PyObject **p)
9242{
9243	register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9244	PyObject *t;
9245	if (s == NULL || !PyUnicode_Check(s))
9246		Py_FatalError(
9247		    "PyUnicode_InternInPlace: unicode strings only please!");
9248	/* If it's a subclass, we don't really know what putting
9249	   it in the interned dict might do. */
9250	if (!PyUnicode_CheckExact(s))
9251		return;
9252	if (PyUnicode_CHECK_INTERNED(s))
9253		return;
9254	if (interned == NULL) {
9255		interned = PyDict_New();
9256		if (interned == NULL) {
9257			PyErr_Clear(); /* Don't leave an exception */
9258			return;
9259		}
9260	}
9261	/* It might be that the GetItem call fails even
9262	   though the key is present in the dictionary,
9263	   namely when this happens during a stack overflow. */
9264	Py_ALLOW_RECURSION
9265	t = PyDict_GetItem(interned, (PyObject *)s);
9266	Py_END_ALLOW_RECURSION
9267
9268	if (t) {
9269		Py_INCREF(t);
9270		Py_DECREF(*p);
9271		*p = t;
9272		return;
9273	}
9274
9275	PyThreadState_GET()->recursion_critical = 1;
9276	if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9277		PyErr_Clear();
9278		PyThreadState_GET()->recursion_critical = 0;
9279		return;
9280	}
9281	PyThreadState_GET()->recursion_critical = 0;
9282	/* The two references in interned are not counted by refcnt.
9283	   The deallocator will take care of this */
9284	Py_REFCNT(s) -= 2;
9285	PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
9286}
9287
9288void
9289PyUnicode_InternImmortal(PyObject **p)
9290{
9291	PyUnicode_InternInPlace(p);
9292	if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9293		PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9294		Py_INCREF(*p);
9295	}
9296}
9297
9298PyObject *
9299PyUnicode_InternFromString(const char *cp)
9300{
9301	PyObject *s = PyUnicode_FromString(cp);
9302	if (s == NULL)
9303		return NULL;
9304	PyUnicode_InternInPlace(&s);
9305	return s;
9306}
9307
9308void _Py_ReleaseInternedUnicodeStrings(void)
9309{
9310	PyObject *keys;
9311	PyUnicodeObject *s;
9312	Py_ssize_t i, n;
9313	Py_ssize_t immortal_size = 0, mortal_size = 0;
9314
9315	if (interned == NULL || !PyDict_Check(interned))
9316		return;
9317	keys = PyDict_Keys(interned);
9318	if (keys == NULL || !PyList_Check(keys)) {
9319		PyErr_Clear();
9320		return;
9321	}
9322
9323	/* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9324	   detector, interned unicode strings are not forcibly deallocated;
9325	   rather, we give them their stolen references back, and then clear
9326	   and DECREF the interned dict. */
9327
9328	n = PyList_GET_SIZE(keys);
9329	fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9330		n);
9331	for (i = 0; i < n; i++) {
9332		s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9333		switch (s->state) {
9334		case SSTATE_NOT_INTERNED:
9335			/* XXX Shouldn't happen */
9336			break;
9337		case SSTATE_INTERNED_IMMORTAL:
9338			Py_REFCNT(s) += 1;
9339			immortal_size += s->length;
9340			break;
9341		case SSTATE_INTERNED_MORTAL:
9342			Py_REFCNT(s) += 2;
9343			mortal_size += s->length;
9344			break;
9345		default:
9346			Py_FatalError("Inconsistent interned string state.");
9347		}
9348		s->state = SSTATE_NOT_INTERNED;
9349	}
9350	fprintf(stderr, "total size of all interned strings: "
9351			"%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9352			"mortal/immortal\n", mortal_size, immortal_size);
9353	Py_DECREF(keys);
9354	PyDict_Clear(interned);
9355	Py_DECREF(interned);
9356	interned = NULL;
9357}
9358
9359
9360/********************* Unicode Iterator **************************/
9361
/* Instance layout of the unicode string iterator ("str_iterator").
   An iterator holds a reference to the string it walks over together
   with the position of the next character to hand out. */
typedef struct {
	PyObject_HEAD
	Py_ssize_t it_index;     /* index into it_seq of the next item to return */
	PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
9367
9368static void
9369unicodeiter_dealloc(unicodeiterobject *it)
9370{
9371	_PyObject_GC_UNTRACK(it);
9372	Py_XDECREF(it->it_seq);
9373	PyObject_GC_Del(it);
9374}
9375
9376static int
9377unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9378{
9379	Py_VISIT(it->it_seq);
9380	return 0;
9381}
9382
9383static PyObject *
9384unicodeiter_next(unicodeiterobject *it)
9385{
9386	PyUnicodeObject *seq;
9387	PyObject *item;
9388
9389	assert(it != NULL);
9390	seq = it->it_seq;
9391	if (seq == NULL)
9392		return NULL;
9393	assert(PyUnicode_Check(seq));
9394
9395	if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9396		item = PyUnicode_FromUnicode(
9397                    PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
9398		if (item != NULL)
9399			++it->it_index;
9400		return item;
9401	}
9402
9403	Py_DECREF(seq);
9404	it->it_seq = NULL;
9405	return NULL;
9406}
9407
9408static PyObject *
9409unicodeiter_len(unicodeiterobject *it)
9410{
9411	Py_ssize_t len = 0;
9412	if (it->it_seq)
9413		len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9414	return PyLong_FromSsize_t(len);
9415}
9416
9417PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9418
9419static PyMethodDef unicodeiter_methods[] = {
9420	{"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
9421         length_hint_doc},
9422 	{NULL,		NULL}		/* sentinel */
9423};
9424
/* Type object for unicode string iterators, exposed under the name
   "str_iterator".  Instances are unicodeiterobject structs; the type
   takes part in garbage collection (Py_TPFLAGS_HAVE_GC with a traverse
   function), is its own iterator (PyObject_SelfIter) and yields items
   through unicodeiter_next. */
PyTypeObject PyUnicodeIter_Type = {
	PyVarObject_HEAD_INIT(&PyType_Type, 0)
	"str_iterator",			/* tp_name */
	sizeof(unicodeiterobject),		/* tp_basicsize */
	0,					/* tp_itemsize */
	/* methods */
	(destructor)unicodeiter_dealloc,	/* tp_dealloc */
	0,					/* tp_print */
	0,					/* tp_getattr */
	0,					/* tp_setattr */
	0,					/* tp_compare */
	0,					/* tp_repr */
	0,					/* tp_as_number */
	0,					/* tp_as_sequence */
	0,					/* tp_as_mapping */
	0,					/* tp_hash */
	0,					/* tp_call */
	0,					/* tp_str */
	PyObject_GenericGetAttr,		/* tp_getattro */
	0,					/* tp_setattro */
	0,					/* tp_as_buffer */
	Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
	0,					/* tp_doc */
	(traverseproc)unicodeiter_traverse,	/* tp_traverse */
	0,					/* tp_clear */
	0,					/* tp_richcompare */
	0,					/* tp_weaklistoffset */
	PyObject_SelfIter,			/* tp_iter */
	(iternextfunc)unicodeiter_next,		/* tp_iternext */
	unicodeiter_methods,			/* tp_methods */
	0,					/* tp_members (slot following tp_methods) */
};
9457
9458static PyObject *
9459unicode_iter(PyObject *seq)
9460{
9461	unicodeiterobject *it;
9462
9463	if (!PyUnicode_Check(seq)) {
9464		PyErr_BadInternalCall();
9465		return NULL;
9466	}
9467	it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9468	if (it == NULL)
9469		return NULL;
9470	it->it_index = 0;
9471	Py_INCREF(seq);
9472	it->it_seq = (PyUnicodeObject *)seq;
9473	_PyObject_GC_TRACK(it);
9474	return (PyObject *)it;
9475}
9476
9477size_t
9478Py_UNICODE_strlen(const Py_UNICODE *u)
9479{
9480    int res = 0;
9481    while(*u++)
9482        res++;
9483    return res;
9484}
9485
9486Py_UNICODE*
9487Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9488{
9489    Py_UNICODE *u = s1;
9490    while ((*u++ = *s2++));
9491    return s1;
9492}
9493
9494Py_UNICODE*
9495Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9496{
9497    Py_UNICODE *u = s1;
9498    while ((*u++ = *s2++))
9499        if (n-- == 0)
9500            break;
9501    return s1;
9502}
9503
9504int
9505Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9506{
9507    while (*s1 && *s2 && *s1 == *s2)
9508        s1++, s2++;
9509    if (*s1 && *s2)
9510        return (*s1 < *s2) ? -1 : +1;
9511    if (*s1)
9512        return 1;
9513    if (*s2)
9514        return -1;
9515    return 0;
9516}
9517
9518Py_UNICODE*
9519Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9520{
9521    const Py_UNICODE *p;
9522    for (p = s; *p; p++)
9523        if (*p == c)
9524            return (Py_UNICODE*)p;
9525    return NULL;
9526}
9527
9528
9529#ifdef __cplusplus
9530}
9531#endif
9532
9533
9534/*
9535Local variables:
9536c-basic-offset: 4
9537indent-tabs-mode: nil
9538End:
9539*/
9540