unicodeobject.c revision af14b79ccea4dd04376fc2720905d8d2f29c5b6a
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5Unicode Integration Proposal (see file Misc/unicode.txt).
6
7Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
10Copyright (c) Corporation for National Research Initiatives.
11
12--------------------------------------------------------------------
13The original string type implementation is:
14
15    Copyright (c) 1999 by Secret Labs AB
16    Copyright (c) 1999 by Fredrik Lundh
17
18By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
41
42#define PY_SSIZE_T_CLEAN
43#include "Python.h"
44#include "bytes_methods.h"
45
46#include "unicodeobject.h"
47#include "ucnhash.h"
48
49#ifdef MS_WINDOWS
50#include <windows.h>
51#endif
52
53/* Limit for the Unicode object free list */
54
55#define PyUnicode_MAXFREELIST       1024
56
57/* Limit for the Unicode object free list stay alive optimization.
58
59   The implementation will keep allocated Unicode memory intact for
60   all objects on the free list having a size less than this
61   limit. This reduces malloc() overhead for small Unicode objects.
62
63   At worst this will result in PyUnicode_MAXFREELIST *
64   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
65   malloc()-overhead) bytes of unused garbage.
66
67   Setting the limit to 0 effectively turns the feature off.
68
69   Note: This is an experimental feature ! If you get core dumps when
70   using Unicode objects, turn this feature off.
71
72*/
73
74#define KEEPALIVE_SIZE_LIMIT       9
75
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
84/* --- Globals ------------------------------------------------------------
85
86   The globals are initialized by the _PyUnicode_Init() API and should
87   not be used before calling that API.
88
89*/
90
91
92#ifdef __cplusplus
93extern "C" {
94#endif
95
96/* This dictionary holds all interned unicode strings.  Note that references
97   to strings in this dictionary are *not* counted in the string's ob_refcnt.
98   When the interned string reaches a refcnt of 0 the string deallocation
99   function will delete the reference from this dictionary.
100
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
103*/
104static PyObject *interned;
105
106/* Free list for Unicode objects */
107static PyUnicodeObject *free_list;
108static int numfree;
109
110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114   shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
117/* Default encoding to use and assume when NULL is passed as encoding
118   parameter; it is fixed to "utf-8".  Always use the
119   PyUnicode_GetDefaultEncoding() API to access this global.
120
121   Don't forget to alter Py_FileSystemDefaultEncoding if you change the
122   hard coded default!
123*/
124static const char unicode_default_encoding[] = "utf-8";
125
/* Fast detection of the most frequent whitespace characters.

   Lookup table with one entry per code point in the range 0..127
   (16 rows of 8 entries): an entry is 1 if the code point is one of
   the whitespace characters named in the comments below and 0
   otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
	0, 0, 0, 0, 0, 0, 0, 0,
//     case 0x0009: /* HORIZONTAL TABULATION */
//     case 0x000A: /* LINE FEED */
//     case 0x000B: /* VERTICAL TABULATION */
//     case 0x000C: /* FORM FEED */
//     case 0x000D: /* CARRIAGE RETURN */
	0, 1, 1, 1, 1, 1, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
//     case 0x001C: /* FILE SEPARATOR */
//     case 0x001D: /* GROUP SEPARATOR */
//     case 0x001E: /* RECORD SEPARATOR */
//     case 0x001F: /* UNIT SEPARATOR */
	0, 0, 0, 0, 1, 1, 1, 1,
//     case 0x0020: /* SPACE */
	1, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,

	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0
};
156
/* Same for linebreaks.

   Lookup table with one entry per code point in the range 0..127
   (16 rows of 8 entries): an entry is 1 if the code point is one of
   the line break characters named in the comments below and 0
   otherwise.  It is consulted by BLOOM_LINEBREAK() for characters
   below 128. */
static unsigned char ascii_linebreak[] = {
	0, 0, 0, 0, 0, 0, 0, 0,
//         0x000A, /* LINE FEED */
//         0x000D, /* CARRIAGE RETURN */
	0, 0, 1, 0, 0, 1, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
//         0x001C, /* FILE SEPARATOR */
//         0x001D, /* GROUP SEPARATOR */
//         0x001E, /* RECORD SEPARATOR */
	0, 0, 0, 0, 1, 1, 1, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,

	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0
};
182
183
184Py_UNICODE
185PyUnicode_GetMax(void)
186{
187#ifdef Py_UNICODE_WIDE
188	return 0x10FFFF;
189#else
190	/* This is actually an illegal character, so it should
191	   not be passed to unichr. */
192	return 0xFFFF;
193#endif
194}
195
196/* --- Bloom Filters ----------------------------------------------------- */
197
198/* stuff to implement simple "bloom filters" for Unicode characters.
199   to keep things simple, we use a single bitmask, using the least 5
200   bits from each unicode characters as the bit index. */
201
202/* the linebreak mask is set up by Unicode_Init below */
203
/* Integer type holding the single-word bloom bitmask. */
#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the low 5 bits of `ch` is set in
   `mask`.  The shift is performed on an unsigned long so that bit 31
   can be selected without overflowing a signed int, and `mask` is
   parenthesized so that compound expressions can be passed safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Non-zero if `ch` is a line break: characters below 128 are looked up
   in ascii_linebreak[]; others are first filtered through the
   bloom_linebreak mask and then confirmed by Py_UNICODE_ISLINEBREAK(). */
#define BLOOM_LINEBREAK(ch) \
    ((ch) < 128U ? ascii_linebreak[(ch)] : \
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
213
214Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
215{
216    /* calculate simple bloom-style bitmask for a given unicode string */
217
218    long mask;
219    Py_ssize_t i;
220
221    mask = 0;
222    for (i = 0; i < len; i++)
223        mask |= (1 << (ptr[i] & 0x1F));
224
225    return mask;
226}
227
228Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
229{
230    Py_ssize_t i;
231
232    for (i = 0; i < setlen; i++)
233        if (set[i] == chr)
234            return 1;
235
236    return 0;
237}
238
/* Non-zero if `chr` passes the bloom filter `mask` and is really found
   in `set`.  The whole expansion is parenthesized so the macro can be
   negated or combined with other operators without changing meaning. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
241
242/* --- Unicode Object ----------------------------------------------------- */
243
244static
245int unicode_resize(register PyUnicodeObject *unicode,
246                      Py_ssize_t length)
247{
248    void *oldstr;
249
250    /* Shortcut if there's nothing much to do. */
251    if (unicode->length == length)
252	goto reset;
253
254    /* Resizing shared object (unicode_empty or single character
255       objects) in-place is not allowed. Use PyUnicode_Resize()
256       instead ! */
257
258    if (unicode == unicode_empty ||
259	(unicode->length == 1 &&
260	 unicode->str[0] < 256U &&
261	 unicode_latin1[unicode->str[0]] == unicode)) {
262        PyErr_SetString(PyExc_SystemError,
263                        "can't resize shared str objects");
264        return -1;
265    }
266
267    /* We allocate one more byte to make sure the string is Ux0000 terminated.
268       The overallocation is also used by fastsearch, which assumes that it's
269       safe to look at str[length] (without making any assumptions about what
270       it contains). */
271
272    oldstr = unicode->str;
273    unicode->str = PyObject_REALLOC(unicode->str,
274				    sizeof(Py_UNICODE) * (length + 1));
275    if (!unicode->str) {
276	unicode->str = (Py_UNICODE *)oldstr;
277        PyErr_NoMemory();
278        return -1;
279    }
280    unicode->str[length] = 0;
281    unicode->length = length;
282
283 reset:
284    /* Reset the object caches */
285    if (unicode->defenc) {
286        Py_DECREF(unicode->defenc);
287        unicode->defenc = NULL;
288    }
289    unicode->hash = -1;
290
291    return 0;
292}
293
/* We allocate one more Py_UNICODE element to make sure the string is
   Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.
297
298   XXX This allocator could further be enhanced by assuring that the
299       free list never reduces its size below 1.
300
301*/
302
303static
304PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
305{
306    register PyUnicodeObject *unicode;
307
308    /* Optimization for empty strings */
309    if (length == 0 && unicode_empty != NULL) {
310        Py_INCREF(unicode_empty);
311        return unicode_empty;
312    }
313
314    /* Unicode freelist & memory allocation */
315    if (free_list) {
316        unicode = free_list;
317        free_list = *(PyUnicodeObject **)unicode;
318        numfree--;
319	if (unicode->str) {
320	    /* Keep-Alive optimization: we only upsize the buffer,
321	       never downsize it. */
322	    if ((unicode->length < length) &&
323                unicode_resize(unicode, length) < 0) {
324		PyObject_DEL(unicode->str);
325		unicode->str = NULL;
326	    }
327	}
328        else {
329	    size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
330	    unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
331        }
332        PyObject_INIT(unicode, &PyUnicode_Type);
333    }
334    else {
335	size_t new_size;
336        unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
337        if (unicode == NULL)
338            return NULL;
339	new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
340	unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
341    }
342
343    if (!unicode->str) {
344	PyErr_NoMemory();
345	goto onError;
346    }
347    /* Initialize the first element to guard against cases where
348     * the caller fails before initializing str -- unicode_resize()
349     * reads str[0], and the Keep-Alive optimization can keep memory
350     * allocated for str alive across a call to unicode_dealloc(unicode).
351     * We don't want unicode_resize to read uninitialized memory in
352     * that case.
353     */
354    unicode->str[0] = 0;
355    unicode->str[length] = 0;
356    unicode->length = length;
357    unicode->hash = -1;
358    unicode->state = 0;
359    unicode->defenc = NULL;
360    return unicode;
361
362 onError:
363    /* XXX UNREF/NEWREF interface should be more symmetrical */
364    _Py_DEC_REFTOTAL;
365    _Py_ForgetReference((PyObject *)unicode);
366    PyObject_Del(unicode);
367    return NULL;
368}
369
370static
371void unicode_dealloc(register PyUnicodeObject *unicode)
372{
373    switch (PyUnicode_CHECK_INTERNED(unicode)) {
374        case SSTATE_NOT_INTERNED:
375            break;
376
377        case SSTATE_INTERNED_MORTAL:
378            /* revive dead object temporarily for DelItem */
379            Py_REFCNT(unicode) = 3;
380            if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
381                Py_FatalError(
382                    "deletion of interned string failed");
383            break;
384
385        case SSTATE_INTERNED_IMMORTAL:
386            Py_FatalError("Immortal interned string died.");
387
388        default:
389            Py_FatalError("Inconsistent interned string state.");
390    }
391
392    if (PyUnicode_CheckExact(unicode) &&
393	numfree < PyUnicode_MAXFREELIST) {
394        /* Keep-Alive optimization */
395	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
396	    PyObject_DEL(unicode->str);
397	    unicode->str = NULL;
398	    unicode->length = 0;
399	}
400	if (unicode->defenc) {
401	    Py_DECREF(unicode->defenc);
402	    unicode->defenc = NULL;
403	}
404	/* Add to free list */
405        *(PyUnicodeObject **)unicode = free_list;
406        free_list = unicode;
407        numfree++;
408    }
409    else {
410	PyObject_DEL(unicode->str);
411	Py_XDECREF(unicode->defenc);
412	Py_TYPE(unicode)->tp_free((PyObject *)unicode);
413    }
414}
415
416int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
417{
418    register PyUnicodeObject *v;
419
420    /* Argument checks */
421    if (unicode == NULL) {
422	PyErr_BadInternalCall();
423	return -1;
424    }
425    v = (PyUnicodeObject *)*unicode;
426    if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
427	PyErr_BadInternalCall();
428	return -1;
429    }
430
431    /* Resizing unicode_empty and single character objects is not
432       possible since these are being shared. We simply return a fresh
433       copy with the same Unicode content. */
434    if (v->length != length &&
435	(v == unicode_empty || v->length == 1)) {
436	PyUnicodeObject *w = _PyUnicode_New(length);
437	if (w == NULL)
438	    return -1;
439	Py_UNICODE_COPY(w->str, v->str,
440			length < v->length ? length : v->length);
441	Py_DECREF(*unicode);
442	*unicode = (PyObject *)w;
443	return 0;
444    }
445
446    /* Note that we don't have to modify *unicode for unshared Unicode
447       objects, since we can modify them in-place. */
448    return unicode_resize(v, length);
449}
450
451/* Internal API for use in unicodeobject.c only ! */
452#define _PyUnicode_Resize(unicodevar, length) \
453        PyUnicode_Resize(((PyObject **)(unicodevar)), length)
454
455PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
456				Py_ssize_t size)
457{
458    PyUnicodeObject *unicode;
459
460    /* If the Unicode data is known at construction time, we can apply
461       some optimizations which share commonly used objects. */
462    if (u != NULL) {
463
464	/* Optimization for empty strings */
465	if (size == 0 && unicode_empty != NULL) {
466	    Py_INCREF(unicode_empty);
467	    return (PyObject *)unicode_empty;
468	}
469
470	/* Single character Unicode objects in the Latin-1 range are
471	   shared when using this constructor */
472	if (size == 1 && *u < 256) {
473	    unicode = unicode_latin1[*u];
474	    if (!unicode) {
475		unicode = _PyUnicode_New(1);
476		if (!unicode)
477		    return NULL;
478		unicode->str[0] = *u;
479		unicode_latin1[*u] = unicode;
480	    }
481	    Py_INCREF(unicode);
482	    return (PyObject *)unicode;
483	}
484    }
485
486    unicode = _PyUnicode_New(size);
487    if (!unicode)
488        return NULL;
489
490    /* Copy the Unicode data into the new object */
491    if (u != NULL)
492	Py_UNICODE_COPY(unicode->str, u, size);
493
494    return (PyObject *)unicode;
495}
496
497PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
498{
499    PyUnicodeObject *unicode;
500
501	if (size < 0) {
502		PyErr_SetString(PyExc_SystemError,
503		    "Negative size passed to PyUnicode_FromStringAndSize");
504		return NULL;
505	}
506
507    /* If the Unicode data is known at construction time, we can apply
508       some optimizations which share commonly used objects.
509       Also, this means the input must be UTF-8, so fall back to the
510       UTF-8 decoder at the end. */
511    if (u != NULL) {
512
513	/* Optimization for empty strings */
514	if (size == 0 && unicode_empty != NULL) {
515	    Py_INCREF(unicode_empty);
516	    return (PyObject *)unicode_empty;
517	}
518
519	/* Single characters are shared when using this constructor.
520           Restrict to ASCII, since the input must be UTF-8. */
521	if (size == 1 && Py_CHARMASK(*u) < 128) {
522	    unicode = unicode_latin1[Py_CHARMASK(*u)];
523	    if (!unicode) {
524		unicode = _PyUnicode_New(1);
525		if (!unicode)
526		    return NULL;
527		unicode->str[0] = Py_CHARMASK(*u);
528		unicode_latin1[Py_CHARMASK(*u)] = unicode;
529	    }
530	    Py_INCREF(unicode);
531	    return (PyObject *)unicode;
532	}
533
534        return PyUnicode_DecodeUTF8(u, size, NULL);
535    }
536
537    unicode = _PyUnicode_New(size);
538    if (!unicode)
539        return NULL;
540
541    return (PyObject *)unicode;
542}
543
544PyObject *PyUnicode_FromString(const char *u)
545{
546    size_t size = strlen(u);
547    if (size > PY_SSIZE_T_MAX) {
548        PyErr_SetString(PyExc_OverflowError, "input too long");
549        return NULL;
550    }
551
552    return PyUnicode_FromStringAndSize(u, size);
553}
554
555#ifdef HAVE_WCHAR_H
556
557PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
558				 Py_ssize_t size)
559{
560    PyUnicodeObject *unicode;
561
562    if (w == NULL) {
563        if (size == 0)
564            return PyUnicode_FromStringAndSize(NULL, 0);
565	PyErr_BadInternalCall();
566	return NULL;
567    }
568
569    if (size == -1) {
570        size = wcslen(w);
571    }
572
573    unicode = _PyUnicode_New(size);
574    if (!unicode)
575        return NULL;
576
577    /* Copy the wchar_t data into the new object */
578#ifdef HAVE_USABLE_WCHAR_T
579    memcpy(unicode->str, w, size * sizeof(wchar_t));
580#else
581    {
582	register Py_UNICODE *u;
583	register Py_ssize_t i;
584	u = PyUnicode_AS_UNICODE(unicode);
585	for (i = size; i > 0; i--)
586	    *u++ = *w++;
587    }
588#endif
589
590    return (PyObject *)unicode;
591}
592
593static void
594makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
595{
596	*fmt++ = '%';
597	if (width) {
598		if (zeropad)
599			*fmt++ = '0';
600		fmt += sprintf(fmt, "%d", width);
601	}
602	if (precision)
603		fmt += sprintf(fmt, ".%d", precision);
604	if (longflag)
605		*fmt++ = 'l';
606	else if (size_tflag) {
607		char *f = PY_FORMAT_SIZE_T;
608		while (*f)
609			*fmt++ = *f++;
610	}
611	*fmt++ = c;
612	*fmt = '\0';
613}
614
615#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
616
617PyObject *
618PyUnicode_FromFormatV(const char *format, va_list vargs)
619{
620	va_list count;
621	Py_ssize_t callcount = 0;
622	PyObject **callresults = NULL;
623	PyObject **callresult = NULL;
624	Py_ssize_t n = 0;
625	int width = 0;
626	int precision = 0;
627	int zeropad;
628	const char* f;
629	Py_UNICODE *s;
630	PyObject *string;
631	/* used by sprintf */
632	char buffer[21];
633	/* use abuffer instead of buffer, if we need more space
634	 * (which can happen if there's a format specifier with width). */
635	char *abuffer = NULL;
636	char *realbuffer;
637	Py_ssize_t abuffersize = 0;
638	char fmt[60]; /* should be enough for %0width.precisionld */
639	const char *copy;
640
641#ifdef VA_LIST_IS_ARRAY
642	Py_MEMCPY(count, vargs, sizeof(va_list));
643#else
644#ifdef  __va_copy
645	__va_copy(count, vargs);
646#else
647	count = vargs;
648#endif
649#endif
650	/* step 1: count the number of %S/%R/%A format specifications
651	 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII() for
652	 * these objects once during step 3 and put the result in
653	   an array) */
654	for (f = format; *f; f++) {
655		if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A'))
656			++callcount;
657	}
658	/* step 2: allocate memory for the results of
659	 * PyObject_Str()/PyObject_Repr() calls */
660	if (callcount) {
661		callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
662		if (!callresults) {
663			PyErr_NoMemory();
664			return NULL;
665		}
666		callresult = callresults;
667	}
668	/* step 3: figure out how large a buffer we need */
669	for (f = format; *f; f++) {
670		if (*f == '%') {
671			const char* p = f;
672			width = 0;
673			while (ISDIGIT((unsigned)*f))
674				width = (width*10) + *f++ - '0';
675			while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
676				;
677
678			/* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
679			 * they don't affect the amount of space we reserve.
680			 */
681			if ((*f == 'l' || *f == 'z') &&
682					(f[1] == 'd' || f[1] == 'u'))
683                                ++f;
684
685			switch (*f) {
686			case 'c':
687				(void)va_arg(count, int);
688				/* fall through... */
689			case '%':
690				n++;
691				break;
692			case 'd': case 'u': case 'i': case 'x':
693				(void) va_arg(count, int);
694				/* 20 bytes is enough to hold a 64-bit
695				   integer.  Decimal takes the most space.
696				   This isn't enough for octal.
697				   If a width is specified we need more
698				   (which we allocate later). */
699				if (width < 20)
700					width = 20;
701				n += width;
702				if (abuffersize < width)
703					abuffersize = width;
704				break;
705			case 's':
706			{
707				/* UTF-8 */
708				unsigned char*s;
709				s = va_arg(count, unsigned char*);
710				while (*s) {
711					if (*s < 128) {
712						n++; s++;
713					} else if (*s < 0xc0) {
714						/* invalid UTF-8 */
715						n++; s++;
716					} else if (*s < 0xc0) {
717						n++;
718						s++; if(!*s)break;
719						s++;
720					} else if (*s < 0xe0) {
721						n++;
722						s++; if(!*s)break;
723						s++; if(!*s)break;
724						s++;
725					} else {
726						#ifdef Py_UNICODE_WIDE
727						n++;
728						#else
729						n+=2;
730						#endif
731						s++; if(!*s)break;
732						s++; if(!*s)break;
733						s++; if(!*s)break;
734						s++;
735					}
736				}
737				break;
738			}
739			case 'U':
740			{
741				PyObject *obj = va_arg(count, PyObject *);
742				assert(obj && PyUnicode_Check(obj));
743				n += PyUnicode_GET_SIZE(obj);
744				break;
745			}
746			case 'V':
747			{
748				PyObject *obj = va_arg(count, PyObject *);
749				const char *str = va_arg(count, const char *);
750				assert(obj || str);
751				assert(!obj || PyUnicode_Check(obj));
752				if (obj)
753					n += PyUnicode_GET_SIZE(obj);
754				else
755					n += strlen(str);
756				break;
757			}
758			case 'S':
759			{
760				PyObject *obj = va_arg(count, PyObject *);
761				PyObject *str;
762				assert(obj);
763				str = PyObject_Str(obj);
764				if (!str)
765					goto fail;
766				n += PyUnicode_GET_SIZE(str);
767				/* Remember the str and switch to the next slot */
768				*callresult++ = str;
769				break;
770			}
771			case 'R':
772			{
773				PyObject *obj = va_arg(count, PyObject *);
774				PyObject *repr;
775				assert(obj);
776				repr = PyObject_Repr(obj);
777				if (!repr)
778					goto fail;
779				n += PyUnicode_GET_SIZE(repr);
780				/* Remember the repr and switch to the next slot */
781				*callresult++ = repr;
782				break;
783			}
784			case 'A':
785			{
786				PyObject *obj = va_arg(count, PyObject *);
787				PyObject *ascii;
788				assert(obj);
789				ascii = PyObject_ASCII(obj);
790				if (!ascii)
791					goto fail;
792				n += PyUnicode_GET_SIZE(ascii);
793				/* Remember the repr and switch to the next slot */
794				*callresult++ = ascii;
795				break;
796			}
797			case 'p':
798				(void) va_arg(count, int);
799				/* maximum 64-bit pointer representation:
800				 * 0xffffffffffffffff
801				 * so 19 characters is enough.
802				 * XXX I count 18 -- what's the extra for?
803				 */
804				n += 19;
805				break;
806			default:
807				/* if we stumble upon an unknown
808				   formatting code, copy the rest of
809				   the format string to the output
810				   string. (we cannot just skip the
811				   code, since there's no way to know
812				   what's in the argument list) */
813				n += strlen(p);
814				goto expand;
815			}
816		} else
817			n++;
818	}
819 expand:
820	if (abuffersize > 20) {
821		abuffer = PyObject_Malloc(abuffersize);
822		if (!abuffer) {
823			PyErr_NoMemory();
824			goto fail;
825		}
826		realbuffer = abuffer;
827	}
828	else
829		realbuffer = buffer;
830	/* step 4: fill the buffer */
831	/* Since we've analyzed how much space we need for the worst case,
832	   we don't have to resize the string.
833	   There can be no errors beyond this point. */
834	string = PyUnicode_FromUnicode(NULL, n);
835	if (!string)
836		goto fail;
837
838	s = PyUnicode_AS_UNICODE(string);
839	callresult = callresults;
840
841	for (f = format; *f; f++) {
842		if (*f == '%') {
843			const char* p = f++;
844			int longflag = 0;
845			int size_tflag = 0;
846			zeropad = (*f == '0');
847			/* parse the width.precision part */
848			width = 0;
849			while (ISDIGIT((unsigned)*f))
850				width = (width*10) + *f++ - '0';
851			precision = 0;
852			if (*f == '.') {
853				f++;
854				while (ISDIGIT((unsigned)*f))
855					precision = (precision*10) + *f++ - '0';
856			}
857			/* handle the long flag, but only for %ld and %lu.
858			   others can be added when necessary. */
859			if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
860				longflag = 1;
861				++f;
862			}
863			/* handle the size_t flag. */
864			if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
865				size_tflag = 1;
866				++f;
867			}
868
869			switch (*f) {
870			case 'c':
871				*s++ = va_arg(vargs, int);
872				break;
873			case 'd':
874				makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
875				if (longflag)
876					sprintf(realbuffer, fmt, va_arg(vargs, long));
877				else if (size_tflag)
878					sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
879				else
880					sprintf(realbuffer, fmt, va_arg(vargs, int));
881				appendstring(realbuffer);
882				break;
883			case 'u':
884				makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
885				if (longflag)
886					sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
887				else if (size_tflag)
888					sprintf(realbuffer, fmt, va_arg(vargs, size_t));
889				else
890					sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
891				appendstring(realbuffer);
892				break;
893			case 'i':
894				makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
895				sprintf(realbuffer, fmt, va_arg(vargs, int));
896				appendstring(realbuffer);
897				break;
898			case 'x':
899				makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
900				sprintf(realbuffer, fmt, va_arg(vargs, int));
901				appendstring(realbuffer);
902				break;
903			case 's':
904			{
905				/* Parameter must be UTF-8 encoded.
906				   In case of encoding errors, use
907				   the replacement character. */
908				PyObject *u;
909				p = va_arg(vargs, char*);
910				u = PyUnicode_DecodeUTF8(p, strlen(p),
911							 "replace");
912				if (!u)
913					goto fail;
914				Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
915						PyUnicode_GET_SIZE(u));
916				s += PyUnicode_GET_SIZE(u);
917				Py_DECREF(u);
918				break;
919			}
920			case 'U':
921			{
922				PyObject *obj = va_arg(vargs, PyObject *);
923				Py_ssize_t size = PyUnicode_GET_SIZE(obj);
924				Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
925				s += size;
926				break;
927			}
928			case 'V':
929			{
930				PyObject *obj = va_arg(vargs, PyObject *);
931				const char *str = va_arg(vargs, const char *);
932				if (obj) {
933					Py_ssize_t size = PyUnicode_GET_SIZE(obj);
934					Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
935					s += size;
936				} else {
937					appendstring(str);
938				}
939				break;
940			}
941			case 'S':
942			case 'R':
943			{
944				Py_UNICODE *ucopy;
945				Py_ssize_t usize;
946				Py_ssize_t upos;
947				/* unused, since we already have the result */
948				(void) va_arg(vargs, PyObject *);
949				ucopy = PyUnicode_AS_UNICODE(*callresult);
950				usize = PyUnicode_GET_SIZE(*callresult);
951				for (upos = 0; upos<usize;)
952					*s++ = ucopy[upos++];
953				/* We're done with the unicode()/repr() => forget it */
954				Py_DECREF(*callresult);
955				/* switch to next unicode()/repr() result */
956				++callresult;
957				break;
958			}
959			case 'p':
960				sprintf(buffer, "%p", va_arg(vargs, void*));
961				/* %p is ill-defined:  ensure leading 0x. */
962				if (buffer[1] == 'X')
963					buffer[1] = 'x';
964				else if (buffer[1] != 'x') {
965					memmove(buffer+2, buffer, strlen(buffer)+1);
966					buffer[0] = '0';
967					buffer[1] = 'x';
968				}
969				appendstring(buffer);
970				break;
971			case '%':
972				*s++ = '%';
973				break;
974			default:
975				appendstring(p);
976				goto end;
977			}
978		} else
979			*s++ = *f;
980	}
981
982 end:
983	if (callresults)
984		PyObject_Free(callresults);
985	if (abuffer)
986		PyObject_Free(abuffer);
987	_PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
988	return string;
989 fail:
990	if (callresults) {
991		PyObject **callresult2 = callresults;
992		while (callresult2 < callresult) {
993			Py_DECREF(*callresult2);
994			++callresult2;
995		}
996		PyObject_Free(callresults);
997	}
998	if (abuffer)
999		PyObject_Free(abuffer);
1000	return NULL;
1001}
1002
1003#undef appendstring
1004
1005PyObject *
1006PyUnicode_FromFormat(const char *format, ...)
1007{
1008	PyObject* ret;
1009	va_list vargs;
1010
1011#ifdef HAVE_STDARG_PROTOTYPES
1012	va_start(vargs, format);
1013#else
1014	va_start(vargs);
1015#endif
1016	ret = PyUnicode_FromFormatV(format, vargs);
1017	va_end(vargs);
1018	return ret;
1019}
1020
1021Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1022				wchar_t *w,
1023				Py_ssize_t size)
1024{
1025    if (unicode == NULL) {
1026	PyErr_BadInternalCall();
1027	return -1;
1028    }
1029
1030    /* If possible, try to copy the 0-termination as well */
1031    if (size > PyUnicode_GET_SIZE(unicode))
1032	size = PyUnicode_GET_SIZE(unicode) + 1;
1033
1034#ifdef HAVE_USABLE_WCHAR_T
1035    memcpy(w, unicode->str, size * sizeof(wchar_t));
1036#else
1037    {
1038	register Py_UNICODE *u;
1039	register Py_ssize_t i;
1040	u = PyUnicode_AS_UNICODE(unicode);
1041	for (i = size; i > 0; i--)
1042	    *w++ = *u++;
1043    }
1044#endif
1045
1046    if (size > PyUnicode_GET_SIZE(unicode))
1047        return PyUnicode_GET_SIZE(unicode);
1048    else
1049    return size;
1050}
1051
1052#endif
1053
1054PyObject *PyUnicode_FromOrdinal(int ordinal)
1055{
1056    Py_UNICODE s[2];
1057
1058    if (ordinal < 0 || ordinal > 0x10ffff) {
1059	PyErr_SetString(PyExc_ValueError,
1060			"chr() arg not in range(0x110000)");
1061	return NULL;
1062    }
1063
1064#ifndef Py_UNICODE_WIDE
1065    if (ordinal > 0xffff) {
1066        ordinal -= 0x10000;
1067        s[0] = 0xD800 | (ordinal >> 10);
1068        s[1] = 0xDC00 | (ordinal & 0x3FF);
1069        return PyUnicode_FromUnicode(s, 2);
1070    }
1071#endif
1072
1073    s[0] = (Py_UNICODE)ordinal;
1074    return PyUnicode_FromUnicode(s, 1);
1075}
1076
1077PyObject *PyUnicode_FromObject(register PyObject *obj)
1078{
1079    /* XXX Perhaps we should make this API an alias of
1080           PyObject_Str() instead ?! */
1081    if (PyUnicode_CheckExact(obj)) {
1082	Py_INCREF(obj);
1083	return obj;
1084    }
1085    if (PyUnicode_Check(obj)) {
1086	/* For a Unicode subtype that's not a Unicode object,
1087	   return a true Unicode object with the same data. */
1088	return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1089				     PyUnicode_GET_SIZE(obj));
1090    }
1091    PyErr_Format(PyExc_TypeError,
1092                 "Can't convert '%.100s' object to str implicitly",
1093                 Py_TYPE(obj)->tp_name);
1094    return NULL;
1095}
1096
1097PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1098				      const char *encoding,
1099				      const char *errors)
1100{
1101    const char *s = NULL;
1102    Py_ssize_t len;
1103    PyObject *v;
1104
1105    if (obj == NULL) {
1106	PyErr_BadInternalCall();
1107	return NULL;
1108    }
1109
1110    if (PyUnicode_Check(obj)) {
1111	PyErr_SetString(PyExc_TypeError,
1112			"decoding str is not supported");
1113	return NULL;
1114	}
1115
1116    /* Coerce object */
1117    if (PyBytes_Check(obj)) {
1118        s = PyBytes_AS_STRING(obj);
1119        len = PyBytes_GET_SIZE(obj);
1120    }
1121    else if (PyByteArray_Check(obj)) {
1122        s = PyByteArray_AS_STRING(obj);
1123        len = PyByteArray_GET_SIZE(obj);
1124    }
1125    else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1126	/* Overwrite the error message with something more useful in
1127	   case of a TypeError. */
1128	if (PyErr_ExceptionMatches(PyExc_TypeError))
1129            PyErr_Format(PyExc_TypeError,
1130			 "coercing to str: need string or buffer, "
1131			 "%.80s found",
1132		     Py_TYPE(obj)->tp_name);
1133	goto onError;
1134    }
1135
1136    /* Convert to Unicode */
1137    if (len == 0) {
1138	Py_INCREF(unicode_empty);
1139	v = (PyObject *)unicode_empty;
1140    }
1141    else
1142	v = PyUnicode_Decode(s, len, encoding, errors);
1143
1144    return v;
1145
1146 onError:
1147    return NULL;
1148}
1149
1150PyObject *PyUnicode_Decode(const char *s,
1151			   Py_ssize_t size,
1152			   const char *encoding,
1153			   const char *errors)
1154{
1155    PyObject *buffer = NULL, *unicode;
1156    Py_buffer info;
1157    char lower[20];  /* Enough for any encoding name we recognize */
1158    char *l;
1159    const char *e;
1160
1161    if (encoding == NULL)
1162        encoding = PyUnicode_GetDefaultEncoding();
1163
1164    /* Convert encoding to lower case and replace '_' with '-' in order to
1165       catch e.g. UTF_8 */
1166    e = encoding;
1167    l = lower;
1168    while (*e && l < &lower[(sizeof lower) - 2]) {
1169        if (ISUPPER(*e)) {
1170            *l++ = TOLOWER(*e++);
1171        }
1172        else if (*e == '_') {
1173            *l++ = '-';
1174            e++;
1175        }
1176        else {
1177            *l++ = *e++;
1178        }
1179    }
1180    *l = '\0';
1181
1182    /* Shortcuts for common default encodings */
1183    if (strcmp(lower, "utf-8") == 0)
1184        return PyUnicode_DecodeUTF8(s, size, errors);
1185    else if ((strcmp(lower, "latin-1") == 0) ||
1186             (strcmp(lower, "iso-8859-1") == 0))
1187        return PyUnicode_DecodeLatin1(s, size, errors);
1188#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1189    else if (strcmp(lower, "mbcs") == 0)
1190        return PyUnicode_DecodeMBCS(s, size, errors);
1191#endif
1192    else if (strcmp(lower, "ascii") == 0)
1193        return PyUnicode_DecodeASCII(s, size, errors);
1194    else if (strcmp(lower, "utf-16") == 0)
1195        return PyUnicode_DecodeUTF16(s, size, errors, 0);
1196    else if (strcmp(lower, "utf-32") == 0)
1197        return PyUnicode_DecodeUTF32(s, size, errors, 0);
1198
1199    /* Decode via the codec registry */
1200    buffer = NULL;
1201    if (PyBuffer_FillInfo(&info, (void *)s, size, 1, PyBUF_SIMPLE) < 0)
1202        goto onError;
1203    buffer = PyMemoryView_FromMemory(&info);
1204    if (buffer == NULL)
1205        goto onError;
1206    unicode = PyCodec_Decode(buffer, encoding, errors);
1207    if (unicode == NULL)
1208        goto onError;
1209    if (!PyUnicode_Check(unicode)) {
1210        PyErr_Format(PyExc_TypeError,
1211                     "decoder did not return a str object (type=%.400s)",
1212                     Py_TYPE(unicode)->tp_name);
1213        Py_DECREF(unicode);
1214        goto onError;
1215    }
1216    Py_DECREF(buffer);
1217    return unicode;
1218
1219 onError:
1220    Py_XDECREF(buffer);
1221    return NULL;
1222}
1223
1224PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1225                                    const char *encoding,
1226                                    const char *errors)
1227{
1228    PyObject *v;
1229
1230    if (!PyUnicode_Check(unicode)) {
1231        PyErr_BadArgument();
1232        goto onError;
1233    }
1234
1235    if (encoding == NULL)
1236	encoding = PyUnicode_GetDefaultEncoding();
1237
1238    /* Decode via the codec registry */
1239    v = PyCodec_Decode(unicode, encoding, errors);
1240    if (v == NULL)
1241        goto onError;
1242    return v;
1243
1244 onError:
1245    return NULL;
1246}
1247
1248PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1249                                     const char *encoding,
1250                                     const char *errors)
1251{
1252    PyObject *v;
1253
1254    if (!PyUnicode_Check(unicode)) {
1255        PyErr_BadArgument();
1256        goto onError;
1257    }
1258
1259    if (encoding == NULL)
1260	encoding = PyUnicode_GetDefaultEncoding();
1261
1262    /* Decode via the codec registry */
1263    v = PyCodec_Decode(unicode, encoding, errors);
1264    if (v == NULL)
1265        goto onError;
1266    if (!PyUnicode_Check(v)) {
1267        PyErr_Format(PyExc_TypeError,
1268                     "decoder did not return a str object (type=%.400s)",
1269                     Py_TYPE(v)->tp_name);
1270        Py_DECREF(v);
1271        goto onError;
1272    }
1273    return v;
1274
1275 onError:
1276    return NULL;
1277}
1278
1279PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1280			   Py_ssize_t size,
1281			   const char *encoding,
1282			   const char *errors)
1283{
1284    PyObject *v, *unicode;
1285
1286    unicode = PyUnicode_FromUnicode(s, size);
1287    if (unicode == NULL)
1288	return NULL;
1289    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1290    Py_DECREF(unicode);
1291    return v;
1292}
1293
1294PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1295                                    const char *encoding,
1296                                    const char *errors)
1297{
1298    PyObject *v;
1299
1300    if (!PyUnicode_Check(unicode)) {
1301        PyErr_BadArgument();
1302        goto onError;
1303    }
1304
1305    if (encoding == NULL)
1306	encoding = PyUnicode_GetDefaultEncoding();
1307
1308    /* Encode via the codec registry */
1309    v = PyCodec_Encode(unicode, encoding, errors);
1310    if (v == NULL)
1311        goto onError;
1312    return v;
1313
1314 onError:
1315    return NULL;
1316}
1317
1318PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1319                                    const char *encoding,
1320                                    const char *errors)
1321{
1322    PyObject *v;
1323
1324    if (!PyUnicode_Check(unicode)) {
1325        PyErr_BadArgument();
1326        goto onError;
1327    }
1328
1329    if (encoding == NULL)
1330	encoding = PyUnicode_GetDefaultEncoding();
1331
1332    /* Shortcuts for common default encodings */
1333    if (errors == NULL) {
1334	if (strcmp(encoding, "utf-8") == 0)
1335	    return PyUnicode_AsUTF8String(unicode);
1336	else if (strcmp(encoding, "latin-1") == 0)
1337	    return PyUnicode_AsLatin1String(unicode);
1338#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1339	else if (strcmp(encoding, "mbcs") == 0)
1340	    return PyUnicode_AsMBCSString(unicode);
1341#endif
1342	else if (strcmp(encoding, "ascii") == 0)
1343	    return PyUnicode_AsASCIIString(unicode);
1344    }
1345
1346    /* Encode via the codec registry */
1347    v = PyCodec_Encode(unicode, encoding, errors);
1348    if (v == NULL)
1349        goto onError;
1350    if (PyByteArray_Check(v)) {
1351        char msg[100];
1352        PyOS_snprintf(msg, sizeof(msg),
1353                      "encoder %s returned buffer instead of bytes",
1354                      encoding);
1355        if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
1356            v = NULL;
1357            goto onError;
1358        }
1359        v = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1360    }
1361    else if (!PyBytes_Check(v)) {
1362        PyErr_Format(PyExc_TypeError,
1363                     "encoder did not return a bytes object (type=%.400s)",
1364                     Py_TYPE(v)->tp_name);
1365        v = NULL;
1366    }
1367    return v;
1368
1369 onError:
1370    return NULL;
1371}
1372
1373PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1374                                     const char *encoding,
1375                                     const char *errors)
1376{
1377    PyObject *v;
1378
1379    if (!PyUnicode_Check(unicode)) {
1380        PyErr_BadArgument();
1381        goto onError;
1382    }
1383
1384    if (encoding == NULL)
1385	encoding = PyUnicode_GetDefaultEncoding();
1386
1387    /* Encode via the codec registry */
1388    v = PyCodec_Encode(unicode, encoding, errors);
1389    if (v == NULL)
1390        goto onError;
1391    if (!PyUnicode_Check(v)) {
1392        PyErr_Format(PyExc_TypeError,
1393                     "encoder did not return an str object (type=%.400s)",
1394                     Py_TYPE(v)->tp_name);
1395        Py_DECREF(v);
1396        goto onError;
1397    }
1398    return v;
1399
1400 onError:
1401    return NULL;
1402}
1403
1404PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1405					    const char *errors)
1406{
1407    PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1408    if (v)
1409        return v;
1410    if (errors != NULL)
1411        Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
1412    v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1413                             PyUnicode_GET_SIZE(unicode),
1414                             NULL);
1415    if (!v)
1416        return NULL;
1417    ((PyUnicodeObject *)unicode)->defenc = v;
1418    return v;
1419}
1420
1421PyObject*
1422PyUnicode_DecodeFSDefault(const char *s) {
1423    Py_ssize_t size = (Py_ssize_t)strlen(s);
1424    return PyUnicode_DecodeFSDefaultAndSize(s, size);
1425}
1426
1427PyObject*
1428PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1429{
1430    /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1431       can be undefined. If it is case, decode using UTF-8. The following assumes
1432       that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1433       bootstrapping process where the codecs aren't ready yet.
1434    */
1435    if (Py_FileSystemDefaultEncoding) {
1436#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1437        if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
1438            return PyUnicode_DecodeMBCS(s, size, "replace");
1439        }
1440#elif defined(__APPLE__)
1441        if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
1442            return PyUnicode_DecodeUTF8(s, size, "replace");
1443        }
1444#endif
1445        return PyUnicode_Decode(s, size,
1446                                Py_FileSystemDefaultEncoding,
1447                                "replace");
1448    }
1449    else {
1450        return PyUnicode_DecodeUTF8(s, size, "replace");
1451    }
1452}
1453
1454char*
1455_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
1456{
1457    PyObject *bytes;
1458    if (!PyUnicode_Check(unicode)) {
1459        PyErr_BadArgument();
1460        return NULL;
1461    }
1462    bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1463    if (bytes == NULL)
1464        return NULL;
1465    if (psize != NULL)
1466        *psize = PyBytes_GET_SIZE(bytes);
1467    return PyBytes_AS_STRING(bytes);
1468}
1469
1470char*
1471_PyUnicode_AsString(PyObject *unicode)
1472{
1473    return _PyUnicode_AsStringAndSize(unicode, NULL);
1474}
1475
1476Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1477{
1478    if (!PyUnicode_Check(unicode)) {
1479        PyErr_BadArgument();
1480        goto onError;
1481    }
1482    return PyUnicode_AS_UNICODE(unicode);
1483
1484 onError:
1485    return NULL;
1486}
1487
1488Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1489{
1490    if (!PyUnicode_Check(unicode)) {
1491        PyErr_BadArgument();
1492        goto onError;
1493    }
1494    return PyUnicode_GET_SIZE(unicode);
1495
1496 onError:
1497    return -1;
1498}
1499
1500const char *PyUnicode_GetDefaultEncoding(void)
1501{
1502    return unicode_default_encoding;
1503}
1504
1505int PyUnicode_SetDefaultEncoding(const char *encoding)
1506{
1507    if (strcmp(encoding, unicode_default_encoding) != 0) {
1508        PyErr_Format(PyExc_ValueError,
1509                     "Can only set default encoding to %s",
1510                     unicode_default_encoding);
1511        return -1;
1512    }
1513    return 0;
1514}
1515
1516/* error handling callback helper:
1517   build arguments, call the callback and check the arguments,
1518   if no exception occurred, copy the replacement to the output
1519   and adjust various state variables.
1520   return 0 on success, -1 on error
1521*/
1522
1523static
1524int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1525                 const char *encoding, const char *reason,
1526                 const char **input, const char **inend, Py_ssize_t *startinpos,
1527                 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1528                 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
1529{
1530    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
1531
1532    PyObject *restuple = NULL;
1533    PyObject *repunicode = NULL;
1534    Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1535    Py_ssize_t insize;
1536    Py_ssize_t requiredsize;
1537    Py_ssize_t newpos;
1538    Py_UNICODE *repptr;
1539    PyObject *inputobj = NULL;
1540    Py_ssize_t repsize;
1541    int res = -1;
1542
1543    if (*errorHandler == NULL) {
1544	*errorHandler = PyCodec_LookupError(errors);
1545	if (*errorHandler == NULL)
1546	   goto onError;
1547    }
1548
1549    if (*exceptionObject == NULL) {
1550    	*exceptionObject = PyUnicodeDecodeError_Create(
1551	    encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
1552	if (*exceptionObject == NULL)
1553	   goto onError;
1554    }
1555    else {
1556	if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1557	    goto onError;
1558	if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1559	    goto onError;
1560	if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1561	    goto onError;
1562    }
1563
1564    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1565    if (restuple == NULL)
1566	goto onError;
1567    if (!PyTuple_Check(restuple)) {
1568	PyErr_Format(PyExc_TypeError, &argparse[4]);
1569	goto onError;
1570    }
1571    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1572	goto onError;
1573
1574    /* Copy back the bytes variables, which might have been modified by the
1575       callback */
1576    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1577    if (!inputobj)
1578        goto onError;
1579    if (!PyBytes_Check(inputobj)) {
1580	PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1581    }
1582    *input = PyBytes_AS_STRING(inputobj);
1583    insize = PyBytes_GET_SIZE(inputobj);
1584    *inend = *input + insize;
1585    /* we can DECREF safely, as the exception has another reference,
1586       so the object won't go away. */
1587    Py_DECREF(inputobj);
1588
1589    if (newpos<0)
1590	newpos = insize+newpos;
1591    if (newpos<0 || newpos>insize) {
1592	PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1593	goto onError;
1594    }
1595
1596    /* need more space? (at least enough for what we
1597       have+the replacement+the rest of the string (starting
1598       at the new input position), so we won't have to check space
1599       when there are no errors in the rest of the string) */
1600    repptr = PyUnicode_AS_UNICODE(repunicode);
1601    repsize = PyUnicode_GET_SIZE(repunicode);
1602    requiredsize = *outpos + repsize + insize-newpos;
1603    if (requiredsize > outsize) {
1604	if (requiredsize<2*outsize)
1605	    requiredsize = 2*outsize;
1606	if (PyUnicode_Resize(output, requiredsize) < 0)
1607	    goto onError;
1608	*outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1609    }
1610    *endinpos = newpos;
1611    *inptr = *input + newpos;
1612    Py_UNICODE_COPY(*outptr, repptr, repsize);
1613    *outptr += repsize;
1614    *outpos += repsize;
1615
1616    /* we made it! */
1617    res = 0;
1618
1619    onError:
1620    Py_XDECREF(restuple);
1621    return res;
1622}
1623
1624/* --- UTF-7 Codec -------------------------------------------------------- */
1625
1626/* see RFC2152 for details */
1627
/* Classification of the 128 seven-bit character codes for the UTF-7
   codec; indexed by character code and consulted through the SPECIAL()
   macro.  Each row below covers 16 consecutive codes. */
static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
	   0 - not special
	   1 - special
	   2 - whitespace (optional)
	   3 - RFC2152 Set O (optional) */
    /* 0x00 - 0x0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
1646
/* Note: The comparison (c) <= 0 is a trick to work around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true  */

/* SPECIAL(c, encodeO, encodeWS): non-zero when c must not appear
   directly in UTF-7 text: c is outside 1..127, or utf7_special[c] is 1,
   or it is 2 (whitespace) and encodeWS is set, or it is 3 (Set O) and
   encodeO is set.  Evaluates c several times. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 alphabet character for the low 6 bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* B64CHAR(c): true when c is alphanumeric (per ISALNUM), '+' or '/'. */
#define B64CHAR(c) \
    (ISALNUM(c) || (c) == '+' || (c) == '/')
/* UB64(c): numeric value of base64 character c ('A'..'Z' -> 0..25,
   'a'..'z' -> 26..51, any other code via c + 4 so '0'..'9' -> 52..61,
   '+' -> 62, '/' -> 63).  Performs no validation of c itself. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least 6 bits are pending in ch, write
   the top 6 pending bits to *out as a base64 character and reduce bits
   by 6.  Modifies out and bits.  Expands to a bare while statement. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* DECODE(out, ch, bits, surrogate): while at least 16 bits are pending
   in ch, take the top 16 pending bits as one code unit and reduce bits
   by 16.  A unit in 0xDC00..0xDFFF sets surrogate, assigns errmsg and
   jumps to the label utf7Error; the unit that follows while surrogate is
   set is dropped and clears the flag; any other unit is stored to *out.
   The expanding scope must therefore provide `errmsg` and `utf7Error`.
   NOTE(review): the in-macro comment speaks of a high surrogate while
   the range tested is 0xDC00..0xDFFF -- confirm which is intended. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
1689
1690PyObject *PyUnicode_DecodeUTF7(const char *s,
1691			       Py_ssize_t size,
1692			       const char *errors)
1693{
1694    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1695}
1696
/* Decode size bytes of UTF-7 data at s into a str object.

   errors   - name of the error handler, passed on to
              unicode_decode_call_errorhandler().
   consumed - if NULL, a shift sequence still open at the end of the
              input is reported as "unterminated shift sequence".  If
              non-NULL, no such error is raised; *consumed receives the
              offset of the '+' that opened the pending shift sequence,
              or the number of bytes processed when no sequence is open.

   The output object is created with room for size code units and is
   trimmed to the number of units actually written before returning.
   Returns NULL with an exception set on failure. */
PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
			       Py_ssize_t size,
			       const char *errors,
			       Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;          /* offset of the current '+' or bad byte */
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;                  /* write pointer into the output */
    const char *errmsg = "";
    int inShift = 0;                /* non-zero while inside a '+...' sequence */
    unsigned int bitsleft = 0;      /* number of pending bits in charsleft */
    unsigned long charsleft = 0;    /* accumulator for decoded base64 bits */
    int surrogate = 0;              /* state flag maintained by DECODE() */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    p = unicode->str;
    e = s + size;

    while (s < e) {
        Py_UNICODE ch;
        /* Re-entry point used after the "unterminated shift sequence"
           handler below moved s back into the input. */
        restart:
        ch = (unsigned char) *s;

        if (inShift) {
            if ((ch == '-') || !B64CHAR(ch)) {
                /* End of the shift sequence: explicit '-' or any byte
                   outside the base64 alphabet. */
                inShift = 0;
                s++;

                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
                if (bitsleft >= 6) {
                    /* The shift sequence has a partial character in it. If
                       bitsleft < 6 then we could just classify it as padding
                       but that is not the case here */

                    errmsg = "partial character in shift sequence";
                    goto utf7Error;
                }
                /* According to RFC2152 the remaining bits should be zero. We
                   choose to signal an error/insert a replacement character
                   here to indicate the potential of a misencoded character. */

                /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
                if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
                    errmsg = "non-zero padding bits in shift sequence";
                    goto utf7Error;
                }

                if (ch == '-') {
                    /* The terminating '-' itself produces no output.  If
                       another '-' follows, a '-' is written and the shift
                       state is re-entered without consuming that byte. */
                    if ((s < e) && (*(s) == '-')) {
                        *p++ = '-';
                        inShift = 1;
                    }
                } else if (SPECIAL(ch,0,0)) {
                    errmsg = "unexpected special character";
	                goto utf7Error;
                } else  {
                    /* The implicit terminator is ordinary text. */
                    *p++ = ch;
                }
            } else {
                /* Base64 digit: shift its 6 bits in and emit any
                   complete 16-bit units. */
                charsleft = (charsleft << 6) | UB64(ch);
                bitsleft += 6;
                s++;
                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
            }
        }
        else if ( ch == '+' ) {
            startinpos = s-starts;
            s++;
            if (s < e && *s == '-') {
                /* "+-" stands for a literal '+'. */
                s++;
                *p++ = '+';
            } else
            {
                inShift = 1;
                bitsleft = 0;
            }
        }
        else if (SPECIAL(ch,0,0)) {
            startinpos = s-starts;
            errmsg = "unexpected special character";
            s++;
            goto utf7Error;
        }
        else {
            /* Directly encoded character. */
            *p++ = ch;
            s++;
        }
        continue;
    utf7Error:
        /* Let the error handler supply replacement text and the position
           at which to resume; it may update starts, e, s, unicode and p. */
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler(
             errors, &errorHandler,
             "utf7", errmsg,
             &starts, &e, &startinpos, &endinpos, &exc, &s,
             (PyObject **)&unicode, &outpos, &p))
        goto onError;
    }

    /* Input exhausted while still in a shift sequence: an error unless
       the caller decodes incrementally (consumed != NULL). */
    if (inShift && !consumed) {
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = size;
        if (unicode_decode_call_errorhandler(
             errors, &errorHandler,
             "utf7", "unterminated shift sequence",
             &starts, &e, &startinpos, &endinpos, &exc, &s,
             (PyObject **)&unicode, &outpos, &p))
            goto onError;
        /* The handler may have repositioned s inside the input. */
        if (s < e)
           goto restart;
    }
    if (consumed) {
        if(inShift)
            *consumed = startinpos;
        else
            *consumed = s-starts;
    }

    /* Trim the output to the number of code units written. */
    if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
1842
1843
1844PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1845                   Py_ssize_t size,
1846                   int encodeSetO,
1847                   int encodeWhiteSpace,
1848                   const char *errors)
1849{
1850    PyObject *v, *result;
1851    /* It might be possible to tighten this worst case */
1852    Py_ssize_t cbAllocated = 5 * size;
1853    int inShift = 0;
1854    Py_ssize_t i = 0;
1855    unsigned int bitsleft = 0;
1856    unsigned long charsleft = 0;
1857    char * out;
1858    char * start;
1859
1860    if (size == 0)
1861       return PyBytes_FromStringAndSize(NULL, 0);
1862
1863    v = PyByteArray_FromStringAndSize(NULL, cbAllocated);
1864    if (v == NULL)
1865        return NULL;
1866
1867    start = out = PyByteArray_AS_STRING(v);
1868    for (;i < size; ++i) {
1869        Py_UNICODE ch = s[i];
1870
1871        if (!inShift) {
1872            if (ch == '+') {
1873                *out++ = '+';
1874                *out++ = '-';
1875            } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1876                charsleft = ch;
1877                bitsleft = 16;
1878                *out++ = '+';
1879                /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1880                inShift = bitsleft > 0;
1881            } else {
1882                *out++ = (char) ch;
1883            }
1884        } else {
1885            if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1886                *out++ = B64(charsleft << (6-bitsleft));
1887                charsleft = 0;
1888                bitsleft = 0;
1889                /* Characters not in the BASE64 set implicitly unshift the sequence
1890                   so no '-' is required, except if the character is itself a '-' */
1891                if (B64CHAR(ch) || ch == '-') {
1892                    *out++ = '-';
1893                }
1894                inShift = 0;
1895                *out++ = (char) ch;
1896            } else {
1897                bitsleft += 16;
1898                charsleft = (charsleft << 16) | ch;
1899                /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1900
1901                /* If the next character is special then we dont' need to terminate
1902                   the shift sequence. If the next character is not a BASE64 character
1903                   or '-' then the shift sequence will be terminated implicitly and we
1904                   don't have to insert a '-'. */
1905
1906                if (bitsleft == 0) {
1907                    if (i + 1 < size) {
1908                        Py_UNICODE ch2 = s[i+1];
1909
1910                        if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
1911
1912                        } else if (B64CHAR(ch2) || ch2 == '-') {
1913                            *out++ = '-';
1914                            inShift = 0;
1915                        } else {
1916                            inShift = 0;
1917                        }
1918
1919                    }
1920                    else {
1921                        *out++ = '-';
1922                        inShift = 0;
1923                    }
1924                }
1925            }
1926        }
1927    }
1928    if (bitsleft) {
1929        *out++= B64(charsleft << (6-bitsleft) );
1930        *out++ = '-';
1931    }
1932
1933    result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), out - start);
1934    Py_DECREF(v);
1935    return result;
1936}
1937
/* The helper macros above are only needed by the UTF-7 codec; remove
   them so they do not leak into the rest of the file. */
#undef SPECIAL
#undef B64
#undef B64CHAR
#undef UB64
#undef ENCODE
#undef DECODE
1944
1945/* --- UTF-8 Codec -------------------------------------------------------- */
1946
/* Lookup table indexed by the first byte of a UTF-8 sequence.  Each row
   below covers 16 consecutive byte values. */
static
char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details */
    /* 0x00 - 0x7F: length 1 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: 0, not a valid prefix byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: length 2 */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: length 3 */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xF7: 4, 0xF8 - 0xFB: 5, 0xFC - 0xFD: 6, 0xFE - 0xFF: 0 */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1968
1969PyObject *PyUnicode_DecodeUTF8(const char *s,
1970			       Py_ssize_t size,
1971			       const char *errors)
1972{
1973    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1974}
1975
1976PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1977			                Py_ssize_t size,
1978			                const char *errors,
1979			                Py_ssize_t *consumed)
1980{
1981    const char *starts = s;
1982    int n;
1983    Py_ssize_t startinpos;
1984    Py_ssize_t endinpos;
1985    Py_ssize_t outpos;
1986    const char *e;
1987    PyUnicodeObject *unicode;
1988    Py_UNICODE *p;
1989    const char *errmsg = "";
1990    PyObject *errorHandler = NULL;
1991    PyObject *exc = NULL;
1992
1993    /* Note: size will always be longer than the resulting Unicode
1994       character count */
1995    unicode = _PyUnicode_New(size);
1996    if (!unicode)
1997        return NULL;
1998    if (size == 0) {
1999        if (consumed)
2000            *consumed = 0;
2001        return (PyObject *)unicode;
2002    }
2003
2004    /* Unpack UTF-8 encoded data */
2005    p = unicode->str;
2006    e = s + size;
2007
2008    while (s < e) {
2009        Py_UCS4 ch = (unsigned char)*s;
2010
2011        if (ch < 0x80) {
2012            *p++ = (Py_UNICODE)ch;
2013            s++;
2014            continue;
2015        }
2016
2017        n = utf8_code_length[ch];
2018
2019        if (s + n > e) {
2020	    if (consumed)
2021		break;
2022	    else {
2023		errmsg = "unexpected end of data";
2024		startinpos = s-starts;
2025		endinpos = size;
2026		goto utf8Error;
2027	    }
2028	}
2029
2030        switch (n) {
2031
2032        case 0:
2033            errmsg = "unexpected code byte";
2034	    startinpos = s-starts;
2035	    endinpos = startinpos+1;
2036	    goto utf8Error;
2037
2038        case 1:
2039            errmsg = "internal error";
2040	    startinpos = s-starts;
2041	    endinpos = startinpos+1;
2042	    goto utf8Error;
2043
2044        case 2:
2045            if ((s[1] & 0xc0) != 0x80) {
2046                errmsg = "invalid data";
2047		startinpos = s-starts;
2048		endinpos = startinpos+2;
2049		goto utf8Error;
2050	    }
2051            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
2052            if (ch < 0x80) {
2053		startinpos = s-starts;
2054		endinpos = startinpos+2;
2055                errmsg = "illegal encoding";
2056		goto utf8Error;
2057	    }
2058	    else
2059		*p++ = (Py_UNICODE)ch;
2060            break;
2061
2062        case 3:
2063            if ((s[1] & 0xc0) != 0x80 ||
2064                (s[2] & 0xc0) != 0x80) {
2065                errmsg = "invalid data";
2066		startinpos = s-starts;
2067		endinpos = startinpos+3;
2068		goto utf8Error;
2069	    }
2070            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
2071            if (ch < 0x0800) {
2072		/* Note: UTF-8 encodings of surrogates are considered
2073		   legal UTF-8 sequences;
2074
2075		   XXX For wide builds (UCS-4) we should probably try
2076		       to recombine the surrogates into a single code
2077		       unit.
2078		*/
2079                errmsg = "illegal encoding";
2080		startinpos = s-starts;
2081		endinpos = startinpos+3;
2082		goto utf8Error;
2083	    }
2084	    else
2085		*p++ = (Py_UNICODE)ch;
2086            break;
2087
2088        case 4:
2089            if ((s[1] & 0xc0) != 0x80 ||
2090                (s[2] & 0xc0) != 0x80 ||
2091                (s[3] & 0xc0) != 0x80) {
2092                errmsg = "invalid data";
2093		startinpos = s-starts;
2094		endinpos = startinpos+4;
2095		goto utf8Error;
2096	    }
2097            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2098                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2099            /* validate and convert to UTF-16 */
2100            if ((ch < 0x10000)        /* minimum value allowed for 4
2101					 byte encoding */
2102                || (ch > 0x10ffff))   /* maximum value allowed for
2103					 UTF-16 */
2104	    {
2105                errmsg = "illegal encoding";
2106		startinpos = s-starts;
2107		endinpos = startinpos+4;
2108		goto utf8Error;
2109	    }
2110#ifdef Py_UNICODE_WIDE
2111	    *p++ = (Py_UNICODE)ch;
2112#else
2113            /*  compute and append the two surrogates: */
2114
2115            /*  translate from 10000..10FFFF to 0..FFFF */
2116            ch -= 0x10000;
2117
2118            /*  high surrogate = top 10 bits added to D800 */
2119            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
2120
2121            /*  low surrogate = bottom 10 bits added to DC00 */
2122            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
2123#endif
2124            break;
2125
2126        default:
2127            /* Other sizes are only needed for UCS-4 */
2128            errmsg = "unsupported Unicode code range";
2129	    startinpos = s-starts;
2130	    endinpos = startinpos+n;
2131	    goto utf8Error;
2132        }
2133        s += n;
2134	continue;
2135
2136    utf8Error:
2137    outpos = p-PyUnicode_AS_UNICODE(unicode);
2138    if (unicode_decode_call_errorhandler(
2139	     errors, &errorHandler,
2140	     "utf8", errmsg,
2141	     &starts, &e, &startinpos, &endinpos, &exc, &s,
2142	     (PyObject **)&unicode, &outpos, &p))
2143	goto onError;
2144    }
2145    if (consumed)
2146	*consumed = s-starts;
2147
2148    /* Adjust length */
2149    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2150        goto onError;
2151
2152    Py_XDECREF(errorHandler);
2153    Py_XDECREF(exc);
2154    return (PyObject *)unicode;
2155
2156onError:
2157    Py_XDECREF(errorHandler);
2158    Py_XDECREF(exc);
2159    Py_DECREF(unicode);
2160    return NULL;
2161}
2162
2163/* Allocation strategy:  if the string is short, convert into a stack buffer
2164   and allocate exactly as much space needed at the end.  Else allocate the
2165   maximum possible needed (4 result bytes per Unicode character), and return
2166   the excess memory at the end.
2167*/
2168PyObject *
2169PyUnicode_EncodeUTF8(const Py_UNICODE *s,
2170		     Py_ssize_t size,
2171		     const char *errors)
2172{
2173#define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
2174
2175    Py_ssize_t i;                /* index into s of next input byte */
2176    PyObject *result;            /* result string object */
2177    char *p;                     /* next free byte in output buffer */
2178    Py_ssize_t nallocated;      /* number of result bytes allocated */
2179    Py_ssize_t nneeded;            /* number of result bytes needed */
2180    char stackbuf[MAX_SHORT_UNICHARS * 4];
2181
2182    assert(s != NULL);
2183    assert(size >= 0);
2184
2185    if (size <= MAX_SHORT_UNICHARS) {
2186        /* Write into the stack buffer; nallocated can't overflow.
2187         * At the end, we'll allocate exactly as much heap space as it
2188         * turns out we need.
2189         */
2190        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2191        result = NULL;   /* will allocate after we're done */
2192        p = stackbuf;
2193    }
2194    else {
2195        /* Overallocate on the heap, and give the excess back at the end. */
2196        nallocated = size * 4;
2197        if (nallocated / 4 != size)  /* overflow! */
2198            return PyErr_NoMemory();
2199        result = PyBytes_FromStringAndSize(NULL, nallocated);
2200        if (result == NULL)
2201            return NULL;
2202        p = PyBytes_AS_STRING(result);
2203    }
2204
2205    for (i = 0; i < size;) {
2206        Py_UCS4 ch = s[i++];
2207
2208        if (ch < 0x80)
2209            /* Encode ASCII */
2210            *p++ = (char) ch;
2211
2212        else if (ch < 0x0800) {
2213            /* Encode Latin-1 */
2214            *p++ = (char)(0xc0 | (ch >> 6));
2215            *p++ = (char)(0x80 | (ch & 0x3f));
2216        }
2217        else {
2218            /* Encode UCS2 Unicode ordinals */
2219            if (ch < 0x10000) {
2220                /* Special case: check for high surrogate */
2221                if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2222                    Py_UCS4 ch2 = s[i];
2223                    /* Check for low surrogate and combine the two to
2224                       form a UCS4 value */
2225                    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2226                        ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2227                        i++;
2228                        goto encodeUCS4;
2229                    }
2230                    /* Fall through: handles isolated high surrogates */
2231                }
2232                *p++ = (char)(0xe0 | (ch >> 12));
2233                *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2234                *p++ = (char)(0x80 | (ch & 0x3f));
2235                continue;
2236    	    }
2237encodeUCS4:
2238            /* Encode UCS4 Unicode ordinals */
2239            *p++ = (char)(0xf0 | (ch >> 18));
2240            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2241            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2242            *p++ = (char)(0x80 | (ch & 0x3f));
2243        }
2244    }
2245
2246    if (result == NULL) {
2247        /* This was stack allocated. */
2248        nneeded = p - stackbuf;
2249        assert(nneeded <= nallocated);
2250        result = PyBytes_FromStringAndSize(stackbuf, nneeded);
2251    }
2252    else {
2253        /* Cut back to size actually needed. */
2254        nneeded = p - PyBytes_AS_STRING(result);
2255        assert(nneeded <= nallocated);
2256        _PyBytes_Resize(&result, nneeded);
2257    }
2258    return result;
2259
2260#undef MAX_SHORT_UNICHARS
2261}
2262
2263PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2264{
2265    if (!PyUnicode_Check(unicode)) {
2266        PyErr_BadArgument();
2267        return NULL;
2268    }
2269    return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2270				PyUnicode_GET_SIZE(unicode),
2271				NULL);
2272}
2273
2274/* --- UTF-32 Codec ------------------------------------------------------- */
2275
2276PyObject *
2277PyUnicode_DecodeUTF32(const char *s,
2278		      Py_ssize_t size,
2279		      const char *errors,
2280		      int *byteorder)
2281{
2282    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2283}
2284
2285PyObject *
2286PyUnicode_DecodeUTF32Stateful(const char *s,
2287			      Py_ssize_t size,
2288			      const char *errors,
2289			      int *byteorder,
2290			      Py_ssize_t *consumed)
2291{
2292    const char *starts = s;
2293    Py_ssize_t startinpos;
2294    Py_ssize_t endinpos;
2295    Py_ssize_t outpos;
2296    PyUnicodeObject *unicode;
2297    Py_UNICODE *p;
2298#ifndef Py_UNICODE_WIDE
2299    int i, pairs;
2300#else
2301    const int pairs = 0;
2302#endif
2303    const unsigned char *q, *e;
2304    int bo = 0;       /* assume native ordering by default */
2305    const char *errmsg = "";
2306    /* Offsets from q for retrieving bytes in the right order. */
2307#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2308    int iorder[] = {0, 1, 2, 3};
2309#else
2310    int iorder[] = {3, 2, 1, 0};
2311#endif
2312    PyObject *errorHandler = NULL;
2313    PyObject *exc = NULL;
2314    /* On narrow builds we split characters outside the BMP into two
2315       codepoints => count how much extra space we need. */
2316#ifndef Py_UNICODE_WIDE
2317    for (i = pairs = 0; i < size/4; i++)
2318	if (((Py_UCS4 *)s)[i] >= 0x10000)
2319	    pairs++;
2320#endif
2321
2322    /* This might be one to much, because of a BOM */
2323    unicode = _PyUnicode_New((size+3)/4+pairs);
2324    if (!unicode)
2325        return NULL;
2326    if (size == 0)
2327        return (PyObject *)unicode;
2328
2329    /* Unpack UTF-32 encoded data */
2330    p = unicode->str;
2331    q = (unsigned char *)s;
2332    e = q + size;
2333
2334    if (byteorder)
2335        bo = *byteorder;
2336
2337    /* Check for BOM marks (U+FEFF) in the input and adjust current
2338       byte order setting accordingly. In native mode, the leading BOM
2339       mark is skipped, in all other modes, it is copied to the output
2340       stream as-is (giving a ZWNBSP character). */
2341    if (bo == 0) {
2342        if (size >= 4) {
2343            const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2344                                (q[iorder[1]] << 8) | q[iorder[0]];
2345#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2346	    if (bom == 0x0000FEFF) {
2347		q += 4;
2348		bo = -1;
2349	    }
2350	    else if (bom == 0xFFFE0000) {
2351		q += 4;
2352		bo = 1;
2353	    }
2354#else
2355	    if (bom == 0x0000FEFF) {
2356		q += 4;
2357		bo = 1;
2358	    }
2359	    else if (bom == 0xFFFE0000) {
2360		q += 4;
2361		bo = -1;
2362	    }
2363#endif
2364	}
2365    }
2366
2367    if (bo == -1) {
2368        /* force LE */
2369        iorder[0] = 0;
2370        iorder[1] = 1;
2371        iorder[2] = 2;
2372        iorder[3] = 3;
2373    }
2374    else if (bo == 1) {
2375        /* force BE */
2376        iorder[0] = 3;
2377        iorder[1] = 2;
2378        iorder[2] = 1;
2379        iorder[3] = 0;
2380    }
2381
2382    while (q < e) {
2383	Py_UCS4 ch;
2384	/* remaining bytes at the end? (size should be divisible by 4) */
2385	if (e-q<4) {
2386	    if (consumed)
2387		break;
2388	    errmsg = "truncated data";
2389	    startinpos = ((const char *)q)-starts;
2390	    endinpos = ((const char *)e)-starts;
2391	    goto utf32Error;
2392	    /* The remaining input chars are ignored if the callback
2393	       chooses to skip the input */
2394	}
2395	ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2396	     (q[iorder[1]] << 8) | q[iorder[0]];
2397
2398	if (ch >= 0x110000)
2399	{
2400	    errmsg = "codepoint not in range(0x110000)";
2401	    startinpos = ((const char *)q)-starts;
2402	    endinpos = startinpos+4;
2403	    goto utf32Error;
2404	}
2405#ifndef Py_UNICODE_WIDE
2406	if (ch >= 0x10000)
2407	{
2408	    *p++ = 0xD800 | ((ch-0x10000) >> 10);
2409	    *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2410	}
2411	else
2412#endif
2413	    *p++ = ch;
2414	q += 4;
2415	continue;
2416    utf32Error:
2417	outpos = p-PyUnicode_AS_UNICODE(unicode);
2418	if (unicode_decode_call_errorhandler(
2419	         errors, &errorHandler,
2420	         "utf32", errmsg,
2421	         &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2422	         (PyObject **)&unicode, &outpos, &p))
2423	    goto onError;
2424    }
2425
2426    if (byteorder)
2427        *byteorder = bo;
2428
2429    if (consumed)
2430	*consumed = (const char *)q-starts;
2431
2432    /* Adjust length */
2433    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2434        goto onError;
2435
2436    Py_XDECREF(errorHandler);
2437    Py_XDECREF(exc);
2438    return (PyObject *)unicode;
2439
2440onError:
2441    Py_DECREF(unicode);
2442    Py_XDECREF(errorHandler);
2443    Py_XDECREF(exc);
2444    return NULL;
2445}
2446
2447PyObject *
2448PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2449		      Py_ssize_t size,
2450		      const char *errors,
2451		      int byteorder)
2452{
2453    PyObject *v, *result;
2454    unsigned char *p;
2455#ifndef Py_UNICODE_WIDE
2456    int i, pairs;
2457#else
2458    const int pairs = 0;
2459#endif
2460    /* Offsets from p for storing byte pairs in the right order. */
2461#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2462    int iorder[] = {0, 1, 2, 3};
2463#else
2464    int iorder[] = {3, 2, 1, 0};
2465#endif
2466
2467#define STORECHAR(CH)                       \
2468    do {                                    \
2469        p[iorder[3]] = ((CH) >> 24) & 0xff; \
2470        p[iorder[2]] = ((CH) >> 16) & 0xff; \
2471        p[iorder[1]] = ((CH) >> 8) & 0xff;  \
2472        p[iorder[0]] = (CH) & 0xff;         \
2473        p += 4;                             \
2474    } while(0)
2475
2476    /* In narrow builds we can output surrogate pairs as one codepoint,
2477       so we need less space. */
2478#ifndef Py_UNICODE_WIDE
2479    for (i = pairs = 0; i < size-1; i++)
2480	if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2481	    0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2482	    pairs++;
2483#endif
2484    v = PyByteArray_FromStringAndSize(NULL,
2485		  4 * (size - pairs + (byteorder == 0)));
2486    if (v == NULL)
2487        return NULL;
2488
2489    p = (unsigned char *)PyByteArray_AS_STRING(v);
2490    if (byteorder == 0)
2491	STORECHAR(0xFEFF);
2492    if (size == 0)
2493        goto done;
2494
2495    if (byteorder == -1) {
2496        /* force LE */
2497        iorder[0] = 0;
2498        iorder[1] = 1;
2499        iorder[2] = 2;
2500        iorder[3] = 3;
2501    }
2502    else if (byteorder == 1) {
2503        /* force BE */
2504        iorder[0] = 3;
2505        iorder[1] = 2;
2506        iorder[2] = 1;
2507        iorder[3] = 0;
2508    }
2509
2510    while (size-- > 0) {
2511	Py_UCS4 ch = *s++;
2512#ifndef Py_UNICODE_WIDE
2513	if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2514	    Py_UCS4 ch2 = *s;
2515	    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2516		ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2517		s++;
2518		size--;
2519	    }
2520	}
2521#endif
2522        STORECHAR(ch);
2523    }
2524
2525  done:
2526    result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2527    Py_DECREF(v);
2528    return result;
2529#undef STORECHAR
2530}
2531
2532PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2533{
2534    if (!PyUnicode_Check(unicode)) {
2535        PyErr_BadArgument();
2536        return NULL;
2537    }
2538    return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2539				 PyUnicode_GET_SIZE(unicode),
2540				 NULL,
2541				 0);
2542}
2543
2544/* --- UTF-16 Codec ------------------------------------------------------- */
2545
2546PyObject *
2547PyUnicode_DecodeUTF16(const char *s,
2548		      Py_ssize_t size,
2549		      const char *errors,
2550		      int *byteorder)
2551{
2552    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2553}
2554
2555PyObject *
2556PyUnicode_DecodeUTF16Stateful(const char *s,
2557			      Py_ssize_t size,
2558			      const char *errors,
2559			      int *byteorder,
2560			      Py_ssize_t *consumed)
2561{
2562    const char *starts = s;
2563    Py_ssize_t startinpos;
2564    Py_ssize_t endinpos;
2565    Py_ssize_t outpos;
2566    PyUnicodeObject *unicode;
2567    Py_UNICODE *p;
2568    const unsigned char *q, *e;
2569    int bo = 0;       /* assume native ordering by default */
2570    const char *errmsg = "";
2571    /* Offsets from q for retrieving byte pairs in the right order. */
2572#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2573    int ihi = 1, ilo = 0;
2574#else
2575    int ihi = 0, ilo = 1;
2576#endif
2577    PyObject *errorHandler = NULL;
2578    PyObject *exc = NULL;
2579
2580    /* Note: size will always be longer than the resulting Unicode
2581       character count */
2582    unicode = _PyUnicode_New(size);
2583    if (!unicode)
2584        return NULL;
2585    if (size == 0)
2586        return (PyObject *)unicode;
2587
2588    /* Unpack UTF-16 encoded data */
2589    p = unicode->str;
2590    q = (unsigned char *)s;
2591    e = q + size;
2592
2593    if (byteorder)
2594        bo = *byteorder;
2595
2596    /* Check for BOM marks (U+FEFF) in the input and adjust current
2597       byte order setting accordingly. In native mode, the leading BOM
2598       mark is skipped, in all other modes, it is copied to the output
2599       stream as-is (giving a ZWNBSP character). */
2600    if (bo == 0) {
2601        if (size >= 2) {
2602            const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
2603#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2604	    if (bom == 0xFEFF) {
2605		q += 2;
2606		bo = -1;
2607	    }
2608	    else if (bom == 0xFFFE) {
2609		q += 2;
2610		bo = 1;
2611	    }
2612#else
2613	    if (bom == 0xFEFF) {
2614		q += 2;
2615		bo = 1;
2616	    }
2617	    else if (bom == 0xFFFE) {
2618		q += 2;
2619		bo = -1;
2620	    }
2621#endif
2622	}
2623    }
2624
2625    if (bo == -1) {
2626        /* force LE */
2627        ihi = 1;
2628        ilo = 0;
2629    }
2630    else if (bo == 1) {
2631        /* force BE */
2632        ihi = 0;
2633        ilo = 1;
2634    }
2635
2636    while (q < e) {
2637	Py_UNICODE ch;
2638	/* remaining bytes at the end? (size should be even) */
2639	if (e-q<2) {
2640	    if (consumed)
2641		break;
2642	    errmsg = "truncated data";
2643	    startinpos = ((const char *)q)-starts;
2644	    endinpos = ((const char *)e)-starts;
2645	    goto utf16Error;
2646	    /* The remaining input chars are ignored if the callback
2647	       chooses to skip the input */
2648	}
2649	ch = (q[ihi] << 8) | q[ilo];
2650
2651	q += 2;
2652
2653	if (ch < 0xD800 || ch > 0xDFFF) {
2654	    *p++ = ch;
2655	    continue;
2656	}
2657
2658	/* UTF-16 code pair: */
2659	if (q >= e) {
2660	    errmsg = "unexpected end of data";
2661	    startinpos = (((const char *)q)-2)-starts;
2662	    endinpos = ((const char *)e)-starts;
2663	    goto utf16Error;
2664	}
2665	if (0xD800 <= ch && ch <= 0xDBFF) {
2666	    Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2667	    q += 2;
2668	    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2669#ifndef Py_UNICODE_WIDE
2670		*p++ = ch;
2671		*p++ = ch2;
2672#else
2673		*p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2674#endif
2675		continue;
2676	    }
2677	    else {
2678                errmsg = "illegal UTF-16 surrogate";
2679		startinpos = (((const char *)q)-4)-starts;
2680		endinpos = startinpos+2;
2681		goto utf16Error;
2682	    }
2683
2684	}
2685	errmsg = "illegal encoding";
2686	startinpos = (((const char *)q)-2)-starts;
2687	endinpos = startinpos+2;
2688	/* Fall through to report the error */
2689
2690    utf16Error:
2691	outpos = p-PyUnicode_AS_UNICODE(unicode);
2692	if (unicode_decode_call_errorhandler(
2693	         errors, &errorHandler,
2694	         "utf16", errmsg,
2695	         &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2696	         (PyObject **)&unicode, &outpos, &p))
2697	    goto onError;
2698    }
2699
2700    if (byteorder)
2701        *byteorder = bo;
2702
2703    if (consumed)
2704	*consumed = (const char *)q-starts;
2705
2706    /* Adjust length */
2707    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2708        goto onError;
2709
2710    Py_XDECREF(errorHandler);
2711    Py_XDECREF(exc);
2712    return (PyObject *)unicode;
2713
2714onError:
2715    Py_DECREF(unicode);
2716    Py_XDECREF(errorHandler);
2717    Py_XDECREF(exc);
2718    return NULL;
2719}
2720
2721PyObject *
2722PyUnicode_EncodeUTF16(const Py_UNICODE *s,
2723		      Py_ssize_t size,
2724		      const char *errors,
2725		      int byteorder)
2726{
2727    PyObject *v, *result;
2728    unsigned char *p;
2729#ifdef Py_UNICODE_WIDE
2730    int i, pairs;
2731#else
2732    const int pairs = 0;
2733#endif
2734    /* Offsets from p for storing byte pairs in the right order. */
2735#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2736    int ihi = 1, ilo = 0;
2737#else
2738    int ihi = 0, ilo = 1;
2739#endif
2740
2741#define STORECHAR(CH)                   \
2742    do {                                \
2743        p[ihi] = ((CH) >> 8) & 0xff;    \
2744        p[ilo] = (CH) & 0xff;           \
2745        p += 2;                         \
2746    } while(0)
2747
2748#ifdef Py_UNICODE_WIDE
2749    for (i = pairs = 0; i < size; i++)
2750	if (s[i] >= 0x10000)
2751	    pairs++;
2752#endif
2753    v = PyByteArray_FromStringAndSize(NULL,
2754		  2 * (size + pairs + (byteorder == 0)));
2755    if (v == NULL)
2756        return NULL;
2757
2758    p = (unsigned char *)PyByteArray_AS_STRING(v);
2759    if (byteorder == 0)
2760	STORECHAR(0xFEFF);
2761    if (size == 0)
2762        goto done;
2763
2764    if (byteorder == -1) {
2765        /* force LE */
2766        ihi = 1;
2767        ilo = 0;
2768    }
2769    else if (byteorder == 1) {
2770        /* force BE */
2771        ihi = 0;
2772        ilo = 1;
2773    }
2774
2775    while (size-- > 0) {
2776	Py_UNICODE ch = *s++;
2777	Py_UNICODE ch2 = 0;
2778#ifdef Py_UNICODE_WIDE
2779	if (ch >= 0x10000) {
2780	    ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2781	    ch  = 0xD800 | ((ch-0x10000) >> 10);
2782	}
2783#endif
2784        STORECHAR(ch);
2785        if (ch2)
2786            STORECHAR(ch2);
2787    }
2788
2789  done:
2790    result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2791    Py_DECREF(v);
2792    return result;
2793#undef STORECHAR
2794}
2795
2796PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2797{
2798    if (!PyUnicode_Check(unicode)) {
2799        PyErr_BadArgument();
2800        return NULL;
2801    }
2802    return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2803				 PyUnicode_GET_SIZE(unicode),
2804				 NULL,
2805				 0);
2806}
2807
2808/* --- Unicode Escape Codec ----------------------------------------------- */
2809
/* Character-name lookup API exported by the unicodedata module as its
   "ucnhash_CAPI" attribute.  Starts out NULL;
   PyUnicode_DecodeUnicodeEscape() fills it in the first time it meets a
   \N escape and calls its getcode() member to resolve the name. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
2811
2812PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
2813					Py_ssize_t size,
2814					const char *errors)
2815{
2816    const char *starts = s;
2817    Py_ssize_t startinpos;
2818    Py_ssize_t endinpos;
2819    Py_ssize_t outpos;
2820    int i;
2821    PyUnicodeObject *v;
2822    Py_UNICODE *p;
2823    const char *end;
2824    char* message;
2825    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
2826    PyObject *errorHandler = NULL;
2827    PyObject *exc = NULL;
2828
2829    /* Escaped strings will always be longer than the resulting
2830       Unicode string, so we start with size here and then reduce the
2831       length after conversion to the true value.
2832       (but if the error callback returns a long replacement string
2833       we'll have to allocate more space) */
2834    v = _PyUnicode_New(size);
2835    if (v == NULL)
2836        goto onError;
2837    if (size == 0)
2838        return (PyObject *)v;
2839
2840    p = PyUnicode_AS_UNICODE(v);
2841    end = s + size;
2842
2843    while (s < end) {
2844        unsigned char c;
2845        Py_UNICODE x;
2846        int digits;
2847
2848        /* Non-escape characters are interpreted as Unicode ordinals */
2849        if (*s != '\\') {
2850            *p++ = (unsigned char) *s++;
2851            continue;
2852        }
2853
2854        startinpos = s-starts;
2855        /* \ - Escapes */
2856        s++;
2857        c = *s++;
2858        if (s > end)
2859            c = '\0'; /* Invalid after \ */
2860        switch (c) {
2861
2862        /* \x escapes */
2863        case '\n': break;
2864        case '\\': *p++ = '\\'; break;
2865        case '\'': *p++ = '\''; break;
2866        case '\"': *p++ = '\"'; break;
2867        case 'b': *p++ = '\b'; break;
2868        case 'f': *p++ = '\014'; break; /* FF */
2869        case 't': *p++ = '\t'; break;
2870        case 'n': *p++ = '\n'; break;
2871        case 'r': *p++ = '\r'; break;
2872        case 'v': *p++ = '\013'; break; /* VT */
2873        case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2874
2875        /* \OOO (octal) escapes */
2876        case '0': case '1': case '2': case '3':
2877        case '4': case '5': case '6': case '7':
2878            x = s[-1] - '0';
2879            if (s < end && '0' <= *s && *s <= '7') {
2880                x = (x<<3) + *s++ - '0';
2881                if (s < end && '0' <= *s && *s <= '7')
2882                    x = (x<<3) + *s++ - '0';
2883            }
2884            *p++ = x;
2885            break;
2886
2887        /* hex escapes */
2888        /* \xXX */
2889        case 'x':
2890            digits = 2;
2891            message = "truncated \\xXX escape";
2892            goto hexescape;
2893
2894        /* \uXXXX */
2895        case 'u':
2896            digits = 4;
2897            message = "truncated \\uXXXX escape";
2898            goto hexescape;
2899
2900        /* \UXXXXXXXX */
2901        case 'U':
2902            digits = 8;
2903            message = "truncated \\UXXXXXXXX escape";
2904        hexescape:
2905            chr = 0;
2906            outpos = p-PyUnicode_AS_UNICODE(v);
2907            if (s+digits>end) {
2908                endinpos = size;
2909                if (unicode_decode_call_errorhandler(
2910                    errors, &errorHandler,
2911                    "unicodeescape", "end of string in escape sequence",
2912                    &starts, &end, &startinpos, &endinpos, &exc, &s,
2913                    (PyObject **)&v, &outpos, &p))
2914                    goto onError;
2915                goto nextByte;
2916            }
2917            for (i = 0; i < digits; ++i) {
2918                c = (unsigned char) s[i];
2919                if (!ISXDIGIT(c)) {
2920                    endinpos = (s+i+1)-starts;
2921                    if (unicode_decode_call_errorhandler(
2922                        errors, &errorHandler,
2923                        "unicodeescape", message,
2924                        &starts, &end, &startinpos, &endinpos, &exc, &s,
2925                        (PyObject **)&v, &outpos, &p))
2926                        goto onError;
2927                    goto nextByte;
2928                }
2929                chr = (chr<<4) & ~0xF;
2930                if (c >= '0' && c <= '9')
2931                    chr += c - '0';
2932                else if (c >= 'a' && c <= 'f')
2933                    chr += 10 + c - 'a';
2934                else
2935                    chr += 10 + c - 'A';
2936            }
2937            s += i;
2938            if (chr == 0xffffffff && PyErr_Occurred())
2939                /* _decoding_error will have already written into the
2940                   target buffer. */
2941                break;
2942        store:
2943            /* when we get here, chr is a 32-bit unicode character */
2944            if (chr <= 0xffff)
2945                /* UCS-2 character */
2946                *p++ = (Py_UNICODE) chr;
2947            else if (chr <= 0x10ffff) {
2948                /* UCS-4 character. Either store directly, or as
2949                   surrogate pair. */
2950#ifdef Py_UNICODE_WIDE
2951                *p++ = chr;
2952#else
2953                chr -= 0x10000L;
2954                *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
2955                *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
2956#endif
2957            } else {
2958                endinpos = s-starts;
2959                outpos = p-PyUnicode_AS_UNICODE(v);
2960                if (unicode_decode_call_errorhandler(
2961                    errors, &errorHandler,
2962                    "unicodeescape", "illegal Unicode character",
2963                    &starts, &end, &startinpos, &endinpos, &exc, &s,
2964                    (PyObject **)&v, &outpos, &p))
2965                    goto onError;
2966            }
2967            break;
2968
2969        /* \N{name} */
2970        case 'N':
2971            message = "malformed \\N character escape";
2972            if (ucnhash_CAPI == NULL) {
2973                /* load the unicode data module */
2974                PyObject *m, *api;
2975                m = PyImport_ImportModuleNoBlock("unicodedata");
2976                if (m == NULL)
2977                    goto ucnhashError;
2978                api = PyObject_GetAttrString(m, "ucnhash_CAPI");
2979                Py_DECREF(m);
2980                if (api == NULL)
2981                    goto ucnhashError;
2982                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
2983                Py_DECREF(api);
2984                if (ucnhash_CAPI == NULL)
2985                    goto ucnhashError;
2986            }
2987            if (*s == '{') {
2988                const char *start = s+1;
2989                /* look for the closing brace */
2990                while (*s != '}' && s < end)
2991                    s++;
2992                if (s > start && s < end && *s == '}') {
2993                    /* found a name.  look it up in the unicode database */
2994                    message = "unknown Unicode character name";
2995                    s++;
2996                    if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
2997                        goto store;
2998                }
2999            }
3000            endinpos = s-starts;
3001            outpos = p-PyUnicode_AS_UNICODE(v);
3002            if (unicode_decode_call_errorhandler(
3003                errors, &errorHandler,
3004                "unicodeescape", message,
3005                &starts, &end, &startinpos, &endinpos, &exc, &s,
3006                (PyObject **)&v, &outpos, &p))
3007                goto onError;
3008            break;
3009
3010        default:
3011            if (s > end) {
3012                message = "\\ at end of string";
3013                s--;
3014                endinpos = s-starts;
3015                outpos = p-PyUnicode_AS_UNICODE(v);
3016                if (unicode_decode_call_errorhandler(
3017                    errors, &errorHandler,
3018                    "unicodeescape", message,
3019                    &starts, &end, &startinpos, &endinpos, &exc, &s,
3020                    (PyObject **)&v, &outpos, &p))
3021                    goto onError;
3022            }
3023            else {
3024                *p++ = '\\';
3025                *p++ = (unsigned char)s[-1];
3026            }
3027            break;
3028        }
3029        nextByte:
3030        ;
3031    }
3032    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3033        goto onError;
3034    Py_XDECREF(errorHandler);
3035    Py_XDECREF(exc);
3036    return (PyObject *)v;
3037
3038ucnhashError:
3039    PyErr_SetString(
3040        PyExc_UnicodeError,
3041        "\\N escapes not supported (can't load unicodedata module)"
3042        );
3043    Py_XDECREF(v);
3044    Py_XDECREF(errorHandler);
3045    Py_XDECREF(exc);
3046    return NULL;
3047
3048onError:
3049    Py_XDECREF(v);
3050    Py_XDECREF(errorHandler);
3051    Py_XDECREF(exc);
3052    return NULL;
3053}
3054
3055/* Return a Unicode-Escape string version of the Unicode object.
3056
3057   If quotes is true, the string is enclosed in u"" or u'' quotes as
3058   appropriate.
3059
3060*/
3061
3062Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
3063                                      Py_ssize_t size,
3064                                      Py_UNICODE ch)
3065{
3066    /* like wcschr, but doesn't stop at NULL characters */
3067
3068    while (size-- > 0) {
3069        if (*s == ch)
3070            return s;
3071        s++;
3072    }
3073
3074    return NULL;
3075}
3076
/* Lowercase hexadecimal digits; the escape encoders in this file index it
   with a 4-bit nibble value (0..15) to produce one output character. */
3077static const char *hexdigits = "0123456789abcdef";
3078
3079PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3080					Py_ssize_t size)
3081{
3082    PyObject *repr, *result;
3083    char *p;
3084
3085    /* XXX(nnorwitz): rather than over-allocating, it would be
3086       better to choose a different scheme.  Perhaps scan the
3087       first N-chars of the string and allocate based on that size.
3088    */
3089    /* Initial allocation is based on the longest-possible unichr
3090       escape.
3091
3092       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3093       unichr, so in this case it's the longest unichr escape. In
3094       narrow (UTF-16) builds this is five chars per source unichr
3095       since there are two unichrs in the surrogate pair, so in narrow
3096       (UTF-16) builds it's not the longest unichr escape.
3097
3098       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3099       so in the narrow (UTF-16) build case it's the longest unichr
3100       escape.
3101    */
3102
3103    repr = PyByteArray_FromStringAndSize(NULL,
3104#ifdef Py_UNICODE_WIDE
3105        + 10*size
3106#else
3107        + 6*size
3108#endif
3109        + 1);
3110    if (repr == NULL)
3111        return NULL;
3112
3113    p = PyByteArray_AS_STRING(repr);
3114
3115    while (size-- > 0) {
3116        Py_UNICODE ch = *s++;
3117
3118        /* Escape backslashes */
3119        if (ch == '\\') {
3120            *p++ = '\\';
3121            *p++ = (char) ch;
3122            continue;
3123        }
3124
3125#ifdef Py_UNICODE_WIDE
3126        /* Map 21-bit characters to '\U00xxxxxx' */
3127        else if (ch >= 0x10000) {
3128            *p++ = '\\';
3129            *p++ = 'U';
3130            *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3131            *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3132            *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3133            *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3134            *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3135            *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3136            *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3137            *p++ = hexdigits[ch & 0x0000000F];
3138	    continue;
3139        }
3140#else
3141	/* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3142	else if (ch >= 0xD800 && ch < 0xDC00) {
3143	    Py_UNICODE ch2;
3144	    Py_UCS4 ucs;
3145
3146	    ch2 = *s++;
3147	    size--;
3148	    if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3149		ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3150		*p++ = '\\';
3151		*p++ = 'U';
3152		*p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3153		*p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3154		*p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3155		*p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3156		*p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3157		*p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3158		*p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3159		*p++ = hexdigits[ucs & 0x0000000F];
3160		continue;
3161	    }
3162	    /* Fall through: isolated surrogates are copied as-is */
3163	    s--;
3164	    size++;
3165	}
3166#endif
3167
3168        /* Map 16-bit characters to '\uxxxx' */
3169        if (ch >= 256) {
3170            *p++ = '\\';
3171            *p++ = 'u';
3172            *p++ = hexdigits[(ch >> 12) & 0x000F];
3173            *p++ = hexdigits[(ch >> 8) & 0x000F];
3174            *p++ = hexdigits[(ch >> 4) & 0x000F];
3175            *p++ = hexdigits[ch & 0x000F];
3176        }
3177
3178        /* Map special whitespace to '\t', \n', '\r' */
3179        else if (ch == '\t') {
3180            *p++ = '\\';
3181            *p++ = 't';
3182        }
3183        else if (ch == '\n') {
3184            *p++ = '\\';
3185            *p++ = 'n';
3186        }
3187        else if (ch == '\r') {
3188            *p++ = '\\';
3189            *p++ = 'r';
3190        }
3191
3192        /* Map non-printable US ASCII to '\xhh' */
3193        else if (ch < ' ' || ch >= 0x7F) {
3194            *p++ = '\\';
3195            *p++ = 'x';
3196            *p++ = hexdigits[(ch >> 4) & 0x000F];
3197            *p++ = hexdigits[ch & 0x000F];
3198        }
3199
3200        /* Copy everything else as-is */
3201        else
3202            *p++ = (char) ch;
3203    }
3204
3205    result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr),
3206                                        p - PyByteArray_AS_STRING(repr));
3207    Py_DECREF(repr);
3208    return result;
3209}
3210
3211PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3212{
3213    PyObject *s, *result;
3214    if (!PyUnicode_Check(unicode)) {
3215        PyErr_BadArgument();
3216        return NULL;
3217    }
3218    s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3219                                      PyUnicode_GET_SIZE(unicode));
3220
3221    if (!s)
3222        return NULL;
3223    result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(s),
3224                                        PyByteArray_GET_SIZE(s));
3225    Py_DECREF(s);
3226    return result;
3227}
3228
3229/* --- Raw Unicode Escape Codec ------------------------------------------- */
3230
3231PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
3232					   Py_ssize_t size,
3233					   const char *errors)
3234{
3235    const char *starts = s;
3236    Py_ssize_t startinpos;
3237    Py_ssize_t endinpos;
3238    Py_ssize_t outpos;
3239    PyUnicodeObject *v;
3240    Py_UNICODE *p;
3241    const char *end;
3242    const char *bs;
3243    PyObject *errorHandler = NULL;
3244    PyObject *exc = NULL;
3245
3246    /* Escaped strings will always be longer than the resulting
3247       Unicode string, so we start with size here and then reduce the
3248       length after conversion to the true value. (But decoding error
3249       handler might have to resize the string) */
3250    v = _PyUnicode_New(size);
3251    if (v == NULL)
3252	goto onError;
3253    if (size == 0)
3254	return (PyObject *)v;
3255    p = PyUnicode_AS_UNICODE(v);
3256    end = s + size;
3257    while (s < end) {
3258	unsigned char c;
3259	Py_UCS4 x;
3260	int i;
3261        int count;
3262
3263	/* Non-escape characters are interpreted as Unicode ordinals */
3264	if (*s != '\\') {
3265	    *p++ = (unsigned char)*s++;
3266	    continue;
3267	}
3268	startinpos = s-starts;
3269
3270	/* \u-escapes are only interpreted iff the number of leading
3271	   backslashes if odd */
3272	bs = s;
3273	for (;s < end;) {
3274	    if (*s != '\\')
3275		break;
3276	    *p++ = (unsigned char)*s++;
3277	}
3278	if (((s - bs) & 1) == 0 ||
3279	    s >= end ||
3280	    (*s != 'u' && *s != 'U')) {
3281	    continue;
3282	}
3283	p--;
3284        count = *s=='u' ? 4 : 8;
3285	s++;
3286
3287	/* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3288	outpos = p-PyUnicode_AS_UNICODE(v);
3289	for (x = 0, i = 0; i < count; ++i, ++s) {
3290	    c = (unsigned char)*s;
3291	    if (!ISXDIGIT(c)) {
3292		endinpos = s-starts;
3293		if (unicode_decode_call_errorhandler(
3294		    errors, &errorHandler,
3295		    "rawunicodeescape", "truncated \\uXXXX",
3296		    &starts, &end, &startinpos, &endinpos, &exc, &s,
3297		    (PyObject **)&v, &outpos, &p))
3298		    goto onError;
3299		goto nextByte;
3300	    }
3301	    x = (x<<4) & ~0xF;
3302	    if (c >= '0' && c <= '9')
3303		x += c - '0';
3304	    else if (c >= 'a' && c <= 'f')
3305		x += 10 + c - 'a';
3306	    else
3307		x += 10 + c - 'A';
3308	}
3309        if (x <= 0xffff)
3310                /* UCS-2 character */
3311                *p++ = (Py_UNICODE) x;
3312        else if (x <= 0x10ffff) {
3313                /* UCS-4 character. Either store directly, or as
3314                   surrogate pair. */
3315#ifdef Py_UNICODE_WIDE
3316                *p++ = (Py_UNICODE) x;
3317#else
3318                x -= 0x10000L;
3319                *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3320                *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3321#endif
3322        } else {
3323            endinpos = s-starts;
3324            outpos = p-PyUnicode_AS_UNICODE(v);
3325            if (unicode_decode_call_errorhandler(
3326                    errors, &errorHandler,
3327                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
3328		    &starts, &end, &startinpos, &endinpos, &exc, &s,
3329		    (PyObject **)&v, &outpos, &p))
3330		    goto onError;
3331        }
3332	nextByte:
3333	;
3334    }
3335    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3336	goto onError;
3337    Py_XDECREF(errorHandler);
3338    Py_XDECREF(exc);
3339    return (PyObject *)v;
3340
3341 onError:
3342    Py_XDECREF(v);
3343    Py_XDECREF(errorHandler);
3344    Py_XDECREF(exc);
3345    return NULL;
3346}
3347
3348PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
3349					   Py_ssize_t size)
3350{
3351    PyObject *repr, *result;
3352    char *p;
3353    char *q;
3354
3355#ifdef Py_UNICODE_WIDE
3356    repr = PyByteArray_FromStringAndSize(NULL, 10 * size);
3357#else
3358    repr = PyByteArray_FromStringAndSize(NULL, 6 * size);
3359#endif
3360    if (repr == NULL)
3361        return NULL;
3362    if (size == 0)
3363        goto done;
3364
3365    p = q = PyByteArray_AS_STRING(repr);
3366    while (size-- > 0) {
3367        Py_UNICODE ch = *s++;
3368#ifdef Py_UNICODE_WIDE
3369	/* Map 32-bit characters to '\Uxxxxxxxx' */
3370	if (ch >= 0x10000) {
3371            *p++ = '\\';
3372            *p++ = 'U';
3373            *p++ = hexdigits[(ch >> 28) & 0xf];
3374            *p++ = hexdigits[(ch >> 24) & 0xf];
3375            *p++ = hexdigits[(ch >> 20) & 0xf];
3376            *p++ = hexdigits[(ch >> 16) & 0xf];
3377            *p++ = hexdigits[(ch >> 12) & 0xf];
3378            *p++ = hexdigits[(ch >> 8) & 0xf];
3379            *p++ = hexdigits[(ch >> 4) & 0xf];
3380            *p++ = hexdigits[ch & 15];
3381        }
3382        else
3383#else
3384	/* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3385	if (ch >= 0xD800 && ch < 0xDC00) {
3386	    Py_UNICODE ch2;
3387	    Py_UCS4 ucs;
3388
3389	    ch2 = *s++;
3390	    size--;
3391	    if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3392		ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3393		*p++ = '\\';
3394		*p++ = 'U';
3395		*p++ = hexdigits[(ucs >> 28) & 0xf];
3396		*p++ = hexdigits[(ucs >> 24) & 0xf];
3397		*p++ = hexdigits[(ucs >> 20) & 0xf];
3398		*p++ = hexdigits[(ucs >> 16) & 0xf];
3399		*p++ = hexdigits[(ucs >> 12) & 0xf];
3400		*p++ = hexdigits[(ucs >> 8) & 0xf];
3401		*p++ = hexdigits[(ucs >> 4) & 0xf];
3402		*p++ = hexdigits[ucs & 0xf];
3403		continue;
3404	    }
3405	    /* Fall through: isolated surrogates are copied as-is */
3406	    s--;
3407	    size++;
3408	}
3409#endif
3410	/* Map 16-bit characters to '\uxxxx' */
3411	if (ch >= 256) {
3412            *p++ = '\\';
3413            *p++ = 'u';
3414            *p++ = hexdigits[(ch >> 12) & 0xf];
3415            *p++ = hexdigits[(ch >> 8) & 0xf];
3416            *p++ = hexdigits[(ch >> 4) & 0xf];
3417            *p++ = hexdigits[ch & 15];
3418        }
3419	/* Copy everything else as-is */
3420	else
3421            *p++ = (char) ch;
3422    }
3423    size = p - q;
3424
3425  done:
3426    result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr), size);
3427    Py_DECREF(repr);
3428    return result;
3429}
3430
3431PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3432{
3433    PyObject *s, *result;
3434    if (!PyUnicode_Check(unicode)) {
3435        PyErr_BadArgument();
3436        return NULL;
3437    }
3438    s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3439                                         PyUnicode_GET_SIZE(unicode));
3440
3441    if (!s)
3442        return NULL;
3443    result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(s),
3444                                        PyByteArray_GET_SIZE(s));
3445    Py_DECREF(s);
3446    return result;
3447}
3448
3449/* --- Unicode Internal Codec ------------------------------------------- */
3450
3451PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
3452					   Py_ssize_t size,
3453					   const char *errors)
3454{
3455    const char *starts = s;
3456    Py_ssize_t startinpos;
3457    Py_ssize_t endinpos;
3458    Py_ssize_t outpos;
3459    PyUnicodeObject *v;
3460    Py_UNICODE *p;
3461    const char *end;
3462    const char *reason;
3463    PyObject *errorHandler = NULL;
3464    PyObject *exc = NULL;
3465
3466#ifdef Py_UNICODE_WIDE
3467    Py_UNICODE unimax = PyUnicode_GetMax();
3468#endif
3469
3470    /* XXX overflow detection missing */
3471    v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3472    if (v == NULL)
3473	goto onError;
3474    if (PyUnicode_GetSize((PyObject *)v) == 0)
3475	return (PyObject *)v;
3476    p = PyUnicode_AS_UNICODE(v);
3477    end = s + size;
3478
3479    while (s < end) {
3480        memcpy(p, s, sizeof(Py_UNICODE));
3481        /* We have to sanity check the raw data, otherwise doom looms for
3482           some malformed UCS-4 data. */
3483        if (
3484            #ifdef Py_UNICODE_WIDE
3485            *p > unimax || *p < 0 ||
3486            #endif
3487            end-s < Py_UNICODE_SIZE
3488            )
3489            {
3490            startinpos = s - starts;
3491            if (end-s < Py_UNICODE_SIZE) {
3492                endinpos = end-starts;
3493                reason = "truncated input";
3494            }
3495            else {
3496                endinpos = s - starts + Py_UNICODE_SIZE;
3497                reason = "illegal code point (> 0x10FFFF)";
3498            }
3499            outpos = p - PyUnicode_AS_UNICODE(v);
3500            if (unicode_decode_call_errorhandler(
3501                    errors, &errorHandler,
3502                    "unicode_internal", reason,
3503                    &starts, &end, &startinpos, &endinpos, &exc, &s,
3504                    (PyObject **)&v, &outpos, &p)) {
3505                goto onError;
3506            }
3507        }
3508        else {
3509            p++;
3510            s += Py_UNICODE_SIZE;
3511        }
3512    }
3513
3514    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3515        goto onError;
3516    Py_XDECREF(errorHandler);
3517    Py_XDECREF(exc);
3518    return (PyObject *)v;
3519
3520 onError:
3521    Py_XDECREF(v);
3522    Py_XDECREF(errorHandler);
3523    Py_XDECREF(exc);
3524    return NULL;
3525}
3526
3527/* --- Latin-1 Codec ------------------------------------------------------ */
3528
3529PyObject *PyUnicode_DecodeLatin1(const char *s,
3530				 Py_ssize_t size,
3531				 const char *errors)
3532{
3533    PyUnicodeObject *v;
3534    Py_UNICODE *p;
3535
3536    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3537    if (size == 1) {
3538	Py_UNICODE r = *(unsigned char*)s;
3539	return PyUnicode_FromUnicode(&r, 1);
3540    }
3541
3542    v = _PyUnicode_New(size);
3543    if (v == NULL)
3544	goto onError;
3545    if (size == 0)
3546	return (PyObject *)v;
3547    p = PyUnicode_AS_UNICODE(v);
3548    while (size-- > 0)
3549	*p++ = (unsigned char)*s++;
3550    return (PyObject *)v;
3551
3552 onError:
3553    Py_XDECREF(v);
3554    return NULL;
3555}
3556
3557/* create or adjust a UnicodeEncodeError */
3558static void make_encode_exception(PyObject **exceptionObject,
3559    const char *encoding,
3560    const Py_UNICODE *unicode, Py_ssize_t size,
3561    Py_ssize_t startpos, Py_ssize_t endpos,
3562    const char *reason)
3563{
3564    if (*exceptionObject == NULL) {
3565	*exceptionObject = PyUnicodeEncodeError_Create(
3566	    encoding, unicode, size, startpos, endpos, reason);
3567    }
3568    else {
3569	if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3570	    goto onError;
3571	if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3572	    goto onError;
3573	if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3574	    goto onError;
3575	return;
3576	onError:
3577	Py_DECREF(*exceptionObject);
3578	*exceptionObject = NULL;
3579    }
3580}
3581
3582/* raises a UnicodeEncodeError */
3583static void raise_encode_exception(PyObject **exceptionObject,
3584    const char *encoding,
3585    const Py_UNICODE *unicode, Py_ssize_t size,
3586    Py_ssize_t startpos, Py_ssize_t endpos,
3587    const char *reason)
3588{
3589    make_encode_exception(exceptionObject,
3590	encoding, unicode, size, startpos, endpos, reason);
3591    if (*exceptionObject != NULL)
3592	PyCodec_StrictErrors(*exceptionObject);
3593}
3594
3595/* error handling callback helper:
3596   build arguments, call the callback and check the arguments,
3597   put the result into newpos and return the replacement string, which
3598   has to be freed by the caller */
3599static PyObject *unicode_encode_call_errorhandler(const char *errors,
3600    PyObject **errorHandler,
3601    const char *encoding, const char *reason,
3602    const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3603    Py_ssize_t startpos, Py_ssize_t endpos,
3604    Py_ssize_t *newpos)
3605{
3606    static char *argparse = "O!n;encoding error handler must return (str, int) tuple";
3607
3608    PyObject *restuple;
3609    PyObject *resunicode;
3610
3611    if (*errorHandler == NULL) {
3612	*errorHandler = PyCodec_LookupError(errors);
3613        if (*errorHandler == NULL)
3614	    return NULL;
3615    }
3616
3617    make_encode_exception(exceptionObject,
3618	encoding, unicode, size, startpos, endpos, reason);
3619    if (*exceptionObject == NULL)
3620	return NULL;
3621
3622    restuple = PyObject_CallFunctionObjArgs(
3623	*errorHandler, *exceptionObject, NULL);
3624    if (restuple == NULL)
3625	return NULL;
3626    if (!PyTuple_Check(restuple)) {
3627	PyErr_Format(PyExc_TypeError, &argparse[4]);
3628	Py_DECREF(restuple);
3629	return NULL;
3630    }
3631    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3632	&resunicode, newpos)) {
3633	Py_DECREF(restuple);
3634	return NULL;
3635    }
3636    if (*newpos<0)
3637	*newpos = size+*newpos;
3638    if (*newpos<0 || *newpos>size) {
3639	PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3640	Py_DECREF(restuple);
3641	return NULL;
3642    }
3643    Py_INCREF(resunicode);
3644    Py_DECREF(restuple);
3645    return resunicode;
3646}
3647
/* Shared encoder behind the Latin-1 (limit == 256) and ASCII (limit == 128)
   codecs.

   Every character below `limit` is written as one byte.  A run of
   characters >= limit is handled according to `errors`: "strict" (also
   used when errors is NULL) raises, "replace" writes one '?' per
   character, "ignore" drops the run, "xmlcharrefreplace" writes "&#N;"
   references, and any other name is looked up as a registered error
   handler whose replacement text must itself be encodable.

   Returns a new bytes object, or NULL with an exception set. */
3648static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
3649				 Py_ssize_t size,
3650				 const char *errors,
3651				 int limit)
3652{
3653    /* output object */
3654    PyObject *res;
3655    /* pointers to the beginning and end+1 of input */
3656    const Py_UNICODE *startp = p;
3657    const Py_UNICODE *endp = p + size;
3658    /* pointer to the beginning of the unencodable characters */
3659    /* const Py_UNICODE *badp = NULL; */
3660    /* pointer into the output */
3661    char *str;
3662    /* size the output buffer currently has (grown when replacements need more) */
3663    Py_ssize_t ressize;
3664    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3665    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3666    PyObject *errorHandler = NULL;
3667    PyObject *exc = NULL;
    /* stays NULL on every error path, so the shared exit returns NULL */
3668    PyObject *result = NULL;
3669    /* the following variable is used for caching string comparisons
3670     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3671    int known_errorHandler = -1;
3672
3673    /* allocate enough for a simple encoding without
3674       replacements, if we need more, we'll resize */
3675    if (size == 0)
3676        return PyBytes_FromStringAndSize(NULL, 0);
3677    res = PyByteArray_FromStringAndSize(NULL, size);
3678    if (res == NULL)
3679        return NULL;
3680    str = PyByteArray_AS_STRING(res);
3681    ressize = size;
3682
3683    while (p<endp) {
3684	Py_UNICODE c = *p;
3685
3686	/* can we encode this? */
3687	if (c<limit) {
3688	    /* no overflow check, because we know that the space is enough */
3689	    *str++ = (char)c;
3690	    ++p;
3691	}
3692	else {
3693	    Py_ssize_t unicodepos = p-startp;
3694	    Py_ssize_t requiredsize;
3695	    PyObject *repunicode;
3696	    Py_ssize_t repsize;
3697	    Py_ssize_t newpos;
3698	    Py_ssize_t respos;
3699	    Py_UNICODE *uni2;
3700	    /* startpos for collecting unencodable chars */
3701	    const Py_UNICODE *collstart = p;
3702	    const Py_UNICODE *collend = p;
3703	    /* find all unencodable characters: [collstart, collend) is the run */
3704	    while ((collend < endp) && ((*collend)>=limit))
3705		++collend;
3706	    /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3707	    if (known_errorHandler==-1) {
3708		if ((errors==NULL) || (!strcmp(errors, "strict")))
3709		    known_errorHandler = 1;
3710		else if (!strcmp(errors, "replace"))
3711		    known_errorHandler = 2;
3712		else if (!strcmp(errors, "ignore"))
3713		    known_errorHandler = 3;
3714		else if (!strcmp(errors, "xmlcharrefreplace"))
3715		    known_errorHandler = 4;
3716		else
3717		    known_errorHandler = 0;
3718	    }
3719	    switch (known_errorHandler) {
3720		case 1: /* strict */
3721		    raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3722		    goto onError;
3723		case 2: /* replace */
		    /* one '?' per unencodable character; this never needs more
		       room than the run itself occupied */
3724		    while (collstart++<collend)
3725			*str++ = '?'; /* fall through */
3726		case 3: /* ignore */
3727		    p = collend;
3728		    break;
3729		case 4: /* xmlcharrefreplace */
3730		    respos = str - PyByteArray_AS_STRING(res);
3731		    /* determine replacement size (temporarily (mis)uses p) */
		    /* each reference is "&#" + decimal digits + ";" */
3732		    for (p = collstart, repsize = 0; p < collend; ++p) {
3733			if (*p<10)
3734			    repsize += 2+1+1;
3735			else if (*p<100)
3736			    repsize += 2+2+1;
3737			else if (*p<1000)
3738			    repsize += 2+3+1;
3739			else if (*p<10000)
3740			    repsize += 2+4+1;
3741#ifndef Py_UNICODE_WIDE
3742			else
3743			    repsize += 2+5+1;
3744#else
3745			else if (*p<100000)
3746			    repsize += 2+5+1;
3747			else if (*p<1000000)
3748			    repsize += 2+6+1;
3749			else
3750			    repsize += 2+7+1;
3751#endif
3752		    }
		    /* room for what is already written + the references +
		       one byte per remaining input character */
3753		    requiredsize = respos+repsize+(endp-collend);
3754		    if (requiredsize > ressize) {
			/* grow at least geometrically */
3755			if (requiredsize<2*ressize)
3756			    requiredsize = 2*ressize;
3757			if (PyByteArray_Resize(res, requiredsize))
3758			    goto onError;
			/* re-derive the write pointer from the buffer after the resize */
3759			str = PyByteArray_AS_STRING(res) + respos;
3760			ressize = requiredsize;
3761		    }
3762		    /* generate replacement (temporarily (mis)uses p) */
3763		    for (p = collstart; p < collend; ++p) {
3764			str += sprintf(str, "&#%d;", (int)*p);
3765		    }
3766		    p = collend;
3767		    break;
3768		default:
3769		    repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3770			encoding, reason, startp, size, &exc,
3771			collstart-startp, collend-startp, &newpos);
3772		    if (repunicode == NULL)
3773			goto onError;
3774		    /* need more space? (at least enough for what we
3775		       have+the replacement+the rest of the string, so
3776		       we won't have to check space for encodable characters) */
3777		    respos = str - PyByteArray_AS_STRING(res);
3778		    repsize = PyUnicode_GET_SIZE(repunicode);
3779		    requiredsize = respos+repsize+(endp-collend);
3780		    if (requiredsize > ressize) {
3781			if (requiredsize<2*ressize)
3782			    requiredsize = 2*ressize;
3783			if (PyByteArray_Resize(res, requiredsize)) {
3784			    Py_DECREF(repunicode);
3785			    goto onError;
3786			}
3787			str = PyByteArray_AS_STRING(res) + respos;
3788			ressize = requiredsize;
3789		    }
3790		    /* check if there is anything unencodable in the replacement
3791		       and copy it to the output */
3792		    for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3793			c = *uni2;
3794			if (c >= limit) {
3795			    raise_encode_exception(&exc, encoding, startp, size,
3796				unicodepos, unicodepos+1, reason);
3797			    Py_DECREF(repunicode);
3798			    goto onError;
3799			}
3800			*str = (char)c;
3801		    }
		    /* resume at the input position chosen by the handler */
3802		    p = startp + newpos;
3803		    Py_DECREF(repunicode);
3804	    }
3805	}
3806    }
    /* copy the bytes actually written into the bytes object to return */
3807    result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(res),
3808                                        str - PyByteArray_AS_STRING(res));
    /* shared exit: reached on success (result set above) and on errors */
3809  onError:
3810    Py_DECREF(res);
3811    Py_XDECREF(errorHandler);
3812    Py_XDECREF(exc);
3813    return result;
3814}
3815
3816PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
3817				 Py_ssize_t size,
3818				 const char *errors)
3819{
3820    return unicode_encode_ucs1(p, size, errors, 256);
3821}
3822
3823PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3824{
3825    if (!PyUnicode_Check(unicode)) {
3826	PyErr_BadArgument();
3827	return NULL;
3828    }
3829    return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3830				  PyUnicode_GET_SIZE(unicode),
3831				  NULL);
3832}
3833
3834/* --- 7-bit ASCII Codec -------------------------------------------------- */
3835
3836PyObject *PyUnicode_DecodeASCII(const char *s,
3837				Py_ssize_t size,
3838				const char *errors)
3839{
3840    const char *starts = s;
3841    PyUnicodeObject *v;
3842    Py_UNICODE *p;
3843    Py_ssize_t startinpos;
3844    Py_ssize_t endinpos;
3845    Py_ssize_t outpos;
3846    const char *e;
3847    PyObject *errorHandler = NULL;
3848    PyObject *exc = NULL;
3849
3850    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3851    if (size == 1 && *(unsigned char*)s < 128) {
3852	Py_UNICODE r = *(unsigned char*)s;
3853	return PyUnicode_FromUnicode(&r, 1);
3854    }
3855
3856    v = _PyUnicode_New(size);
3857    if (v == NULL)
3858	goto onError;
3859    if (size == 0)
3860	return (PyObject *)v;
3861    p = PyUnicode_AS_UNICODE(v);
3862    e = s + size;
3863    while (s < e) {
3864	register unsigned char c = (unsigned char)*s;
3865	if (c < 128) {
3866	    *p++ = c;
3867	    ++s;
3868	}
3869	else {
3870	    startinpos = s-starts;
3871	    endinpos = startinpos + 1;
3872	    outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3873	    if (unicode_decode_call_errorhandler(
3874		 errors, &errorHandler,
3875		 "ascii", "ordinal not in range(128)",
3876		 &starts, &e, &startinpos, &endinpos, &exc, &s,
3877		 (PyObject **)&v, &outpos, &p))
3878		goto onError;
3879	}
3880    }
3881    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
3882	if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3883	    goto onError;
3884    Py_XDECREF(errorHandler);
3885    Py_XDECREF(exc);
3886    return (PyObject *)v;
3887
3888 onError:
3889    Py_XDECREF(v);
3890    Py_XDECREF(errorHandler);
3891    Py_XDECREF(exc);
3892    return NULL;
3893}
3894
3895PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
3896				Py_ssize_t size,
3897				const char *errors)
3898{
3899    return unicode_encode_ucs1(p, size, errors, 128);
3900}
3901
3902PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3903{
3904    if (!PyUnicode_Check(unicode)) {
3905	PyErr_BadArgument();
3906	return NULL;
3907    }
3908    return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3909				 PyUnicode_GET_SIZE(unicode),
3910				 NULL);
3911}
3912
3913#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3914
3915/* --- MBCS codecs for Windows -------------------------------------------- */
3916
3917#if SIZEOF_INT < SIZEOF_SSIZE_T
3918#define NEED_RETRY
3919#endif
3920
3921/* XXX This code is limited to "true" double-byte encodings, as
3922   a) it assumes an incomplete character consists of a single byte, and
3923   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3924      encodings, see IsDBCSLeadByteEx documentation. */
3925
/* Return 1 if the byte at s[offset] is a DBCS lead byte, 0 otherwise.

   A byte that IsDBCSLeadByte() accepts only counts when CharPrev() shows
   it is not itself the trail byte of the preceding character. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3936
3937/*
3938 * Decode MBCS string into unicode object. If 'final' is set, converts
3939 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3940 */
3941static int decode_mbcs(PyUnicodeObject **v,
3942			const char *s, /* MBCS string */
3943			int size, /* sizeof MBCS string */
3944			int final)
3945{
3946    Py_UNICODE *p;
3947    Py_ssize_t n = 0;
3948    int usize = 0;
3949
3950    assert(size >= 0);
3951
3952    /* Skip trailing lead-byte unless 'final' is set */
3953    if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3954	--size;
3955
3956    /* First get the size of the result */
3957    if (size > 0) {
3958	usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3959	if (usize == 0) {
3960	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
3961	    return -1;
3962	}
3963    }
3964
3965    if (*v == NULL) {
3966	/* Create unicode object */
3967	*v = _PyUnicode_New(usize);
3968	if (*v == NULL)
3969	    return -1;
3970    }
3971    else {
3972	/* Extend unicode object */
3973	n = PyUnicode_GET_SIZE(*v);
3974	if (_PyUnicode_Resize(v, n + usize) < 0)
3975	    return -1;
3976    }
3977
3978    /* Do the conversion */
3979    if (size > 0) {
3980	p = PyUnicode_AS_UNICODE(*v) + n;
3981	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3982	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
3983	    return -1;
3984	}
3985    }
3986
3987    return size;
3988}
3989
3990PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3991					Py_ssize_t size,
3992					const char *errors,
3993					Py_ssize_t *consumed)
3994{
3995    PyUnicodeObject *v = NULL;
3996    int done;
3997
3998    if (consumed)
3999	*consumed = 0;
4000
4001#ifdef NEED_RETRY
4002  retry:
4003    if (size > INT_MAX)
4004	done = decode_mbcs(&v, s, INT_MAX, 0);
4005    else
4006#endif
4007	done = decode_mbcs(&v, s, (int)size, !consumed);
4008
4009    if (done < 0) {
4010        Py_XDECREF(v);
4011	return NULL;
4012    }
4013
4014    if (consumed)
4015	*consumed += done;
4016
4017#ifdef NEED_RETRY
4018    if (size > INT_MAX) {
4019	s += done;
4020	size -= done;
4021	goto retry;
4022    }
4023#endif
4024
4025    return (PyObject *)v;
4026}
4027
4028PyObject *PyUnicode_DecodeMBCS(const char *s,
4029				Py_ssize_t size,
4030				const char *errors)
4031{
4032    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4033}
4034
4035/*
4036 * Convert unicode into string object (MBCS).
4037 * Returns 0 if succeed, -1 otherwise.
4038 */
4039static int encode_mbcs(PyObject **repr,
4040			const Py_UNICODE *p, /* unicode */
4041			int size) /* size of unicode */
4042{
4043    int mbcssize = 0;
4044    Py_ssize_t n = 0;
4045
4046    assert(size >= 0);
4047
4048    /* First get the size of the result */
4049    if (size > 0) {
4050	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4051	if (mbcssize == 0) {
4052	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
4053	    return -1;
4054	}
4055    }
4056
4057    if (*repr == NULL) {
4058	/* Create string object */
4059	*repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4060	if (*repr == NULL)
4061	    return -1;
4062    }
4063    else {
4064	/* Extend string object */
4065	n = PyBytes_Size(*repr);
4066	if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4067	    return -1;
4068    }
4069
4070    /* Do the conversion */
4071    if (size > 0) {
4072	char *s = PyBytes_AS_STRING(*repr) + n;
4073	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4074	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
4075	    return -1;
4076	}
4077    }
4078
4079    return 0;
4080}
4081
4082PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
4083				Py_ssize_t size,
4084				const char *errors)
4085{
4086    PyObject *repr = NULL;
4087    int ret;
4088
4089#ifdef NEED_RETRY
4090 retry:
4091    if (size > INT_MAX)
4092	ret = encode_mbcs(&repr, p, INT_MAX);
4093    else
4094#endif
4095	ret = encode_mbcs(&repr, p, (int)size);
4096
4097    if (ret < 0) {
4098	Py_XDECREF(repr);
4099	return NULL;
4100    }
4101
4102#ifdef NEED_RETRY
4103    if (size > INT_MAX) {
4104	p += INT_MAX;
4105	size -= INT_MAX;
4106	goto retry;
4107    }
4108#endif
4109
4110    return repr;
4111}
4112
4113PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4114{
4115    if (!PyUnicode_Check(unicode)) {
4116        PyErr_BadArgument();
4117        return NULL;
4118    }
4119    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
4120				PyUnicode_GET_SIZE(unicode),
4121				NULL);
4122}
4123
4124#undef NEED_RETRY
4125
4126#endif /* MS_WINDOWS */
4127
4128/* --- Character Mapping Codec -------------------------------------------- */
4129
/* Decode `size` bytes starting at `s` into a unicode object using a
   character mapping.

   mapping == NULL delegates to PyUnicode_DecodeLatin1().  If mapping is
   an exact unicode object it is used as a lookup string indexed by the
   byte value; otherwise each byte is looked up with
   PyObject_GetItem(mapping, int) and the result may be an integer, None
   or a unicode string (see the branches below).  Undefined mappings are
   routed through unicode_decode_call_errorhandler() using `errors`.

   Returns the decoded object, or NULL on error. */
PyObject *PyUnicode_DecodeCharmap(const char *s,
				  Py_ssize_t size,
				  PyObject *mapping,
				  const char *errors)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *v;
    Py_UNICODE *p;
    /* spare output slots left over from the last 1-n resize */
    Py_ssize_t extrachars = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    Py_UNICODE *mapstring = NULL;
    Py_ssize_t maplen = 0;

    /* Default to Latin-1 */
    if (mapping == NULL)
	return PyUnicode_DecodeLatin1(s, size, errors);

    /* Start with one output slot per input byte; resized below if needed */
    v = _PyUnicode_New(size);
    if (v == NULL)
	goto onError;
    if (size == 0)
	return (PyObject *)v;
    p = PyUnicode_AS_UNICODE(v);
    e = s + size;
    if (PyUnicode_CheckExact(mapping)) {
	/* Fast path: the mapping is a plain string indexed by byte value */
	mapstring = PyUnicode_AS_UNICODE(mapping);
	maplen = PyUnicode_GET_SIZE(mapping);
	while (s < e) {
	    unsigned char ch = *s;
	    Py_UNICODE x = 0xfffe; /* illegal value */

	    /* bytes beyond the end of the string keep the 0xfffe marker */
	    if (ch < maplen)
		x = mapstring[ch];

	    if (x == 0xfffe) {
		/* undefined mapping */
		outpos = p-PyUnicode_AS_UNICODE(v);
		startinpos = s-starts;
		endinpos = startinpos+1;
		if (unicode_decode_call_errorhandler(
		     errors, &errorHandler,
		     "charmap", "character maps to <undefined>",
		     &starts, &e, &startinpos, &endinpos, &exc, &s,
		     (PyObject **)&v, &outpos, &p)) {
		    goto onError;
		}
		/* s and p were passed by address to the handler; do not advance here */
		continue;
	    }
	    *p++ = x;
	    ++s;
	}
    }
    else {
	/* Generic path: look every byte up through the mapping protocol */
	while (s < e) {
	    unsigned char ch = *s;
	    PyObject *w, *x;

	    /* Get mapping (char ordinal -> integer, Unicode char or None) */
	    w = PyLong_FromLong((long)ch);
	    if (w == NULL)
		goto onError;
	    x = PyObject_GetItem(mapping, w);
	    Py_DECREF(w);
	    if (x == NULL) {
		if (PyErr_ExceptionMatches(PyExc_LookupError)) {
		    /* No mapping found means: mapping is undefined. */
		    PyErr_Clear();
		    x = Py_None;
		    Py_INCREF(x);
		} else
		    goto onError;
	    }

	    /* Apply mapping */
	    if (PyLong_Check(x)) {
		/* integer result: must fit in 16 bits */
		long value = PyLong_AS_LONG(x);
		if (value < 0 || value > 65535) {
		    PyErr_SetString(PyExc_TypeError,
				    "character mapping must be in range(65536)");
		    Py_DECREF(x);
		    goto onError;
		}
		*p++ = (Py_UNICODE)value;
	    }
	    else if (x == Py_None) {
		/* undefined mapping */
		outpos = p-PyUnicode_AS_UNICODE(v);
		startinpos = s-starts;
		endinpos = startinpos+1;
		if (unicode_decode_call_errorhandler(
		     errors, &errorHandler,
		     "charmap", "character maps to <undefined>",
		     &starts, &e, &startinpos, &endinpos, &exc, &s,
		     (PyObject **)&v, &outpos, &p)) {
		    Py_DECREF(x);
		    goto onError;
		}
		Py_DECREF(x);
		continue;
	    }
	    else if (PyUnicode_Check(x)) {
		Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);

		if (targetsize == 1)
		    /* 1-1 mapping */
		    *p++ = *PyUnicode_AS_UNICODE(x);

		else if (targetsize > 1) {
		    /* 1-n mapping */
		    if (targetsize > extrachars) {
			/* resize first */
			Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
			/* grow by the shortfall plus 4*targetsize of headroom */
			Py_ssize_t needed = (targetsize - extrachars) + \
				     (targetsize << 2);
			extrachars += needed;
			/* XXX overflow detection missing */
			if (_PyUnicode_Resize(&v,
					     PyUnicode_GET_SIZE(v) + needed) < 0) {
			    Py_DECREF(x);
			    goto onError;
			}
			/* re-derive the write pointer from the saved offset */
			p = PyUnicode_AS_UNICODE(v) + oldpos;
		    }
		    Py_UNICODE_COPY(p,
				    PyUnicode_AS_UNICODE(x),
				    targetsize);
		    p += targetsize;
		    extrachars -= targetsize;
		}
		/* 1-0 mapping: skip the character */
	    }
	    else {
		/* wrong return value */
		PyErr_SetString(PyExc_TypeError,
		      "character mapping must return integer, None or str");
		Py_DECREF(x);
		goto onError;
	    }
	    Py_DECREF(x);
	    ++s;
	}
    }
    /* Trim the result if fewer characters were written than allocated */
    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
	if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
	    goto onError;
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)v;

 onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
}
4290
4291/* Charmap encoding: the lookup table */
4292
/* Three-level lookup trie built by PyUnicode_BuildEncodingMap() and read
   by encoding_map_lookup().  The object is allocated with extra room
   after the struct so that level23 can hold more than its declared one
   byte. */
struct encoding_map{
  PyObject_HEAD
  /* indexed by (c >> 11); 0xFF marks an absent level-2 block */
  unsigned char level1[32];
  /* number of 16-byte level-2 blocks and 128-byte level-3 blocks */
  int count2, count3;
  /* 16*count2 level-2 bytes (0xFF = absent) followed by 128*count3
     level-3 bytes (0 = unmapped, otherwise the encoded byte) */
  unsigned char level23[1];
};
4299
4300static PyObject*
4301encoding_map_size(PyObject *obj, PyObject* args)
4302{
4303    struct encoding_map *map = (struct encoding_map*)obj;
4304    return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4305                          128*map->count3);
4306}
4307
/* Method table for EncodingMap objects: a single no-argument method,
   size(), followed by the zeroed sentinel entry. */
static PyMethodDef encoding_map_methods[] = {
	{"size", encoding_map_size, METH_NOARGS,
         PyDoc_STR("Return the size (in bytes) of this object") },
        { 0 }
};
4313
4314static void
4315encoding_map_dealloc(PyObject* o)
4316{
4317	PyObject_FREE(o);
4318}
4319
/* Type object for the EncodingMap trie objects created by
   PyUnicode_BuildEncodingMap().  Only tp_dealloc and tp_methods (the
   size() method) are filled in; every other slot, including tp_new, is
   left at 0. */
static PyTypeObject EncodingMapType = {
	PyVarObject_HEAD_INIT(NULL, 0)
        "EncodingMap",          /*tp_name*/
        sizeof(struct encoding_map),   /*tp_basicsize*/
        0,                      /*tp_itemsize*/
        /* methods */
        encoding_map_dealloc,   /*tp_dealloc*/
        0,                      /*tp_print*/
        0,                      /*tp_getattr*/
        0,                      /*tp_setattr*/
        0,                      /*tp_compare*/
        0,                      /*tp_repr*/
        0,                      /*tp_as_number*/
        0,                      /*tp_as_sequence*/
        0,                      /*tp_as_mapping*/
        0,                      /*tp_hash*/
        0,                      /*tp_call*/
        0,                      /*tp_str*/
        0,                      /*tp_getattro*/
        0,                      /*tp_setattro*/
        0,                      /*tp_as_buffer*/
        Py_TPFLAGS_DEFAULT,     /*tp_flags*/
        0,                      /*tp_doc*/
        0,                      /*tp_traverse*/
        0,                      /*tp_clear*/
        0,                      /*tp_richcompare*/
        0,                      /*tp_weaklistoffset*/
        0,                      /*tp_iter*/
        0,                      /*tp_iternext*/
        encoding_map_methods,   /*tp_methods*/
        0,                      /*tp_members*/
        0,                      /*tp_getset*/
        0,                      /*tp_base*/
        0,                      /*tp_dict*/
        0,                      /*tp_descr_get*/
        0,                      /*tp_descr_set*/
        0,                      /*tp_dictoffset*/
        0,                      /*tp_init*/
        0,                      /*tp_alloc*/
        0,                      /*tp_new*/
        0,                      /*tp_free*/
        0,                      /*tp_is_gc*/
};
4363
4364PyObject*
4365PyUnicode_BuildEncodingMap(PyObject* string)
4366{
4367    Py_UNICODE *decode;
4368    PyObject *result;
4369    struct encoding_map *mresult;
4370    int i;
4371    int need_dict = 0;
4372    unsigned char level1[32];
4373    unsigned char level2[512];
4374    unsigned char *mlevel1, *mlevel2, *mlevel3;
4375    int count2 = 0, count3 = 0;
4376
4377    if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4378        PyErr_BadArgument();
4379        return NULL;
4380    }
4381    decode = PyUnicode_AS_UNICODE(string);
4382    memset(level1, 0xFF, sizeof level1);
4383    memset(level2, 0xFF, sizeof level2);
4384
4385    /* If there isn't a one-to-one mapping of NULL to \0,
4386       or if there are non-BMP characters, we need to use
4387       a mapping dictionary. */
4388    if (decode[0] != 0)
4389        need_dict = 1;
4390    for (i = 1; i < 256; i++) {
4391        int l1, l2;
4392        if (decode[i] == 0
4393            #ifdef Py_UNICODE_WIDE
4394            || decode[i] > 0xFFFF
4395            #endif
4396        ) {
4397            need_dict = 1;
4398            break;
4399        }
4400        if (decode[i] == 0xFFFE)
4401            /* unmapped character */
4402            continue;
4403        l1 = decode[i] >> 11;
4404        l2 = decode[i] >> 7;
4405        if (level1[l1] == 0xFF)
4406            level1[l1] = count2++;
4407        if (level2[l2] == 0xFF)
4408            level2[l2] = count3++;
4409    }
4410
4411    if (count2 >= 0xFF || count3 >= 0xFF)
4412        need_dict = 1;
4413
4414    if (need_dict) {
4415        PyObject *result = PyDict_New();
4416        PyObject *key, *value;
4417        if (!result)
4418            return NULL;
4419        for (i = 0; i < 256; i++) {
4420            key = value = NULL;
4421            key = PyLong_FromLong(decode[i]);
4422            value = PyLong_FromLong(i);
4423            if (!key || !value)
4424                goto failed1;
4425            if (PyDict_SetItem(result, key, value) == -1)
4426                goto failed1;
4427            Py_DECREF(key);
4428            Py_DECREF(value);
4429        }
4430        return result;
4431      failed1:
4432        Py_XDECREF(key);
4433        Py_XDECREF(value);
4434        Py_DECREF(result);
4435        return NULL;
4436    }
4437
4438    /* Create a three-level trie */
4439    result = PyObject_MALLOC(sizeof(struct encoding_map) +
4440                             16*count2 + 128*count3 - 1);
4441    if (!result)
4442        return PyErr_NoMemory();
4443    PyObject_Init(result, &EncodingMapType);
4444    mresult = (struct encoding_map*)result;
4445    mresult->count2 = count2;
4446    mresult->count3 = count3;
4447    mlevel1 = mresult->level1;
4448    mlevel2 = mresult->level23;
4449    mlevel3 = mresult->level23 + 16*count2;
4450    memcpy(mlevel1, level1, 32);
4451    memset(mlevel2, 0xFF, 16*count2);
4452    memset(mlevel3, 0, 128*count3);
4453    count3 = 0;
4454    for (i = 1; i < 256; i++) {
4455        int o1, o2, o3, i2, i3;
4456        if (decode[i] == 0xFFFE)
4457            /* unmapped character */
4458            continue;
4459        o1 = decode[i]>>11;
4460        o2 = (decode[i]>>7) & 0xF;
4461        i2 = 16*mlevel1[o1] + o2;
4462        if (mlevel2[i2] == 0xFF)
4463            mlevel2[i2] = count3++;
4464        o3 = decode[i] & 0x7F;
4465        i3 = 128*mlevel2[i2] + o3;
4466        mlevel3[i3] = i;
4467    }
4468    return result;
4469}
4470
4471static int
4472encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4473{
4474    struct encoding_map *map = (struct encoding_map*)mapping;
4475    int l1 = c>>11;
4476    int l2 = (c>>7) & 0xF;
4477    int l3 = c & 0x7F;
4478    int i;
4479
4480#ifdef Py_UNICODE_WIDE
4481    if (c > 0xFFFF) {
4482	return -1;
4483    }
4484#endif
4485    if (c == 0)
4486        return 0;
4487    /* level 1*/
4488    i = map->level1[l1];
4489    if (i == 0xFF) {
4490        return -1;
4491    }
4492    /* level 2*/
4493    i = map->level23[16*i+l2];
4494    if (i == 0xFF) {
4495        return -1;
4496    }
4497    /* level 3 */
4498    i = map->level23[16*map->count2 + 128*i + l3];
4499    if (i == 0) {
4500        return -1;
4501    }
4502    return i;
4503}
4504
4505/* Lookup the character ch in the mapping. If the character
4506   can't be found, Py_None is returned (or NULL, if another
4507   error occurred). */
4508static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
4509{
4510    PyObject *w = PyLong_FromLong((long)c);
4511    PyObject *x;
4512
4513    if (w == NULL)
4514	 return NULL;
4515    x = PyObject_GetItem(mapping, w);
4516    Py_DECREF(w);
4517    if (x == NULL) {
4518	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4519	    /* No mapping found means: mapping is undefined. */
4520	    PyErr_Clear();
4521	    x = Py_None;
4522	    Py_INCREF(x);
4523	    return x;
4524	} else
4525	    return NULL;
4526    }
4527    else if (x == Py_None)
4528	return x;
4529    else if (PyLong_Check(x)) {
4530	long value = PyLong_AS_LONG(x);
4531	if (value < 0 || value > 255) {
4532	    PyErr_SetString(PyExc_TypeError,
4533			     "character mapping must be in range(256)");
4534	    Py_DECREF(x);
4535	    return NULL;
4536	}
4537	return x;
4538    }
4539    else if (PyBytes_Check(x))
4540	return x;
4541    else {
4542	/* wrong return value */
4543	PyErr_Format(PyExc_TypeError,
4544                "character mapping must return integer, bytes or None, not %.400s",
4545                x->ob_type->tp_name);
4546	Py_DECREF(x);
4547	return NULL;
4548    }
4549}
4550
4551static int
4552charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4553{
4554	Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
4555	/* exponentially overallocate to minimize reallocations */
4556	if (requiredsize < 2*outsize)
4557	    requiredsize = 2*outsize;
4558	if (_PyBytes_Resize(outobj, requiredsize))
4559	    return -1;
4560	return 0;
4561}
4562
/* Outcome of charmapencode_output():
     enc_SUCCESS   - the character was encoded and written,
     enc_FAILED    - the mapping has no entry for the character,
     enc_EXCEPTION - the lookup or an output resize reported an error. */
typedef enum charmapencode_result {
  enc_SUCCESS, enc_FAILED, enc_EXCEPTION
}charmapencode_result;
4566/* lookup the character, put the result in the output string and adjust
4567   various state variables. Resize the output bytes object if not enough
4568   space is available. Return a new reference to the object that
4569   was put in the output buffer, or Py_None, if the mapping was undefined
4570   (in which case no character was written) or NULL, if a
4571   reallocation error occurred. The caller must decref the result */
4572static
4573charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
4574    PyObject **outobj, Py_ssize_t *outpos)
4575{
4576    PyObject *rep;
4577    char *outstart;
4578    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
4579
4580    if (Py_TYPE(mapping) == &EncodingMapType) {
4581        int res = encoding_map_lookup(c, mapping);
4582	Py_ssize_t requiredsize = *outpos+1;
4583        if (res == -1)
4584            return enc_FAILED;
4585	if (outsize<requiredsize)
4586	    if (charmapencode_resize(outobj, outpos, requiredsize))
4587		return enc_EXCEPTION;
4588        outstart = PyBytes_AS_STRING(*outobj);
4589	outstart[(*outpos)++] = (char)res;
4590	return enc_SUCCESS;
4591    }
4592
4593    rep = charmapencode_lookup(c, mapping);
4594    if (rep==NULL)
4595	return enc_EXCEPTION;
4596    else if (rep==Py_None) {
4597	Py_DECREF(rep);
4598	return enc_FAILED;
4599    } else {
4600	if (PyLong_Check(rep)) {
4601	    Py_ssize_t requiredsize = *outpos+1;
4602	    if (outsize<requiredsize)
4603		if (charmapencode_resize(outobj, outpos, requiredsize)) {
4604		    Py_DECREF(rep);
4605		    return enc_EXCEPTION;
4606		}
4607            outstart = PyBytes_AS_STRING(*outobj);
4608	    outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
4609	}
4610	else {
4611	    const char *repchars = PyBytes_AS_STRING(rep);
4612	    Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
4613	    Py_ssize_t requiredsize = *outpos+repsize;
4614	    if (outsize<requiredsize)
4615		if (charmapencode_resize(outobj, outpos, requiredsize)) {
4616		    Py_DECREF(rep);
4617		    return enc_EXCEPTION;
4618		}
4619            outstart = PyBytes_AS_STRING(*outobj);
4620	    memcpy(outstart + *outpos, repchars, repsize);
4621	    *outpos += repsize;
4622	}
4623    }
4624    Py_DECREF(rep);
4625    return enc_SUCCESS;
4626}
4627
4628/* handle an error in PyUnicode_EncodeCharmap
4629   Return 0 on success, -1 on error */
4630static
4631int charmap_encoding_error(
4632    const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
4633    PyObject **exceptionObject,
4634    int *known_errorHandler, PyObject **errorHandler, const char *errors,
4635    PyObject **res, Py_ssize_t *respos)
4636{
4637    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4638    Py_ssize_t repsize;
4639    Py_ssize_t newpos;
4640    Py_UNICODE *uni2;
4641    /* startpos for collecting unencodable chars */
4642    Py_ssize_t collstartpos = *inpos;
4643    Py_ssize_t collendpos = *inpos+1;
4644    Py_ssize_t collpos;
4645    char *encoding = "charmap";
4646    char *reason = "character maps to <undefined>";
4647    charmapencode_result x;
4648
4649    /* find all unencodable characters */
4650    while (collendpos < size) {
4651        PyObject *rep;
4652        if (Py_TYPE(mapping) == &EncodingMapType) {
4653	    int res = encoding_map_lookup(p[collendpos], mapping);
4654	    if (res != -1)
4655		break;
4656	    ++collendpos;
4657	    continue;
4658	}
4659
4660	rep = charmapencode_lookup(p[collendpos], mapping);
4661	if (rep==NULL)
4662	    return -1;
4663	else if (rep!=Py_None) {
4664	    Py_DECREF(rep);
4665	    break;
4666	}
4667	Py_DECREF(rep);
4668	++collendpos;
4669    }
4670    /* cache callback name lookup
4671     * (if not done yet, i.e. it's the first error) */
4672    if (*known_errorHandler==-1) {
4673	if ((errors==NULL) || (!strcmp(errors, "strict")))
4674	    *known_errorHandler = 1;
4675	else if (!strcmp(errors, "replace"))
4676	    *known_errorHandler = 2;
4677	else if (!strcmp(errors, "ignore"))
4678	    *known_errorHandler = 3;
4679	else if (!strcmp(errors, "xmlcharrefreplace"))
4680	    *known_errorHandler = 4;
4681	else
4682	    *known_errorHandler = 0;
4683    }
4684    switch (*known_errorHandler) {
4685	case 1: /* strict */
4686	    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4687	    return -1;
4688	case 2: /* replace */
4689	    for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4690		x = charmapencode_output('?', mapping, res, respos);
4691		if (x==enc_EXCEPTION) {
4692		    return -1;
4693		}
4694		else if (x==enc_FAILED) {
4695		    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4696		    return -1;
4697		}
4698	    }
4699	    /* fall through */
4700	case 3: /* ignore */
4701	    *inpos = collendpos;
4702	    break;
4703	case 4: /* xmlcharrefreplace */
4704	    /* generate replacement (temporarily (mis)uses p) */
4705	    for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4706		char buffer[2+29+1+1];
4707		char *cp;
4708		sprintf(buffer, "&#%d;", (int)p[collpos]);
4709		for (cp = buffer; *cp; ++cp) {
4710		    x = charmapencode_output(*cp, mapping, res, respos);
4711		    if (x==enc_EXCEPTION)
4712			return -1;
4713		    else if (x==enc_FAILED) {
4714			raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4715			return -1;
4716		    }
4717		}
4718	    }
4719	    *inpos = collendpos;
4720	    break;
4721	default:
4722	    repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
4723		encoding, reason, p, size, exceptionObject,
4724		collstartpos, collendpos, &newpos);
4725	    if (repunicode == NULL)
4726		return -1;
4727	    /* generate replacement  */
4728	    repsize = PyUnicode_GET_SIZE(repunicode);
4729	    for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4730		x = charmapencode_output(*uni2, mapping, res, respos);
4731		if (x==enc_EXCEPTION) {
4732		    return -1;
4733		}
4734		else if (x==enc_FAILED) {
4735		    Py_DECREF(repunicode);
4736		    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4737		    return -1;
4738		}
4739	    }
4740	    *inpos = newpos;
4741	    Py_DECREF(repunicode);
4742    }
4743    return 0;
4744}
4745
4746PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
4747				  Py_ssize_t size,
4748				  PyObject *mapping,
4749				  const char *errors)
4750{
4751    /* output object */
4752    PyObject *res = NULL;
4753    /* current input position */
4754    Py_ssize_t inpos = 0;
4755    /* current output position */
4756    Py_ssize_t respos = 0;
4757    PyObject *errorHandler = NULL;
4758    PyObject *exc = NULL;
4759    /* the following variable is used for caching string comparisons
4760     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4761     * 3=ignore, 4=xmlcharrefreplace */
4762    int known_errorHandler = -1;
4763
4764    /* Default to Latin-1 */
4765    if (mapping == NULL)
4766	return PyUnicode_EncodeLatin1(p, size, errors);
4767
4768    /* allocate enough for a simple encoding without
4769       replacements, if we need more, we'll resize */
4770    res = PyBytes_FromStringAndSize(NULL, size);
4771    if (res == NULL)
4772        goto onError;
4773    if (size == 0)
4774	return res;
4775
4776    while (inpos<size) {
4777	/* try to encode it */
4778	charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4779	if (x==enc_EXCEPTION) /* error */
4780	    goto onError;
4781	if (x==enc_FAILED) { /* unencodable character */
4782	    if (charmap_encoding_error(p, size, &inpos, mapping,
4783		&exc,
4784		&known_errorHandler, &errorHandler, errors,
4785		&res, &respos)) {
4786		goto onError;
4787	    }
4788	}
4789	else
4790	    /* done with this character => adjust input position */
4791	    ++inpos;
4792    }
4793
4794    /* Resize if we allocated to much */
4795    if (respos<PyBytes_GET_SIZE(res))
4796	_PyBytes_Resize(&res, respos);
4797
4798    Py_XDECREF(exc);
4799    Py_XDECREF(errorHandler);
4800    return res;
4801
4802    onError:
4803    Py_XDECREF(res);
4804    Py_XDECREF(exc);
4805    Py_XDECREF(errorHandler);
4806    return NULL;
4807}
4808
4809PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4810				    PyObject *mapping)
4811{
4812    if (!PyUnicode_Check(unicode) || mapping == NULL) {
4813	PyErr_BadArgument();
4814	return NULL;
4815    }
4816    return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4817				   PyUnicode_GET_SIZE(unicode),
4818				   mapping,
4819				   NULL);
4820}
4821
4822/* create or adjust a UnicodeTranslateError */
4823static void make_translate_exception(PyObject **exceptionObject,
4824    const Py_UNICODE *unicode, Py_ssize_t size,
4825    Py_ssize_t startpos, Py_ssize_t endpos,
4826    const char *reason)
4827{
4828    if (*exceptionObject == NULL) {
4829    	*exceptionObject = PyUnicodeTranslateError_Create(
4830	    unicode, size, startpos, endpos, reason);
4831    }
4832    else {
4833	if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4834	    goto onError;
4835	if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4836	    goto onError;
4837	if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4838	    goto onError;
4839	return;
4840	onError:
4841	Py_DECREF(*exceptionObject);
4842	*exceptionObject = NULL;
4843    }
4844}
4845
4846/* raises a UnicodeTranslateError */
4847static void raise_translate_exception(PyObject **exceptionObject,
4848    const Py_UNICODE *unicode, Py_ssize_t size,
4849    Py_ssize_t startpos, Py_ssize_t endpos,
4850    const char *reason)
4851{
4852    make_translate_exception(exceptionObject,
4853	unicode, size, startpos, endpos, reason);
4854    if (*exceptionObject != NULL)
4855	PyCodec_StrictErrors(*exceptionObject);
4856}
4857
4858/* error handling callback helper:
4859   build arguments, call the callback and check the arguments,
4860   put the result into newpos and return the replacement string, which
4861   has to be freed by the caller */
4862static PyObject *unicode_translate_call_errorhandler(const char *errors,
4863    PyObject **errorHandler,
4864    const char *reason,
4865    const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4866    Py_ssize_t startpos, Py_ssize_t endpos,
4867    Py_ssize_t *newpos)
4868{
4869    static char *argparse = "O!n;translating error handler must return (str, int) tuple";
4870
4871    Py_ssize_t i_newpos;
4872    PyObject *restuple;
4873    PyObject *resunicode;
4874
4875    if (*errorHandler == NULL) {
4876	*errorHandler = PyCodec_LookupError(errors);
4877        if (*errorHandler == NULL)
4878	    return NULL;
4879    }
4880
4881    make_translate_exception(exceptionObject,
4882	unicode, size, startpos, endpos, reason);
4883    if (*exceptionObject == NULL)
4884	return NULL;
4885
4886    restuple = PyObject_CallFunctionObjArgs(
4887	*errorHandler, *exceptionObject, NULL);
4888    if (restuple == NULL)
4889	return NULL;
4890    if (!PyTuple_Check(restuple)) {
4891	PyErr_Format(PyExc_TypeError, &argparse[4]);
4892	Py_DECREF(restuple);
4893	return NULL;
4894    }
4895    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
4896	&resunicode, &i_newpos)) {
4897	Py_DECREF(restuple);
4898	return NULL;
4899    }
4900    if (i_newpos<0)
4901	*newpos = size+i_newpos;
4902    else
4903        *newpos = i_newpos;
4904    if (*newpos<0 || *newpos>size) {
4905	PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4906	Py_DECREF(restuple);
4907	return NULL;
4908    }
4909    Py_INCREF(resunicode);
4910    Py_DECREF(restuple);
4911    return resunicode;
4912}
4913
4914/* Lookup the character ch in the mapping and put the result in result,
4915   which must be decrefed by the caller.
4916   Return 0 on success, -1 on error */
4917static
4918int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4919{
4920    PyObject *w = PyLong_FromLong((long)c);
4921    PyObject *x;
4922
4923    if (w == NULL)
4924	 return -1;
4925    x = PyObject_GetItem(mapping, w);
4926    Py_DECREF(w);
4927    if (x == NULL) {
4928	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4929	    /* No mapping found means: use 1:1 mapping. */
4930	    PyErr_Clear();
4931	    *result = NULL;
4932	    return 0;
4933	} else
4934	    return -1;
4935    }
4936    else if (x == Py_None) {
4937	*result = x;
4938	return 0;
4939    }
4940    else if (PyLong_Check(x)) {
4941	long value = PyLong_AS_LONG(x);
4942	long max = PyUnicode_GetMax();
4943	if (value < 0 || value > max) {
4944	    PyErr_Format(PyExc_TypeError,
4945                         "character mapping must be in range(0x%x)", max+1);
4946	    Py_DECREF(x);
4947	    return -1;
4948	}
4949	*result = x;
4950	return 0;
4951    }
4952    else if (PyUnicode_Check(x)) {
4953	*result = x;
4954	return 0;
4955    }
4956    else {
4957	/* wrong return value */
4958	PyErr_SetString(PyExc_TypeError,
4959	      "character mapping must return integer, None or str");
4960	Py_DECREF(x);
4961	return -1;
4962    }
4963}
4964/* ensure that *outobj is at least requiredsize characters long,
4965if not reallocate and adjust various state variables.
4966Return 0 on success, -1 on error */
4967static
4968int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
4969    Py_ssize_t requiredsize)
4970{
4971    Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
4972    if (requiredsize > oldsize) {
4973	/* remember old output position */
4974	Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4975	/* exponentially overallocate to minimize reallocations */
4976	if (requiredsize < 2 * oldsize)
4977	    requiredsize = 2 * oldsize;
4978	if (_PyUnicode_Resize(outobj, requiredsize) < 0)
4979	    return -1;
4980	*outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
4981    }
4982    return 0;
4983}
4984/* lookup the character, put the result in the output string and adjust
4985   various state variables. Return a new reference to the object that
4986   was put in the output buffer in *result, or Py_None, if the mapping was
4987   undefined (in which case no character was written).
4988   The called must decref result.
4989   Return 0 on success, -1 on error. */
4990static
4991int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
4992    Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4993    PyObject **res)
4994{
4995    if (charmaptranslate_lookup(*curinp, mapping, res))
4996	return -1;
4997    if (*res==NULL) {
4998	/* not found => default to 1:1 mapping */
4999	*(*outp)++ = *curinp;
5000    }
5001    else if (*res==Py_None)
5002	;
5003    else if (PyLong_Check(*res)) {
5004	/* no overflow check, because we know that the space is enough */
5005	*(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
5006    }
5007    else if (PyUnicode_Check(*res)) {
5008	Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5009	if (repsize==1) {
5010	    /* no overflow check, because we know that the space is enough */
5011	    *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5012	}
5013	else if (repsize!=0) {
5014	    /* more than one character */
5015	    Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5016		(insize - (curinp-startinp)) +
5017		repsize - 1;
5018	    if (charmaptranslate_makespace(outobj, outp, requiredsize))
5019		return -1;
5020	    memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5021	    *outp += repsize;
5022	}
5023    }
5024    else
5025	return -1;
5026    return 0;
5027}
5028
5029PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
5030				     Py_ssize_t size,
5031				     PyObject *mapping,
5032				     const char *errors)
5033{
5034    /* output object */
5035    PyObject *res = NULL;
5036    /* pointers to the beginning and end+1 of input */
5037    const Py_UNICODE *startp = p;
5038    const Py_UNICODE *endp = p + size;
5039    /* pointer into the output */
5040    Py_UNICODE *str;
5041    /* current output position */
5042    Py_ssize_t respos = 0;
5043    char *reason = "character maps to <undefined>";
5044    PyObject *errorHandler = NULL;
5045    PyObject *exc = NULL;
5046    /* the following variable is used for caching string comparisons
5047     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5048     * 3=ignore, 4=xmlcharrefreplace */
5049    int known_errorHandler = -1;
5050
5051    if (mapping == NULL) {
5052	PyErr_BadArgument();
5053	return NULL;
5054    }
5055
5056    /* allocate enough for a simple 1:1 translation without
5057       replacements, if we need more, we'll resize */
5058    res = PyUnicode_FromUnicode(NULL, size);
5059    if (res == NULL)
5060	goto onError;
5061    if (size == 0)
5062	return res;
5063    str = PyUnicode_AS_UNICODE(res);
5064
5065    while (p<endp) {
5066	/* try to encode it */
5067	PyObject *x = NULL;
5068	if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5069	    Py_XDECREF(x);
5070	    goto onError;
5071	}
5072	Py_XDECREF(x);
5073	if (x!=Py_None) /* it worked => adjust input pointer */
5074	    ++p;
5075	else { /* untranslatable character */
5076	    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5077	    Py_ssize_t repsize;
5078	    Py_ssize_t newpos;
5079	    Py_UNICODE *uni2;
5080	    /* startpos for collecting untranslatable chars */
5081	    const Py_UNICODE *collstart = p;
5082	    const Py_UNICODE *collend = p+1;
5083	    const Py_UNICODE *coll;
5084
5085	    /* find all untranslatable characters */
5086	    while (collend < endp) {
5087		if (charmaptranslate_lookup(*collend, mapping, &x))
5088		    goto onError;
5089		Py_XDECREF(x);
5090		if (x!=Py_None)
5091		    break;
5092		++collend;
5093	    }
5094	    /* cache callback name lookup
5095	     * (if not done yet, i.e. it's the first error) */
5096	    if (known_errorHandler==-1) {
5097		if ((errors==NULL) || (!strcmp(errors, "strict")))
5098		    known_errorHandler = 1;
5099		else if (!strcmp(errors, "replace"))
5100		    known_errorHandler = 2;
5101		else if (!strcmp(errors, "ignore"))
5102		    known_errorHandler = 3;
5103		else if (!strcmp(errors, "xmlcharrefreplace"))
5104		    known_errorHandler = 4;
5105		else
5106		    known_errorHandler = 0;
5107	    }
5108	    switch (known_errorHandler) {
5109		case 1: /* strict */
5110		    raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
5111		    goto onError;
5112		case 2: /* replace */
5113		    /* No need to check for space, this is a 1:1 replacement */
5114		    for (coll = collstart; coll<collend; ++coll)
5115			*str++ = '?';
5116		    /* fall through */
5117		case 3: /* ignore */
5118		    p = collend;
5119		    break;
5120		case 4: /* xmlcharrefreplace */
5121		    /* generate replacement (temporarily (mis)uses p) */
5122		    for (p = collstart; p < collend; ++p) {
5123			char buffer[2+29+1+1];
5124			char *cp;
5125			sprintf(buffer, "&#%d;", (int)*p);
5126			if (charmaptranslate_makespace(&res, &str,
5127			    (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5128			    goto onError;
5129			for (cp = buffer; *cp; ++cp)
5130			    *str++ = *cp;
5131		    }
5132		    p = collend;
5133		    break;
5134		default:
5135		    repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5136			reason, startp, size, &exc,
5137			collstart-startp, collend-startp, &newpos);
5138		    if (repunicode == NULL)
5139			goto onError;
5140		    /* generate replacement  */
5141		    repsize = PyUnicode_GET_SIZE(repunicode);
5142		    if (charmaptranslate_makespace(&res, &str,
5143			(str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5144			Py_DECREF(repunicode);
5145			goto onError;
5146		    }
5147		    for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5148			*str++ = *uni2;
5149		    p = startp + newpos;
5150		    Py_DECREF(repunicode);
5151	    }
5152	}
5153    }
    /* Resize if we allocated too much */
5155    respos = str-PyUnicode_AS_UNICODE(res);
5156    if (respos<PyUnicode_GET_SIZE(res)) {
5157	if (_PyUnicode_Resize(&res, respos) < 0)
5158	    goto onError;
5159    }
5160    Py_XDECREF(exc);
5161    Py_XDECREF(errorHandler);
5162    return res;
5163
5164    onError:
5165    Py_XDECREF(res);
5166    Py_XDECREF(exc);
5167    Py_XDECREF(errorHandler);
5168    return NULL;
5169}
5170
5171PyObject *PyUnicode_Translate(PyObject *str,
5172			      PyObject *mapping,
5173			      const char *errors)
5174{
5175    PyObject *result;
5176
5177    str = PyUnicode_FromObject(str);
5178    if (str == NULL)
5179	goto onError;
5180    result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5181					PyUnicode_GET_SIZE(str),
5182					mapping,
5183					errors);
5184    Py_DECREF(str);
5185    return result;
5186
5187 onError:
5188    Py_XDECREF(str);
5189    return NULL;
5190}
5191
5192/* --- Decimal Encoder ---------------------------------------------------- */
5193
5194int PyUnicode_EncodeDecimal(Py_UNICODE *s,
5195			    Py_ssize_t length,
5196			    char *output,
5197			    const char *errors)
5198{
5199    Py_UNICODE *p, *end;
5200    PyObject *errorHandler = NULL;
5201    PyObject *exc = NULL;
5202    const char *encoding = "decimal";
5203    const char *reason = "invalid decimal Unicode string";
5204    /* the following variable is used for caching string comparisons
5205     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5206    int known_errorHandler = -1;
5207
5208    if (output == NULL) {
5209	PyErr_BadArgument();
5210	return -1;
5211    }
5212
5213    p = s;
5214    end = s + length;
5215    while (p < end) {
5216	register Py_UNICODE ch = *p;
5217	int decimal;
5218	PyObject *repunicode;
5219	Py_ssize_t repsize;
5220	Py_ssize_t newpos;
5221	Py_UNICODE *uni2;
5222	Py_UNICODE *collstart;
5223	Py_UNICODE *collend;
5224
5225	if (Py_UNICODE_ISSPACE(ch)) {
5226	    *output++ = ' ';
5227	    ++p;
5228	    continue;
5229	}
5230	decimal = Py_UNICODE_TODECIMAL(ch);
5231	if (decimal >= 0) {
5232	    *output++ = '0' + decimal;
5233	    ++p;
5234	    continue;
5235	}
5236	if (0 < ch && ch < 256) {
5237	    *output++ = (char)ch;
5238	    ++p;
5239	    continue;
5240	}
5241	/* All other characters are considered unencodable */
5242	collstart = p;
5243	collend = p+1;
5244	while (collend < end) {
5245	    if ((0 < *collend && *collend < 256) ||
5246	        !Py_UNICODE_ISSPACE(*collend) ||
5247	        Py_UNICODE_TODECIMAL(*collend))
5248		break;
5249	}
5250	/* cache callback name lookup
5251	 * (if not done yet, i.e. it's the first error) */
5252	if (known_errorHandler==-1) {
5253	    if ((errors==NULL) || (!strcmp(errors, "strict")))
5254		known_errorHandler = 1;
5255	    else if (!strcmp(errors, "replace"))
5256		known_errorHandler = 2;
5257	    else if (!strcmp(errors, "ignore"))
5258		known_errorHandler = 3;
5259	    else if (!strcmp(errors, "xmlcharrefreplace"))
5260		known_errorHandler = 4;
5261	    else
5262		known_errorHandler = 0;
5263	}
5264	switch (known_errorHandler) {
5265	    case 1: /* strict */
5266		raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5267		goto onError;
5268	    case 2: /* replace */
5269		for (p = collstart; p < collend; ++p)
5270		    *output++ = '?';
5271		/* fall through */
5272	    case 3: /* ignore */
5273		p = collend;
5274		break;
5275	    case 4: /* xmlcharrefreplace */
5276		/* generate replacement (temporarily (mis)uses p) */
5277		for (p = collstart; p < collend; ++p)
5278		    output += sprintf(output, "&#%d;", (int)*p);
5279		p = collend;
5280		break;
5281	    default:
5282		repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5283		    encoding, reason, s, length, &exc,
5284		    collstart-s, collend-s, &newpos);
5285		if (repunicode == NULL)
5286		    goto onError;
5287		/* generate replacement  */
5288		repsize = PyUnicode_GET_SIZE(repunicode);
5289		for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5290		    Py_UNICODE ch = *uni2;
5291		    if (Py_UNICODE_ISSPACE(ch))
5292			*output++ = ' ';
5293		    else {
5294			decimal = Py_UNICODE_TODECIMAL(ch);
5295			if (decimal >= 0)
5296			    *output++ = '0' + decimal;
5297			else if (0 < ch && ch < 256)
5298			    *output++ = (char)ch;
5299			else {
5300			    Py_DECREF(repunicode);
5301			    raise_encode_exception(&exc, encoding,
5302				s, length, collstart-s, collend-s, reason);
5303			    goto onError;
5304			}
5305		    }
5306		}
5307		p = s + newpos;
5308		Py_DECREF(repunicode);
5309	}
5310    }
5311    /* 0-terminate the output string */
5312    *output++ = '\0';
5313    Py_XDECREF(exc);
5314    Py_XDECREF(errorHandler);
5315    return 0;
5316
5317 onError:
5318    Py_XDECREF(exc);
5319    Py_XDECREF(errorHandler);
5320    return -1;
5321}
5322
5323/* --- Helpers ------------------------------------------------------------ */
5324
5325#include "stringlib/unicodedefs.h"
5326#include "stringlib/fastsearch.h"
5327#include "stringlib/count.h"
5328/* Include _ParseTupleFinds from find.h */
5329#define FROM_UNICODE
5330#include "stringlib/find.h"
5331#include "stringlib/partition.h"
5332
5333#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
5334#include "stringlib/localeutil.h"
5335
/* helper macro to fixup start/end slice values
 *
 * Works on the variables named `start` and `end` in the expanding scope;
 * `obj` must be an expression with a `->length` member.
 *   - a negative start is offset by the length, then clamped to 0;
 *   - end is first capped at the length, then a negative end is offset by
 *     the length and clamped to 0.
 * start is not capped at the length, so start > end is possible afterwards.
 *
 * NOTE: this expands to a bare sequence of `if` statements (no do/while
 * wrapper), so it must only be used as a complete statement inside a braced
 * block, never as the unbraced body of an if/else or loop. */
#define FIX_START_END(obj)                      \
    if (start < 0)                              \
        start += (obj)->length;                 \
    if (start < 0)                              \
        start = 0;                              \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0)                                \
        end += (obj)->length;                   \
    if (end < 0)                                \
        end = 0;
5348
5349Py_ssize_t PyUnicode_Count(PyObject *str,
5350                           PyObject *substr,
5351                           Py_ssize_t start,
5352                           Py_ssize_t end)
5353{
5354    Py_ssize_t result;
5355    PyUnicodeObject* str_obj;
5356    PyUnicodeObject* sub_obj;
5357
5358    str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5359    if (!str_obj)
5360	return -1;
5361    sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5362    if (!sub_obj) {
5363	Py_DECREF(str_obj);
5364	return -1;
5365    }
5366
5367    FIX_START_END(str_obj);
5368
5369    result = stringlib_count(
5370        str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5371        );
5372
5373    Py_DECREF(sub_obj);
5374    Py_DECREF(str_obj);
5375
5376    return result;
5377}
5378
5379Py_ssize_t PyUnicode_Find(PyObject *str,
5380                          PyObject *sub,
5381                          Py_ssize_t start,
5382                          Py_ssize_t end,
5383                          int direction)
5384{
5385    Py_ssize_t result;
5386
5387    str = PyUnicode_FromObject(str);
5388    if (!str)
5389	return -2;
5390    sub = PyUnicode_FromObject(sub);
5391    if (!sub) {
5392	Py_DECREF(str);
5393	return -2;
5394    }
5395
5396    if (direction > 0)
5397        result = stringlib_find_slice(
5398            PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5399            PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5400            start, end
5401            );
5402    else
5403        result = stringlib_rfind_slice(
5404            PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5405            PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5406            start, end
5407            );
5408
5409    Py_DECREF(str);
5410    Py_DECREF(sub);
5411
5412    return result;
5413}
5414
5415static
5416int tailmatch(PyUnicodeObject *self,
5417	      PyUnicodeObject *substring,
5418	      Py_ssize_t start,
5419	      Py_ssize_t end,
5420	      int direction)
5421{
5422    if (substring->length == 0)
5423        return 1;
5424
5425    FIX_START_END(self);
5426
5427    end -= substring->length;
5428    if (end < start)
5429	return 0;
5430
5431    if (direction > 0) {
5432	if (Py_UNICODE_MATCH(self, end, substring))
5433	    return 1;
5434    } else {
5435        if (Py_UNICODE_MATCH(self, start, substring))
5436	    return 1;
5437    }
5438
5439    return 0;
5440}
5441
5442Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
5443			PyObject *substr,
5444			Py_ssize_t start,
5445			Py_ssize_t end,
5446			int direction)
5447{
5448    Py_ssize_t result;
5449
5450    str = PyUnicode_FromObject(str);
5451    if (str == NULL)
5452	return -1;
5453    substr = PyUnicode_FromObject(substr);
5454    if (substr == NULL) {
5455	Py_DECREF(str);
5456	return -1;
5457    }
5458
5459    result = tailmatch((PyUnicodeObject *)str,
5460		       (PyUnicodeObject *)substr,
5461		       start, end, direction);
5462    Py_DECREF(str);
5463    Py_DECREF(substr);
5464    return result;
5465}
5466
5467/* Apply fixfct filter to the Unicode object self and return a
5468   reference to the modified object */
5469
5470static
5471PyObject *fixup(PyUnicodeObject *self,
5472		int (*fixfct)(PyUnicodeObject *s))
5473{
5474
5475    PyUnicodeObject *u;
5476
5477    u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5478    if (u == NULL)
5479	return NULL;
5480
5481    Py_UNICODE_COPY(u->str, self->str, self->length);
5482
5483    if (!fixfct(u) && PyUnicode_CheckExact(self)) {
5484	/* fixfct should return TRUE if it modified the buffer. If
5485	   FALSE, return a reference to the original buffer instead
5486	   (to save space, not time) */
5487	Py_INCREF(self);
5488	Py_DECREF(u);
5489	return (PyObject*) self;
5490    }
5491    return (PyObject*) u;
5492}
5493
5494static
5495int fixupper(PyUnicodeObject *self)
5496{
5497    Py_ssize_t len = self->length;
5498    Py_UNICODE *s = self->str;
5499    int status = 0;
5500
5501    while (len-- > 0) {
5502	register Py_UNICODE ch;
5503
5504	ch = Py_UNICODE_TOUPPER(*s);
5505	if (ch != *s) {
5506            status = 1;
5507	    *s = ch;
5508	}
5509        s++;
5510    }
5511
5512    return status;
5513}
5514
5515static
5516int fixlower(PyUnicodeObject *self)
5517{
5518    Py_ssize_t len = self->length;
5519    Py_UNICODE *s = self->str;
5520    int status = 0;
5521
5522    while (len-- > 0) {
5523	register Py_UNICODE ch;
5524
5525	ch = Py_UNICODE_TOLOWER(*s);
5526	if (ch != *s) {
5527            status = 1;
5528	    *s = ch;
5529	}
5530        s++;
5531    }
5532
5533    return status;
5534}
5535
5536static
5537int fixswapcase(PyUnicodeObject *self)
5538{
5539    Py_ssize_t len = self->length;
5540    Py_UNICODE *s = self->str;
5541    int status = 0;
5542
5543    while (len-- > 0) {
5544        if (Py_UNICODE_ISUPPER(*s)) {
5545            *s = Py_UNICODE_TOLOWER(*s);
5546            status = 1;
5547        } else if (Py_UNICODE_ISLOWER(*s)) {
5548            *s = Py_UNICODE_TOUPPER(*s);
5549            status = 1;
5550        }
5551        s++;
5552    }
5553
5554    return status;
5555}
5556
5557static
5558int fixcapitalize(PyUnicodeObject *self)
5559{
5560    Py_ssize_t len = self->length;
5561    Py_UNICODE *s = self->str;
5562    int status = 0;
5563
5564    if (len == 0)
5565	return 0;
5566    if (Py_UNICODE_ISLOWER(*s)) {
5567	*s = Py_UNICODE_TOUPPER(*s);
5568	status = 1;
5569    }
5570    s++;
5571    while (--len > 0) {
5572        if (Py_UNICODE_ISUPPER(*s)) {
5573            *s = Py_UNICODE_TOLOWER(*s);
5574            status = 1;
5575        }
5576        s++;
5577    }
5578    return status;
5579}
5580
5581static
5582int fixtitle(PyUnicodeObject *self)
5583{
5584    register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5585    register Py_UNICODE *e;
5586    int previous_is_cased;
5587
5588    /* Shortcut for single character strings */
5589    if (PyUnicode_GET_SIZE(self) == 1) {
5590	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5591	if (*p != ch) {
5592	    *p = ch;
5593	    return 1;
5594	}
5595	else
5596	    return 0;
5597    }
5598
5599    e = p + PyUnicode_GET_SIZE(self);
5600    previous_is_cased = 0;
5601    for (; p < e; p++) {
5602	register const Py_UNICODE ch = *p;
5603
5604	if (previous_is_cased)
5605	    *p = Py_UNICODE_TOLOWER(ch);
5606	else
5607	    *p = Py_UNICODE_TOTITLE(ch);
5608
5609	if (Py_UNICODE_ISLOWER(ch) ||
5610	    Py_UNICODE_ISUPPER(ch) ||
5611	    Py_UNICODE_ISTITLE(ch))
5612	    previous_is_cased = 1;
5613	else
5614	    previous_is_cased = 0;
5615    }
5616    return 1;
5617}
5618
5619PyObject *
5620PyUnicode_Join(PyObject *separator, PyObject *seq)
5621{
5622    const Py_UNICODE blank = ' ';
5623    const Py_UNICODE *sep = &blank;
5624    Py_ssize_t seplen = 1;
5625    PyUnicodeObject *res = NULL; /* the result */
5626    Py_UNICODE *res_p;       /* pointer to free byte in res's string area */
5627    PyObject *fseq;          /* PySequence_Fast(seq) */
5628    Py_ssize_t seqlen;       /* len(fseq) -- number of items in sequence */
5629    PyObject **items;
5630    PyObject *item;
5631    Py_ssize_t sz, i;
5632
5633    fseq = PySequence_Fast(seq, "");
5634    if (fseq == NULL) {
5635    	return NULL;
5636    }
5637
5638    /* NOTE: the following code can't call back into Python code,
5639     * so we are sure that fseq won't be mutated.
5640     */
5641
5642    seqlen = PySequence_Fast_GET_SIZE(fseq);
5643    /* If empty sequence, return u"". */
5644    if (seqlen == 0) {
5645    	res = _PyUnicode_New(0);  /* empty sequence; return u"" */
5646    	goto Done;
5647    }
5648    items = PySequence_Fast_ITEMS(fseq);
5649    /* If singleton sequence with an exact Unicode, return that. */
5650    if (seqlen == 1) {
5651	item = items[0];
5652	if (PyUnicode_CheckExact(item)) {
5653	    Py_INCREF(item);
5654	    res = (PyUnicodeObject *)item;
5655	    goto Done;
5656	}
5657    }
5658    else {
5659        /* Set up sep and seplen */
5660        if (separator == NULL) {
5661            sep = &blank;
5662            seplen = 1;
5663        }
5664        else {
5665            if (!PyUnicode_Check(separator)) {
5666                PyErr_Format(PyExc_TypeError,
5667                             "separator: expected str instance,"
5668                             " %.80s found",
5669                             Py_TYPE(separator)->tp_name);
5670                goto onError;
5671            }
5672            sep = PyUnicode_AS_UNICODE(separator);
5673            seplen = PyUnicode_GET_SIZE(separator);
5674        }
5675    }
5676
5677    /* There are at least two things to join, or else we have a subclass
5678     * of str in the sequence.
5679     * Do a pre-pass to figure out the total amount of space we'll
5680     * need (sz), and see whether all argument are strings.
5681     */
5682    sz = 0;
5683    for (i = 0; i < seqlen; i++) {
5684        const Py_ssize_t old_sz = sz;
5685        item = items[i];
5686	if (!PyUnicode_Check(item)) {
5687	    PyErr_Format(PyExc_TypeError,
5688			 "sequence item %zd: expected str instance,"
5689			 " %.80s found",
5690			 i, Py_TYPE(item)->tp_name);
5691	    goto onError;
5692	}
5693        sz += PyUnicode_GET_SIZE(item);
5694        if (i != 0)
5695            sz += seplen;
5696        if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
5697            PyErr_SetString(PyExc_OverflowError,
5698                "join() result is too long for a Python string");
5699            goto onError;
5700        }
5701    }
5702
5703    res = _PyUnicode_New(sz);
5704    if (res == NULL)
5705        goto onError;
5706
5707    /* Catenate everything. */
5708    res_p = PyUnicode_AS_UNICODE(res);
5709    for (i = 0; i < seqlen; ++i) {
5710        Py_ssize_t itemlen;
5711        item = items[i];
5712        itemlen = PyUnicode_GET_SIZE(item);
5713	/* Copy item, and maybe the separator. */
5714	if (i) {
5715	    Py_UNICODE_COPY(res_p, sep, seplen);
5716	    res_p += seplen;
5717	}
5718	Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5719	res_p += itemlen;
5720    }
5721
5722 Done:
5723    Py_DECREF(fseq);
5724    return (PyObject *)res;
5725
5726 onError:
5727    Py_DECREF(fseq);
5728    Py_XDECREF(res);
5729    return NULL;
5730}
5731
5732static
5733PyUnicodeObject *pad(PyUnicodeObject *self,
5734		     Py_ssize_t left,
5735		     Py_ssize_t right,
5736		     Py_UNICODE fill)
5737{
5738    PyUnicodeObject *u;
5739
5740    if (left < 0)
5741        left = 0;
5742    if (right < 0)
5743        right = 0;
5744
5745    if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
5746        Py_INCREF(self);
5747        return self;
5748    }
5749
5750    u = _PyUnicode_New(left + self->length + right);
5751    if (u) {
5752        if (left)
5753            Py_UNICODE_FILL(u->str, fill, left);
5754        Py_UNICODE_COPY(u->str + left, self->str, self->length);
5755        if (right)
5756            Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5757    }
5758
5759    return u;
5760}
5761
/* Append the slice data[left:right] to the list being built.
 *
 * Relies on names from the expanding function: a `PyObject *str` scratch
 * variable, the target `list`, and an `onError` label that is jumped to
 * when either the slice cannot be created or PyList_Append() fails.  The
 * temporary reference held in `str` is released on both the success and
 * the append-failure path.
 *
 * NOTE: expands to several statements ending in an if/else, so use it only
 * as a complete statement inside a braced block. */
#define SPLIT_APPEND(data, left, right)					\
	str = PyUnicode_FromUnicode((data) + (left), (right) - (left));	\
	if (!str)							\
	    goto onError;						\
	if (PyList_Append(list, str)) {					\
	    Py_DECREF(str);						\
	    goto onError;						\
	}								\
        else								\
            Py_DECREF(str);
5772
5773static
5774PyObject *split_whitespace(PyUnicodeObject *self,
5775			   PyObject *list,
5776			   Py_ssize_t maxcount)
5777{
5778    register Py_ssize_t i;
5779    register Py_ssize_t j;
5780    Py_ssize_t len = self->length;
5781    PyObject *str;
5782    register const Py_UNICODE *buf = self->str;
5783
5784    for (i = j = 0; i < len; ) {
5785	/* find a token */
5786	while (i < len && Py_UNICODE_ISSPACE(buf[i]))
5787	    i++;
5788	j = i;
5789	while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
5790	    i++;
5791	if (j < i) {
5792	    if (maxcount-- <= 0)
5793		break;
5794	    SPLIT_APPEND(buf, j, i);
5795	    while (i < len && Py_UNICODE_ISSPACE(buf[i]))
5796		i++;
5797	    j = i;
5798	}
5799    }
5800    if (j < len) {
5801	SPLIT_APPEND(buf, j, len);
5802    }
5803    return list;
5804
5805 onError:
5806    Py_DECREF(list);
5807    return NULL;
5808}
5809
5810PyObject *PyUnicode_Splitlines(PyObject *string,
5811			       int keepends)
5812{
5813    register Py_ssize_t i;
5814    register Py_ssize_t j;
5815    Py_ssize_t len;
5816    PyObject *list;
5817    PyObject *str;
5818    Py_UNICODE *data;
5819
5820    string = PyUnicode_FromObject(string);
5821    if (string == NULL)
5822	return NULL;
5823    data = PyUnicode_AS_UNICODE(string);
5824    len = PyUnicode_GET_SIZE(string);
5825
5826    list = PyList_New(0);
5827    if (!list)
5828        goto onError;
5829
5830    for (i = j = 0; i < len; ) {
5831	Py_ssize_t eol;
5832
5833	/* Find a line and append it */
5834	while (i < len && !BLOOM_LINEBREAK(data[i]))
5835	    i++;
5836
5837	/* Skip the line break reading CRLF as one line break */
5838	eol = i;
5839	if (i < len) {
5840	    if (data[i] == '\r' && i + 1 < len &&
5841		data[i+1] == '\n')
5842		i += 2;
5843	    else
5844		i++;
5845	    if (keepends)
5846		eol = i;
5847	}
5848	SPLIT_APPEND(data, j, eol);
5849	j = i;
5850    }
5851    if (j < len) {
5852	SPLIT_APPEND(data, j, len);
5853    }
5854
5855    Py_DECREF(string);
5856    return list;
5857
5858 onError:
5859    Py_XDECREF(list);
5860    Py_DECREF(string);
5861    return NULL;
5862}
5863
5864static
5865PyObject *split_char(PyUnicodeObject *self,
5866		     PyObject *list,
5867		     Py_UNICODE ch,
5868		     Py_ssize_t maxcount)
5869{
5870    register Py_ssize_t i;
5871    register Py_ssize_t j;
5872    Py_ssize_t len = self->length;
5873    PyObject *str;
5874    register const Py_UNICODE *buf = self->str;
5875
5876    for (i = j = 0; i < len; ) {
5877	if (buf[i] == ch) {
5878	    if (maxcount-- <= 0)
5879		break;
5880	    SPLIT_APPEND(buf, j, i);
5881	    i = j = i + 1;
5882	} else
5883	    i++;
5884    }
5885    if (j <= len) {
5886	SPLIT_APPEND(buf, j, len);
5887    }
5888    return list;
5889
5890 onError:
5891    Py_DECREF(list);
5892    return NULL;
5893}
5894
5895static
5896PyObject *split_substring(PyUnicodeObject *self,
5897			  PyObject *list,
5898			  PyUnicodeObject *substring,
5899			  Py_ssize_t maxcount)
5900{
5901    register Py_ssize_t i;
5902    register Py_ssize_t j;
5903    Py_ssize_t len = self->length;
5904    Py_ssize_t sublen = substring->length;
5905    PyObject *str;
5906
5907    for (i = j = 0; i <= len - sublen; ) {
5908	if (Py_UNICODE_MATCH(self, i, substring)) {
5909	    if (maxcount-- <= 0)
5910		break;
5911	    SPLIT_APPEND(self->str, j, i);
5912	    i = j = i + sublen;
5913	} else
5914	    i++;
5915    }
5916    if (j <= len) {
5917	SPLIT_APPEND(self->str, j, len);
5918    }
5919    return list;
5920
5921 onError:
5922    Py_DECREF(list);
5923    return NULL;
5924}
5925
5926static
5927PyObject *rsplit_whitespace(PyUnicodeObject *self,
5928			    PyObject *list,
5929			    Py_ssize_t maxcount)
5930{
5931    register Py_ssize_t i;
5932    register Py_ssize_t j;
5933    Py_ssize_t len = self->length;
5934    PyObject *str;
5935    register const Py_UNICODE *buf = self->str;
5936
5937    for (i = j = len - 1; i >= 0; ) {
5938	/* find a token */
5939	while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
5940	    i--;
5941	j = i;
5942	while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
5943	    i--;
5944	if (j > i) {
5945	    if (maxcount-- <= 0)
5946		break;
5947	    SPLIT_APPEND(buf, i + 1, j + 1);
5948	    while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
5949		i--;
5950	    j = i;
5951	}
5952    }
5953    if (j >= 0) {
5954	SPLIT_APPEND(buf, 0, j + 1);
5955    }
5956    if (PyList_Reverse(list) < 0)
5957        goto onError;
5958    return list;
5959
5960 onError:
5961    Py_DECREF(list);
5962    return NULL;
5963}
5964
5965static
5966PyObject *rsplit_char(PyUnicodeObject *self,
5967		      PyObject *list,
5968		      Py_UNICODE ch,
5969		      Py_ssize_t maxcount)
5970{
5971    register Py_ssize_t i;
5972    register Py_ssize_t j;
5973    Py_ssize_t len = self->length;
5974    PyObject *str;
5975    register const Py_UNICODE *buf = self->str;
5976
5977    for (i = j = len - 1; i >= 0; ) {
5978	if (buf[i] == ch) {
5979	    if (maxcount-- <= 0)
5980		break;
5981	    SPLIT_APPEND(buf, i + 1, j + 1);
5982	    j = i = i - 1;
5983	} else
5984	    i--;
5985    }
5986    if (j >= -1) {
5987	SPLIT_APPEND(buf, 0, j + 1);
5988    }
5989    if (PyList_Reverse(list) < 0)
5990        goto onError;
5991    return list;
5992
5993 onError:
5994    Py_DECREF(list);
5995    return NULL;
5996}
5997
5998static
5999PyObject *rsplit_substring(PyUnicodeObject *self,
6000			   PyObject *list,
6001			   PyUnicodeObject *substring,
6002			   Py_ssize_t maxcount)
6003{
6004    register Py_ssize_t i;
6005    register Py_ssize_t j;
6006    Py_ssize_t len = self->length;
6007    Py_ssize_t sublen = substring->length;
6008    PyObject *str;
6009
6010    for (i = len - sublen, j = len; i >= 0; ) {
6011	if (Py_UNICODE_MATCH(self, i, substring)) {
6012	    if (maxcount-- <= 0)
6013		break;
6014	    SPLIT_APPEND(self->str, i + sublen, j);
6015	    j = i;
6016	    i -= sublen;
6017	} else
6018	    i--;
6019    }
6020    if (j >= 0) {
6021	SPLIT_APPEND(self->str, 0, j);
6022    }
6023    if (PyList_Reverse(list) < 0)
6024        goto onError;
6025    return list;
6026
6027 onError:
6028    Py_DECREF(list);
6029    return NULL;
6030}
6031
6032#undef SPLIT_APPEND
6033
6034static
6035PyObject *split(PyUnicodeObject *self,
6036		PyUnicodeObject *substring,
6037		Py_ssize_t maxcount)
6038{
6039    PyObject *list;
6040
6041    if (maxcount < 0)
6042        maxcount = PY_SSIZE_T_MAX;
6043
6044    list = PyList_New(0);
6045    if (!list)
6046        return NULL;
6047
6048    if (substring == NULL)
6049	return split_whitespace(self,list,maxcount);
6050
6051    else if (substring->length == 1)
6052	return split_char(self,list,substring->str[0],maxcount);
6053
6054    else if (substring->length == 0) {
6055	Py_DECREF(list);
6056	PyErr_SetString(PyExc_ValueError, "empty separator");
6057	return NULL;
6058    }
6059    else
6060	return split_substring(self,list,substring,maxcount);
6061}
6062
6063static
6064PyObject *rsplit(PyUnicodeObject *self,
6065		 PyUnicodeObject *substring,
6066		 Py_ssize_t maxcount)
6067{
6068    PyObject *list;
6069
6070    if (maxcount < 0)
6071        maxcount = PY_SSIZE_T_MAX;
6072
6073    list = PyList_New(0);
6074    if (!list)
6075        return NULL;
6076
6077    if (substring == NULL)
6078	return rsplit_whitespace(self,list,maxcount);
6079
6080    else if (substring->length == 1)
6081	return rsplit_char(self,list,substring->str[0],maxcount);
6082
6083    else if (substring->length == 0) {
6084	Py_DECREF(list);
6085	PyErr_SetString(PyExc_ValueError, "empty separator");
6086	return NULL;
6087    }
6088    else
6089	return rsplit_substring(self,list,substring,maxcount);
6090}
6091
6092static
6093PyObject *replace(PyUnicodeObject *self,
6094		  PyUnicodeObject *str1,
6095		  PyUnicodeObject *str2,
6096		  Py_ssize_t maxcount)
6097{
6098    PyUnicodeObject *u;
6099
6100    if (maxcount < 0)
6101	maxcount = PY_SSIZE_T_MAX;
6102
6103    if (str1->length == str2->length) {
6104        /* same length */
6105        Py_ssize_t i;
6106        if (str1->length == 1) {
6107            /* replace characters */
6108            Py_UNICODE u1, u2;
6109            if (!findchar(self->str, self->length, str1->str[0]))
6110                goto nothing;
6111            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6112            if (!u)
6113                return NULL;
6114            Py_UNICODE_COPY(u->str, self->str, self->length);
6115            u1 = str1->str[0];
6116            u2 = str2->str[0];
6117            for (i = 0; i < u->length; i++)
6118                if (u->str[i] == u1) {
6119                    if (--maxcount < 0)
6120                        break;
6121                    u->str[i] = u2;
6122                }
6123        } else {
6124            i = fastsearch(
6125                self->str, self->length, str1->str, str1->length, FAST_SEARCH
6126                );
6127            if (i < 0)
6128                goto nothing;
6129            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6130            if (!u)
6131                return NULL;
6132            Py_UNICODE_COPY(u->str, self->str, self->length);
6133            while (i <= self->length - str1->length)
6134                if (Py_UNICODE_MATCH(self, i, str1)) {
6135                    if (--maxcount < 0)
6136                        break;
6137                    Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6138                    i += str1->length;
6139                } else
6140                    i++;
6141        }
6142    } else {
6143
6144        Py_ssize_t n, i, j, e;
6145        Py_ssize_t product, new_size, delta;
6146        Py_UNICODE *p;
6147
6148        /* replace strings */
6149        n = stringlib_count(self->str, self->length, str1->str, str1->length);
6150        if (n > maxcount)
6151            n = maxcount;
6152        if (n == 0)
6153            goto nothing;
6154        /* new_size = self->length + n * (str2->length - str1->length)); */
6155        delta = (str2->length - str1->length);
6156        if (delta == 0) {
6157            new_size = self->length;
6158        } else {
6159            product = n * (str2->length - str1->length);
6160            if ((product / (str2->length - str1->length)) != n) {
6161                PyErr_SetString(PyExc_OverflowError,
6162                                "replace string is too long");
6163                return NULL;
6164            }
6165            new_size = self->length + product;
6166            if (new_size < 0) {
6167                PyErr_SetString(PyExc_OverflowError,
6168                                "replace string is too long");
6169                return NULL;
6170            }
6171        }
6172        u = _PyUnicode_New(new_size);
6173        if (!u)
6174            return NULL;
6175        i = 0;
6176        p = u->str;
6177        e = self->length - str1->length;
6178        if (str1->length > 0) {
6179            while (n-- > 0) {
6180                /* look for next match */
6181                j = i;
6182                while (j <= e) {
6183                    if (Py_UNICODE_MATCH(self, j, str1))
6184                        break;
6185                    j++;
6186                }
6187		if (j > i) {
6188                    if (j > e)
6189                        break;
6190                    /* copy unchanged part [i:j] */
6191                    Py_UNICODE_COPY(p, self->str+i, j-i);
6192                    p += j - i;
6193                }
6194                /* copy substitution string */
6195                if (str2->length > 0) {
6196                    Py_UNICODE_COPY(p, str2->str, str2->length);
6197                    p += str2->length;
6198                }
6199                i = j + str1->length;
6200            }
6201            if (i < self->length)
6202                /* copy tail [i:] */
6203                Py_UNICODE_COPY(p, self->str+i, self->length-i);
6204        } else {
6205            /* interleave */
6206            while (n > 0) {
6207                Py_UNICODE_COPY(p, str2->str, str2->length);
6208                p += str2->length;
6209                if (--n <= 0)
6210                    break;
6211                *p++ = self->str[i++];
6212            }
6213            Py_UNICODE_COPY(p, self->str+i, self->length-i);
6214        }
6215    }
6216    return (PyObject *) u;
6217
6218nothing:
6219    /* nothing to replace; return original string (when possible) */
6220    if (PyUnicode_CheckExact(self)) {
6221        Py_INCREF(self);
6222        return (PyObject *) self;
6223    }
6224    return PyUnicode_FromUnicode(self->str, self->length);
6225}
6226
6227/* --- Unicode Object Methods --------------------------------------------- */
6228
6229PyDoc_STRVAR(title__doc__,
6230"S.title() -> str\n\
6231\n\
6232Return a titlecased version of S, i.e. words start with title case\n\
6233characters, all remaining cased characters have lower case.");
6234
6235static PyObject*
6236unicode_title(PyUnicodeObject *self)
6237{
6238    return fixup(self, fixtitle);
6239}
6240
6241PyDoc_STRVAR(capitalize__doc__,
6242"S.capitalize() -> str\n\
6243\n\
6244Return a capitalized version of S, i.e. make the first character\n\
6245have upper case.");
6246
6247static PyObject*
6248unicode_capitalize(PyUnicodeObject *self)
6249{
6250    return fixup(self, fixcapitalize);
6251}
6252
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).

   Splits self with split(self, NULL, -1), replaces every list item by the
   result of fixup(item, fixcapitalize), and joins the list again with
   PyUnicode_Join(NULL, list).  Returns NULL if any step fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words = split(self, NULL, -1);
    PyObject *joined;
    Py_ssize_t idx = 0;

    if (words == NULL)
        return NULL;

    /* Swap each word for its capitalized form, in place. */
    while (idx < PyList_GET_SIZE(words)) {
        PyObject *capitalized =
            fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                  fixcapitalize);

        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* PyList_SET_ITEM does not release the old item, so do it here. */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
        idx++;
    }

    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
6290
6291/* Argument converter.  Coerces to a single unicode character */
6292
6293static int
6294convert_uc(PyObject *obj, void *addr)
6295{
6296	Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6297	PyObject *uniobj;
6298	Py_UNICODE *unistr;
6299
6300	uniobj = PyUnicode_FromObject(obj);
6301	if (uniobj == NULL) {
6302		PyErr_SetString(PyExc_TypeError,
6303			"The fill character cannot be converted to Unicode");
6304		return 0;
6305	}
6306	if (PyUnicode_GET_SIZE(uniobj) != 1) {
6307		PyErr_SetString(PyExc_TypeError,
6308			"The fill character must be exactly one character long");
6309		Py_DECREF(uniobj);
6310		return 0;
6311	}
6312	unistr = PyUnicode_AS_UNICODE(uniobj);
6313	*fillcharloc = unistr[0];
6314	Py_DECREF(uniobj);
6315	return 1;
6316}
6317
6318PyDoc_STRVAR(center__doc__,
6319"S.center(width[, fillchar]) -> str\n\
6320\n\
6321Return S centered in a string of length width. Padding is\n\
6322done using the specified fill character (default is a space)");
6323
6324static PyObject *
6325unicode_center(PyUnicodeObject *self, PyObject *args)
6326{
6327    Py_ssize_t marg, left;
6328    Py_ssize_t width;
6329    Py_UNICODE fillchar = ' ';
6330
6331    if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
6332        return NULL;
6333
6334    if (self->length >= width && PyUnicode_CheckExact(self)) {
6335        Py_INCREF(self);
6336        return (PyObject*) self;
6337    }
6338
6339    marg = width - self->length;
6340    left = marg / 2 + (marg & width & 1);
6341
6342    return (PyObject*) pad(self, left, marg - left, fillchar);
6343}
6344
6345#if 0
6346
6347/* This code should go into some future Unicode collation support
6348   module. The basic comparison should compare ordinals on a naive
6349   basis (this is what Java does and thus JPython too). */
6350
6351/* speedy UTF-16 code point order comparison */
6352/* gleaned from: */
6353/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6354
6355static short utf16Fixup[32] =
6356{
6357    0, 0, 0, 0, 0, 0, 0, 0,
6358    0, 0, 0, 0, 0, 0, 0, 0,
6359    0, 0, 0, 0, 0, 0, 0, 0,
6360    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
6361};
6362
6363static int
6364unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6365{
6366    Py_ssize_t len1, len2;
6367
6368    Py_UNICODE *s1 = str1->str;
6369    Py_UNICODE *s2 = str2->str;
6370
6371    len1 = str1->length;
6372    len2 = str2->length;
6373
6374    while (len1 > 0 && len2 > 0) {
6375        Py_UNICODE c1, c2;
6376
6377        c1 = *s1++;
6378        c2 = *s2++;
6379
6380	if (c1 > (1<<11) * 26)
6381	    c1 += utf16Fixup[c1>>11];
6382	if (c2 > (1<<11) * 26)
6383            c2 += utf16Fixup[c2>>11];
6384        /* now c1 and c2 are in UTF-32-compatible order */
6385
6386        if (c1 != c2)
6387            return (c1 < c2) ? -1 : 1;
6388
6389        len1--; len2--;
6390    }
6391
6392    return (len1 < len2) ? -1 : (len1 != len2);
6393}
6394
6395#else
6396
6397static int
6398unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6399{
6400    register Py_ssize_t len1, len2;
6401
6402    Py_UNICODE *s1 = str1->str;
6403    Py_UNICODE *s2 = str2->str;
6404
6405    len1 = str1->length;
6406    len2 = str2->length;
6407
6408    while (len1 > 0 && len2 > 0) {
6409        Py_UNICODE c1, c2;
6410
6411        c1 = *s1++;
6412        c2 = *s2++;
6413
6414        if (c1 != c2)
6415            return (c1 < c2) ? -1 : 1;
6416
6417        len1--; len2--;
6418    }
6419
6420    return (len1 < len2) ? -1 : (len1 != len2);
6421}
6422
6423#endif
6424
6425int PyUnicode_Compare(PyObject *left,
6426		      PyObject *right)
6427{
6428    if (PyUnicode_Check(left) && PyUnicode_Check(right))
6429        return unicode_compare((PyUnicodeObject *)left,
6430                               (PyUnicodeObject *)right);
6431    PyErr_Format(PyExc_TypeError,
6432                 "Can't compare %.100s and %.100s",
6433                 left->ob_type->tp_name,
6434                 right->ob_type->tp_name);
6435    return -1;
6436}
6437
6438int
6439PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6440{
6441    int i;
6442    Py_UNICODE *id;
6443    assert(PyUnicode_Check(uni));
6444    id = PyUnicode_AS_UNICODE(uni);
6445    /* Compare Unicode string and source character set string */
6446    for (i = 0; id[i] && str[i]; i++)
6447	if (id[i] != str[i])
6448	    return ((int)id[i] < (int)str[i]) ? -1 : 1;
6449    if (id[i])
6450	return 1; /* uni is longer */
6451    if (str[i])
6452	return -1; /* str is longer */
6453    return 0;
6454}
6455
6456PyObject *PyUnicode_RichCompare(PyObject *left,
6457                                PyObject *right,
6458                                int op)
6459{
6460    int result;
6461
6462    result = PyUnicode_Compare(left, right);
6463    if (result == -1 && PyErr_Occurred())
6464        goto onError;
6465
6466    /* Convert the return value to a Boolean */
6467    switch (op) {
6468    case Py_EQ:
6469        result = (result == 0);
6470        break;
6471    case Py_NE:
6472        result = (result != 0);
6473        break;
6474    case Py_LE:
6475        result = (result <= 0);
6476        break;
6477    case Py_GE:
6478        result = (result >= 0);
6479        break;
6480    case Py_LT:
6481        result = (result == -1);
6482        break;
6483    case Py_GT:
6484        result = (result == 1);
6485        break;
6486    }
6487    return PyBool_FromLong(result);
6488
6489 onError:
6490
6491    /* Standard case
6492
6493       Type errors mean that PyUnicode_FromObject() could not convert
6494       one of the arguments (usually the right hand side) to Unicode,
6495       ie. we can't handle the comparison request. However, it is
6496       possible that the other object knows a comparison method, which
6497       is why we return Py_NotImplemented to give the other object a
6498       chance.
6499
6500    */
6501    if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6502        PyErr_Clear();
6503        Py_INCREF(Py_NotImplemented);
6504        return Py_NotImplemented;
6505    }
6506    if (op != Py_EQ && op != Py_NE)
6507        return NULL;
6508
6509    /* Equality comparison.
6510
6511       This is a special case: we silence any PyExc_UnicodeDecodeError
6512       and instead turn it into a PyErr_UnicodeWarning.
6513
6514    */
6515    if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6516        return NULL;
6517    PyErr_Clear();
6518    if (PyErr_WarnEx(PyExc_UnicodeWarning,
6519                     (op == Py_EQ) ?
6520                     "equal comparison "
6521                     "failed to convert both arguments to str - "
6522                     "interpreting them as being unequal"
6523                     :
6524                     "Unicode unequal comparison "
6525                     "failed to convert both arguments to str - "
6526                     "interpreting them as being unequal",
6527                     1) < 0)
6528        return NULL;
6529    result = (op == Py_NE);
6530    return PyBool_FromLong(result);
6531}
6532
6533int PyUnicode_Contains(PyObject *container,
6534		       PyObject *element)
6535{
6536    PyObject *str, *sub;
6537    int result;
6538
6539    /* Coerce the two arguments */
6540    sub = PyUnicode_FromObject(element);
6541    if (!sub) {
6542	PyErr_Format(PyExc_TypeError,
6543	    "'in <string>' requires string as left operand, not %s",
6544	    element->ob_type->tp_name);
6545        return -1;
6546    }
6547
6548    str = PyUnicode_FromObject(container);
6549    if (!str) {
6550        Py_DECREF(sub);
6551        return -1;
6552    }
6553
6554    result = stringlib_contains_obj(str, sub);
6555
6556    Py_DECREF(str);
6557    Py_DECREF(sub);
6558
6559    return result;
6560}
6561
6562/* Concat to string or Unicode object giving a new Unicode object. */
6563
6564PyObject *PyUnicode_Concat(PyObject *left,
6565			   PyObject *right)
6566{
6567    PyUnicodeObject *u = NULL, *v = NULL, *w;
6568
6569    /* Coerce the two arguments */
6570    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6571    if (u == NULL)
6572	goto onError;
6573    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6574    if (v == NULL)
6575	goto onError;
6576
6577    /* Shortcuts */
6578    if (v == unicode_empty) {
6579	Py_DECREF(v);
6580	return (PyObject *)u;
6581    }
6582    if (u == unicode_empty) {
6583	Py_DECREF(u);
6584	return (PyObject *)v;
6585    }
6586
6587    /* Concat the two Unicode strings */
6588    w = _PyUnicode_New(u->length + v->length);
6589    if (w == NULL)
6590	goto onError;
6591    Py_UNICODE_COPY(w->str, u->str, u->length);
6592    Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6593
6594    Py_DECREF(u);
6595    Py_DECREF(v);
6596    return (PyObject *)w;
6597
6598onError:
6599    Py_XDECREF(u);
6600    Py_XDECREF(v);
6601    return NULL;
6602}
6603
6604void
6605PyUnicode_Append(PyObject **pleft, PyObject *right)
6606{
6607	PyObject *new;
6608	if (*pleft == NULL)
6609		return;
6610	if (right == NULL || !PyUnicode_Check(*pleft)) {
6611		Py_DECREF(*pleft);
6612		*pleft = NULL;
6613		return;
6614	}
6615	new = PyUnicode_Concat(*pleft, right);
6616	Py_DECREF(*pleft);
6617	*pleft = new;
6618}
6619
6620void
6621PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6622{
6623	PyUnicode_Append(pleft, right);
6624	Py_XDECREF(right);
6625}
6626
6627PyDoc_STRVAR(count__doc__,
6628"S.count(sub[, start[, end]]) -> int\n\
6629\n\
6630Return the number of non-overlapping occurrences of substring sub in\n\
6631string S[start:end].  Optional arguments start and end are\n\
6632interpreted as in slice notation.");
6633
6634static PyObject *
6635unicode_count(PyUnicodeObject *self, PyObject *args)
6636{
6637    PyUnicodeObject *substring;
6638    Py_ssize_t start = 0;
6639    Py_ssize_t end = PY_SSIZE_T_MAX;
6640    PyObject *result;
6641
6642    if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6643		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6644        return NULL;
6645
6646    substring = (PyUnicodeObject *)PyUnicode_FromObject(
6647        (PyObject *)substring);
6648    if (substring == NULL)
6649	return NULL;
6650
6651    FIX_START_END(self);
6652
6653    result = PyLong_FromSsize_t(
6654        stringlib_count(self->str + start, end - start,
6655                        substring->str, substring->length)
6656        );
6657
6658    Py_DECREF(substring);
6659
6660    return result;
6661}
6662
6663PyDoc_STRVAR(encode__doc__,
6664"S.encode([encoding[, errors]]) -> bytes\n\
6665\n\
6666Encode S using the codec registered for encoding. encoding defaults\n\
6667to the default encoding. errors may be given to set a different error\n\
6668handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6669a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6670'xmlcharrefreplace' as well as any other name registered with\n\
6671codecs.register_error that can handle UnicodeEncodeErrors.");
6672
6673static PyObject *
6674unicode_encode(PyUnicodeObject *self, PyObject *args)
6675{
6676    char *encoding = NULL;
6677    char *errors = NULL;
6678    PyObject *v;
6679
6680    if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6681        return NULL;
6682    v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
6683    if (v == NULL)
6684        goto onError;
6685    if (!PyBytes_Check(v)) {
6686        PyErr_Format(PyExc_TypeError,
6687                     "encoder did not return a bytes object "
6688                     "(type=%.400s)",
6689                     Py_TYPE(v)->tp_name);
6690        Py_DECREF(v);
6691        return NULL;
6692    }
6693    return v;
6694
6695 onError:
6696    return NULL;
6697}
6698
6699PyDoc_STRVAR(expandtabs__doc__,
6700"S.expandtabs([tabsize]) -> str\n\
6701\n\
6702Return a copy of S where all tab characters are expanded using spaces.\n\
6703If tabsize is not given, a tab size of 8 characters is assumed.");
6704
6705static PyObject*
6706unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6707{
6708    Py_UNICODE *e;
6709    Py_UNICODE *p;
6710    Py_UNICODE *q;
6711    Py_UNICODE *qe;
6712    Py_ssize_t i, j, incr;
6713    PyUnicodeObject *u;
6714    int tabsize = 8;
6715
6716    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6717	return NULL;
6718
6719    /* First pass: determine size of output string */
6720    i = 0; /* chars up to and including most recent \n or \r */
6721    j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6722    e = self->str + self->length; /* end of input */
6723    for (p = self->str; p < e; p++)
6724        if (*p == '\t') {
6725	    if (tabsize > 0) {
6726		incr = tabsize - (j % tabsize); /* cannot overflow */
6727		if (j > PY_SSIZE_T_MAX - incr)
6728		    goto overflow1;
6729		j += incr;
6730            }
6731	}
6732        else {
6733	    if (j > PY_SSIZE_T_MAX - 1)
6734		goto overflow1;
6735            j++;
6736            if (*p == '\n' || *p == '\r') {
6737		if (i > PY_SSIZE_T_MAX - j)
6738		    goto overflow1;
6739                i += j;
6740                j = 0;
6741            }
6742        }
6743
6744    if (i > PY_SSIZE_T_MAX - j)
6745	goto overflow1;
6746
6747    /* Second pass: create output string and fill it */
6748    u = _PyUnicode_New(i + j);
6749    if (!u)
6750        return NULL;
6751
6752    j = 0; /* same as in first pass */
6753    q = u->str; /* next output char */
6754    qe = u->str + u->length; /* end of output */
6755
6756    for (p = self->str; p < e; p++)
6757        if (*p == '\t') {
6758	    if (tabsize > 0) {
6759		i = tabsize - (j % tabsize);
6760		j += i;
6761		while (i--) {
6762		    if (q >= qe)
6763			goto overflow2;
6764		    *q++ = ' ';
6765                }
6766	    }
6767	}
6768	else {
6769	    if (q >= qe)
6770		goto overflow2;
6771	    *q++ = *p;
6772            j++;
6773            if (*p == '\n' || *p == '\r')
6774                j = 0;
6775        }
6776
6777    return (PyObject*) u;
6778
6779  overflow2:
6780    Py_DECREF(u);
6781  overflow1:
6782    PyErr_SetString(PyExc_OverflowError, "new string is too long");
6783    return NULL;
6784}
6785
6786PyDoc_STRVAR(find__doc__,
6787"S.find(sub[, start[, end]]) -> int\n\
6788\n\
6789Return the lowest index in S where substring sub is found,\n\
6790such that sub is contained within s[start:end].  Optional\n\
6791arguments start and end are interpreted as in slice notation.\n\
6792\n\
6793Return -1 on failure.");
6794
6795static PyObject *
6796unicode_find(PyUnicodeObject *self, PyObject *args)
6797{
6798    PyObject *substring;
6799    Py_ssize_t start;
6800    Py_ssize_t end;
6801    Py_ssize_t result;
6802
6803    if (!_ParseTupleFinds(args, &substring, &start, &end))
6804        return NULL;
6805
6806    result = stringlib_find_slice(
6807        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6808        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6809        start, end
6810        );
6811
6812    Py_DECREF(substring);
6813
6814    return PyLong_FromSsize_t(result);
6815}
6816
6817static PyObject *
6818unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
6819{
6820    if (index < 0 || index >= self->length) {
6821        PyErr_SetString(PyExc_IndexError, "string index out of range");
6822        return NULL;
6823    }
6824
6825    return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6826}
6827
6828/* Believe it or not, this produces the same value for ASCII strings
6829   as string_hash(). */
6830static long
6831unicode_hash(PyUnicodeObject *self)
6832{
6833    Py_ssize_t len;
6834    Py_UNICODE *p;
6835    long x;
6836
6837    if (self->hash != -1)
6838        return self->hash;
6839    len = Py_SIZE(self);
6840    p = self->str;
6841    x = *p << 7;
6842    while (--len >= 0)
6843        x = (1000003*x) ^ *p++;
6844    x ^= Py_SIZE(self);
6845    if (x == -1)
6846        x = -2;
6847    self->hash = x;
6848    return x;
6849}
6850
6851PyDoc_STRVAR(index__doc__,
6852"S.index(sub[, start[, end]]) -> int\n\
6853\n\
6854Like S.find() but raise ValueError when the substring is not found.");
6855
6856static PyObject *
6857unicode_index(PyUnicodeObject *self, PyObject *args)
6858{
6859    Py_ssize_t result;
6860    PyObject *substring;
6861    Py_ssize_t start;
6862    Py_ssize_t end;
6863
6864    if (!_ParseTupleFinds(args, &substring, &start, &end))
6865        return NULL;
6866
6867    result = stringlib_find_slice(
6868        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6869        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6870        start, end
6871        );
6872
6873    Py_DECREF(substring);
6874
6875    if (result < 0) {
6876        PyErr_SetString(PyExc_ValueError, "substring not found");
6877        return NULL;
6878    }
6879
6880    return PyLong_FromSsize_t(result);
6881}
6882
6883PyDoc_STRVAR(islower__doc__,
6884"S.islower() -> bool\n\
6885\n\
6886Return True if all cased characters in S are lowercase and there is\n\
6887at least one cased character in S, False otherwise.");
6888
6889static PyObject*
6890unicode_islower(PyUnicodeObject *self)
6891{
6892    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6893    register const Py_UNICODE *e;
6894    int cased;
6895
6896    /* Shortcut for single character strings */
6897    if (PyUnicode_GET_SIZE(self) == 1)
6898	return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
6899
6900    /* Special case for empty strings */
6901    if (PyUnicode_GET_SIZE(self) == 0)
6902	return PyBool_FromLong(0);
6903
6904    e = p + PyUnicode_GET_SIZE(self);
6905    cased = 0;
6906    for (; p < e; p++) {
6907	register const Py_UNICODE ch = *p;
6908
6909	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6910	    return PyBool_FromLong(0);
6911	else if (!cased && Py_UNICODE_ISLOWER(ch))
6912	    cased = 1;
6913    }
6914    return PyBool_FromLong(cased);
6915}
6916
6917PyDoc_STRVAR(isupper__doc__,
6918"S.isupper() -> bool\n\
6919\n\
6920Return True if all cased characters in S are uppercase and there is\n\
6921at least one cased character in S, False otherwise.");
6922
6923static PyObject*
6924unicode_isupper(PyUnicodeObject *self)
6925{
6926    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6927    register const Py_UNICODE *e;
6928    int cased;
6929
6930    /* Shortcut for single character strings */
6931    if (PyUnicode_GET_SIZE(self) == 1)
6932	return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
6933
6934    /* Special case for empty strings */
6935    if (PyUnicode_GET_SIZE(self) == 0)
6936	return PyBool_FromLong(0);
6937
6938    e = p + PyUnicode_GET_SIZE(self);
6939    cased = 0;
6940    for (; p < e; p++) {
6941	register const Py_UNICODE ch = *p;
6942
6943	if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6944	    return PyBool_FromLong(0);
6945	else if (!cased && Py_UNICODE_ISUPPER(ch))
6946	    cased = 1;
6947    }
6948    return PyBool_FromLong(cased);
6949}
6950
6951PyDoc_STRVAR(istitle__doc__,
6952"S.istitle() -> bool\n\
6953\n\
6954Return True if S is a titlecased string and there is at least one\n\
6955character in S, i.e. upper- and titlecase characters may only\n\
6956follow uncased characters and lowercase characters only cased ones.\n\
6957Return False otherwise.");
6958
6959static PyObject*
6960unicode_istitle(PyUnicodeObject *self)
6961{
6962    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6963    register const Py_UNICODE *e;
6964    int cased, previous_is_cased;
6965
6966    /* Shortcut for single character strings */
6967    if (PyUnicode_GET_SIZE(self) == 1)
6968	return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6969			       (Py_UNICODE_ISUPPER(*p) != 0));
6970
6971    /* Special case for empty strings */
6972    if (PyUnicode_GET_SIZE(self) == 0)
6973	return PyBool_FromLong(0);
6974
6975    e = p + PyUnicode_GET_SIZE(self);
6976    cased = 0;
6977    previous_is_cased = 0;
6978    for (; p < e; p++) {
6979	register const Py_UNICODE ch = *p;
6980
6981	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6982	    if (previous_is_cased)
6983		return PyBool_FromLong(0);
6984	    previous_is_cased = 1;
6985	    cased = 1;
6986	}
6987	else if (Py_UNICODE_ISLOWER(ch)) {
6988	    if (!previous_is_cased)
6989		return PyBool_FromLong(0);
6990	    previous_is_cased = 1;
6991	    cased = 1;
6992	}
6993	else
6994	    previous_is_cased = 0;
6995    }
6996    return PyBool_FromLong(cased);
6997}
6998
6999PyDoc_STRVAR(isspace__doc__,
7000"S.isspace() -> bool\n\
7001\n\
7002Return True if all characters in S are whitespace\n\
7003and there is at least one character in S, False otherwise.");
7004
7005static PyObject*
7006unicode_isspace(PyUnicodeObject *self)
7007{
7008    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7009    register const Py_UNICODE *e;
7010
7011    /* Shortcut for single character strings */
7012    if (PyUnicode_GET_SIZE(self) == 1 &&
7013	Py_UNICODE_ISSPACE(*p))
7014	return PyBool_FromLong(1);
7015
7016    /* Special case for empty strings */
7017    if (PyUnicode_GET_SIZE(self) == 0)
7018	return PyBool_FromLong(0);
7019
7020    e = p + PyUnicode_GET_SIZE(self);
7021    for (; p < e; p++) {
7022	if (!Py_UNICODE_ISSPACE(*p))
7023	    return PyBool_FromLong(0);
7024    }
7025    return PyBool_FromLong(1);
7026}
7027
7028PyDoc_STRVAR(isalpha__doc__,
7029"S.isalpha() -> bool\n\
7030\n\
7031Return True if all characters in S are alphabetic\n\
7032and there is at least one character in S, False otherwise.");
7033
7034static PyObject*
7035unicode_isalpha(PyUnicodeObject *self)
7036{
7037    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7038    register const Py_UNICODE *e;
7039
7040    /* Shortcut for single character strings */
7041    if (PyUnicode_GET_SIZE(self) == 1 &&
7042	Py_UNICODE_ISALPHA(*p))
7043	return PyBool_FromLong(1);
7044
7045    /* Special case for empty strings */
7046    if (PyUnicode_GET_SIZE(self) == 0)
7047	return PyBool_FromLong(0);
7048
7049    e = p + PyUnicode_GET_SIZE(self);
7050    for (; p < e; p++) {
7051	if (!Py_UNICODE_ISALPHA(*p))
7052	    return PyBool_FromLong(0);
7053    }
7054    return PyBool_FromLong(1);
7055}
7056
7057PyDoc_STRVAR(isalnum__doc__,
7058"S.isalnum() -> bool\n\
7059\n\
7060Return True if all characters in S are alphanumeric\n\
7061and there is at least one character in S, False otherwise.");
7062
7063static PyObject*
7064unicode_isalnum(PyUnicodeObject *self)
7065{
7066    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7067    register const Py_UNICODE *e;
7068
7069    /* Shortcut for single character strings */
7070    if (PyUnicode_GET_SIZE(self) == 1 &&
7071	Py_UNICODE_ISALNUM(*p))
7072	return PyBool_FromLong(1);
7073
7074    /* Special case for empty strings */
7075    if (PyUnicode_GET_SIZE(self) == 0)
7076	return PyBool_FromLong(0);
7077
7078    e = p + PyUnicode_GET_SIZE(self);
7079    for (; p < e; p++) {
7080	if (!Py_UNICODE_ISALNUM(*p))
7081	    return PyBool_FromLong(0);
7082    }
7083    return PyBool_FromLong(1);
7084}
7085
7086PyDoc_STRVAR(isdecimal__doc__,
7087"S.isdecimal() -> bool\n\
7088\n\
7089Return True if there are only decimal characters in S,\n\
7090False otherwise.");
7091
7092static PyObject*
7093unicode_isdecimal(PyUnicodeObject *self)
7094{
7095    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7096    register const Py_UNICODE *e;
7097
7098    /* Shortcut for single character strings */
7099    if (PyUnicode_GET_SIZE(self) == 1 &&
7100	Py_UNICODE_ISDECIMAL(*p))
7101	return PyBool_FromLong(1);
7102
7103    /* Special case for empty strings */
7104    if (PyUnicode_GET_SIZE(self) == 0)
7105	return PyBool_FromLong(0);
7106
7107    e = p + PyUnicode_GET_SIZE(self);
7108    for (; p < e; p++) {
7109	if (!Py_UNICODE_ISDECIMAL(*p))
7110	    return PyBool_FromLong(0);
7111    }
7112    return PyBool_FromLong(1);
7113}
7114
7115PyDoc_STRVAR(isdigit__doc__,
7116"S.isdigit() -> bool\n\
7117\n\
7118Return True if all characters in S are digits\n\
7119and there is at least one character in S, False otherwise.");
7120
7121static PyObject*
7122unicode_isdigit(PyUnicodeObject *self)
7123{
7124    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7125    register const Py_UNICODE *e;
7126
7127    /* Shortcut for single character strings */
7128    if (PyUnicode_GET_SIZE(self) == 1 &&
7129	Py_UNICODE_ISDIGIT(*p))
7130	return PyBool_FromLong(1);
7131
7132    /* Special case for empty strings */
7133    if (PyUnicode_GET_SIZE(self) == 0)
7134	return PyBool_FromLong(0);
7135
7136    e = p + PyUnicode_GET_SIZE(self);
7137    for (; p < e; p++) {
7138	if (!Py_UNICODE_ISDIGIT(*p))
7139	    return PyBool_FromLong(0);
7140    }
7141    return PyBool_FromLong(1);
7142}
7143
7144PyDoc_STRVAR(isnumeric__doc__,
7145"S.isnumeric() -> bool\n\
7146\n\
7147Return True if there are only numeric characters in S,\n\
7148False otherwise.");
7149
7150static PyObject*
7151unicode_isnumeric(PyUnicodeObject *self)
7152{
7153    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7154    register const Py_UNICODE *e;
7155
7156    /* Shortcut for single character strings */
7157    if (PyUnicode_GET_SIZE(self) == 1 &&
7158	Py_UNICODE_ISNUMERIC(*p))
7159	return PyBool_FromLong(1);
7160
7161    /* Special case for empty strings */
7162    if (PyUnicode_GET_SIZE(self) == 0)
7163	return PyBool_FromLong(0);
7164
7165    e = p + PyUnicode_GET_SIZE(self);
7166    for (; p < e; p++) {
7167	if (!Py_UNICODE_ISNUMERIC(*p))
7168	    return PyBool_FromLong(0);
7169    }
7170    return PyBool_FromLong(1);
7171}
7172
7173int
7174PyUnicode_IsIdentifier(PyObject *self)
7175{
7176    register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7177    register const Py_UNICODE *e;
7178
7179    /* Special case for empty strings */
7180    if (PyUnicode_GET_SIZE(self) == 0)
7181	return 0;
7182
7183    /* PEP 3131 says that the first character must be in
7184       XID_Start and subsequent characters in XID_Continue,
7185       and for the ASCII range, the 2.x rules apply (i.e
7186       start with letters and underscore, continue with
7187       letters, digits, underscore). However, given the current
7188       definition of XID_Start and XID_Continue, it is sufficient
7189       to check just for these, except that _ must be allowed
7190       as starting an identifier.  */
7191    if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7192        return 0;
7193
7194    e = p + PyUnicode_GET_SIZE(self);
7195    for (p++; p < e; p++) {
7196	if (!_PyUnicode_IsXidContinue(*p))
7197	    return 0;
7198    }
7199    return 1;
7200}
7201
7202PyDoc_STRVAR(isidentifier__doc__,
7203"S.isidentifier() -> bool\n\
7204\n\
7205Return True if S is a valid identifier according\n\
7206to the language definition.");
7207
7208static PyObject*
7209unicode_isidentifier(PyObject *self)
7210{
7211    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7212}
7213
7214PyDoc_STRVAR(isprintable__doc__,
7215"S.isprintable() -> bool\n\
7216\n\
7217Return True if all characters in S are considered\n\
7218printable in repr() or S is empty, False otherwise.");
7219
7220static PyObject*
7221unicode_isprintable(PyObject *self)
7222{
7223    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7224    register const Py_UNICODE *e;
7225
7226    /* Shortcut for single character strings */
7227    if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7228        Py_RETURN_TRUE;
7229    }
7230
7231    e = p + PyUnicode_GET_SIZE(self);
7232    for (; p < e; p++) {
7233        if (!Py_UNICODE_ISPRINTABLE(*p)) {
7234            Py_RETURN_FALSE;
7235        }
7236    }
7237    Py_RETURN_TRUE;
7238}
7239
7240PyDoc_STRVAR(join__doc__,
7241"S.join(sequence) -> str\n\
7242\n\
7243Return a string which is the concatenation of the strings in the\n\
7244sequence.  The separator between elements is S.");
7245
7246static PyObject*
7247unicode_join(PyObject *self, PyObject *data)
7248{
7249    return PyUnicode_Join(self, data);
7250}
7251
7252static Py_ssize_t
7253unicode_length(PyUnicodeObject *self)
7254{
7255    return self->length;
7256}
7257
7258PyDoc_STRVAR(ljust__doc__,
7259"S.ljust(width[, fillchar]) -> str\n\
7260\n\
7261Return S left justified in a Unicode string of length width. Padding is\n\
7262done using the specified fill character (default is a space).");
7263
7264static PyObject *
7265unicode_ljust(PyUnicodeObject *self, PyObject *args)
7266{
7267    Py_ssize_t width;
7268    Py_UNICODE fillchar = ' ';
7269
7270    if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
7271        return NULL;
7272
7273    if (self->length >= width && PyUnicode_CheckExact(self)) {
7274        Py_INCREF(self);
7275        return (PyObject*) self;
7276    }
7277
7278    return (PyObject*) pad(self, 0, width - self->length, fillchar);
7279}
7280
7281PyDoc_STRVAR(lower__doc__,
7282"S.lower() -> str\n\
7283\n\
7284Return a copy of the string S converted to lowercase.");
7285
7286static PyObject*
7287unicode_lower(PyUnicodeObject *self)
7288{
7289    return fixup(self, fixlower);
7290}
7291
/* Strip modes; they double as indexes into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, one per strip mode (indexed by the
   constants above). */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip mode: the format string with its three leading
   characters ("|O:") skipped. */
#define STRIPNAME(i) (stripformat[i] + 3)
7300
7301/* externally visible for str.strip(unicode) */
7302PyObject *
7303_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7304{
7305	Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7306	Py_ssize_t len = PyUnicode_GET_SIZE(self);
7307	Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7308	Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7309	Py_ssize_t i, j;
7310
7311        BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7312
7313	i = 0;
7314	if (striptype != RIGHTSTRIP) {
7315            while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7316                i++;
7317            }
7318	}
7319
7320	j = len;
7321	if (striptype != LEFTSTRIP) {
7322            do {
7323                j--;
7324            } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7325            j++;
7326	}
7327
7328	if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7329            Py_INCREF(self);
7330            return (PyObject*)self;
7331	}
7332	else
7333            return PyUnicode_FromUnicode(s+i, j-i);
7334}
7335
7336
7337static PyObject *
7338do_strip(PyUnicodeObject *self, int striptype)
7339{
7340	Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7341	Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
7342
7343	i = 0;
7344	if (striptype != RIGHTSTRIP) {
7345		while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7346			i++;
7347		}
7348	}
7349
7350	j = len;
7351	if (striptype != LEFTSTRIP) {
7352		do {
7353			j--;
7354		} while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7355		j++;
7356	}
7357
7358	if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7359		Py_INCREF(self);
7360		return (PyObject*)self;
7361	}
7362	else
7363		return PyUnicode_FromUnicode(s+i, j-i);
7364}
7365
7366
7367static PyObject *
7368do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7369{
7370	PyObject *sep = NULL;
7371
7372	if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7373		return NULL;
7374
7375	if (sep != NULL && sep != Py_None) {
7376		if (PyUnicode_Check(sep))
7377			return _PyUnicode_XStrip(self, striptype, sep);
7378		else {
7379			PyErr_Format(PyExc_TypeError,
7380				     "%s arg must be None or str",
7381				     STRIPNAME(striptype));
7382			return NULL;
7383		}
7384	}
7385
7386	return do_strip(self, striptype);
7387}
7388
7389
7390PyDoc_STRVAR(strip__doc__,
7391"S.strip([chars]) -> str\n\
7392\n\
7393Return a copy of the string S with leading and trailing\n\
7394whitespace removed.\n\
7395If chars is given and not None, remove characters in chars instead.");
7396
7397static PyObject *
7398unicode_strip(PyUnicodeObject *self, PyObject *args)
7399{
7400	if (PyTuple_GET_SIZE(args) == 0)
7401		return do_strip(self, BOTHSTRIP); /* Common case */
7402	else
7403		return do_argstrip(self, BOTHSTRIP, args);
7404}
7405
7406
7407PyDoc_STRVAR(lstrip__doc__,
7408"S.lstrip([chars]) -> str\n\
7409\n\
7410Return a copy of the string S with leading whitespace removed.\n\
7411If chars is given and not None, remove characters in chars instead.");
7412
7413static PyObject *
7414unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7415{
7416	if (PyTuple_GET_SIZE(args) == 0)
7417		return do_strip(self, LEFTSTRIP); /* Common case */
7418	else
7419		return do_argstrip(self, LEFTSTRIP, args);
7420}
7421
7422
7423PyDoc_STRVAR(rstrip__doc__,
7424"S.rstrip([chars]) -> str\n\
7425\n\
7426Return a copy of the string S with trailing whitespace removed.\n\
7427If chars is given and not None, remove characters in chars instead.");
7428
7429static PyObject *
7430unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7431{
7432	if (PyTuple_GET_SIZE(args) == 0)
7433		return do_strip(self, RIGHTSTRIP); /* Common case */
7434	else
7435		return do_argstrip(self, RIGHTSTRIP, args);
7436}
7437
7438
7439static PyObject*
7440unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
7441{
7442    PyUnicodeObject *u;
7443    Py_UNICODE *p;
7444    Py_ssize_t nchars;
7445    size_t nbytes;
7446
7447    if (len < 0)
7448        len = 0;
7449
7450    if (len == 1 && PyUnicode_CheckExact(str)) {
7451        /* no repeat, return original string */
7452        Py_INCREF(str);
7453        return (PyObject*) str;
7454    }
7455
7456    /* ensure # of chars needed doesn't overflow int and # of bytes
7457     * needed doesn't overflow size_t
7458     */
7459    nchars = len * str->length;
7460    if (len && nchars / len != str->length) {
7461        PyErr_SetString(PyExc_OverflowError,
7462                        "repeated string is too long");
7463        return NULL;
7464    }
7465    nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7466    if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7467        PyErr_SetString(PyExc_OverflowError,
7468                        "repeated string is too long");
7469        return NULL;
7470    }
7471    u = _PyUnicode_New(nchars);
7472    if (!u)
7473        return NULL;
7474
7475    p = u->str;
7476
7477    if (str->length == 1 && len > 0) {
7478        Py_UNICODE_FILL(p, str->str[0], len);
7479    } else {
7480	Py_ssize_t done = 0; /* number of characters copied this far */
7481	if (done < nchars) {
7482            Py_UNICODE_COPY(p, str->str, str->length);
7483            done = str->length;
7484	}
7485	while (done < nchars) {
7486            Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
7487            Py_UNICODE_COPY(p+done, p, n);
7488            done += n;
7489	}
7490    }
7491
7492    return (PyObject*) u;
7493}
7494
7495PyObject *PyUnicode_Replace(PyObject *obj,
7496			    PyObject *subobj,
7497			    PyObject *replobj,
7498			    Py_ssize_t maxcount)
7499{
7500    PyObject *self;
7501    PyObject *str1;
7502    PyObject *str2;
7503    PyObject *result;
7504
7505    self = PyUnicode_FromObject(obj);
7506    if (self == NULL)
7507	return NULL;
7508    str1 = PyUnicode_FromObject(subobj);
7509    if (str1 == NULL) {
7510	Py_DECREF(self);
7511	return NULL;
7512    }
7513    str2 = PyUnicode_FromObject(replobj);
7514    if (str2 == NULL) {
7515	Py_DECREF(self);
7516	Py_DECREF(str1);
7517	return NULL;
7518    }
7519    result = replace((PyUnicodeObject *)self,
7520		     (PyUnicodeObject *)str1,
7521		     (PyUnicodeObject *)str2,
7522		     maxcount);
7523    Py_DECREF(self);
7524    Py_DECREF(str1);
7525    Py_DECREF(str2);
7526    return result;
7527}
7528
7529PyDoc_STRVAR(replace__doc__,
7530"S.replace (old, new[, count]) -> str\n\
7531\n\
7532Return a copy of S with all occurrences of substring\n\
7533old replaced by new.  If the optional argument count is\n\
7534given, only the first count occurrences are replaced.");
7535
7536static PyObject*
7537unicode_replace(PyUnicodeObject *self, PyObject *args)
7538{
7539    PyUnicodeObject *str1;
7540    PyUnicodeObject *str2;
7541    Py_ssize_t maxcount = -1;
7542    PyObject *result;
7543
7544    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
7545        return NULL;
7546    str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7547    if (str1 == NULL)
7548	return NULL;
7549    str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
7550    if (str2 == NULL) {
7551	Py_DECREF(str1);
7552	return NULL;
7553    }
7554
7555    result = replace(self, str1, str2, maxcount);
7556
7557    Py_DECREF(str1);
7558    Py_DECREF(str2);
7559    return result;
7560}
7561
7562static
7563PyObject *unicode_repr(PyObject *unicode)
7564{
7565    PyObject *repr;
7566    Py_UNICODE *p;
7567    Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7568    Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7569
7570    /* XXX(nnorwitz): rather than over-allocating, it would be
7571       better to choose a different scheme.  Perhaps scan the
7572       first N-chars of the string and allocate based on that size.
7573    */
7574    /* Initial allocation is based on the longest-possible unichr
7575       escape.
7576
7577       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7578       unichr, so in this case it's the longest unichr escape. In
7579       narrow (UTF-16) builds this is five chars per source unichr
7580       since there are two unichrs in the surrogate pair, so in narrow
7581       (UTF-16) builds it's not the longest unichr escape.
7582
7583       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7584       so in the narrow (UTF-16) build case it's the longest unichr
7585       escape.
7586    */
7587
7588    repr = PyUnicode_FromUnicode(NULL,
7589        2 /* quotes */
7590#ifdef Py_UNICODE_WIDE
7591        + 10*size
7592#else
7593        + 6*size
7594#endif
7595        + 1);
7596    if (repr == NULL)
7597        return NULL;
7598
7599    p = PyUnicode_AS_UNICODE(repr);
7600
7601    /* Add quote */
7602    *p++ = (findchar(s, size, '\'') &&
7603            !findchar(s, size, '"')) ? '"' : '\'';
7604    while (size-- > 0) {
7605        Py_UNICODE ch = *s++;
7606
7607        /* Escape quotes and backslashes */
7608        if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
7609            *p++ = '\\';
7610            *p++ = ch;
7611            continue;
7612        }
7613
7614	/* Map special whitespace to '\t', \n', '\r' */
7615        if (ch == '\t') {
7616            *p++ = '\\';
7617            *p++ = 't';
7618        }
7619        else if (ch == '\n') {
7620            *p++ = '\\';
7621            *p++ = 'n';
7622        }
7623        else if (ch == '\r') {
7624            *p++ = '\\';
7625            *p++ = 'r';
7626        }
7627
7628        /* Map non-printable US ASCII to '\xhh' */
7629        else if (ch < ' ' || ch == 0x7F) {
7630            *p++ = '\\';
7631            *p++ = 'x';
7632            *p++ = hexdigits[(ch >> 4) & 0x000F];
7633            *p++ = hexdigits[ch & 0x000F];
7634        }
7635
7636        /* Copy ASCII characters as-is */
7637        else if (ch < 0x7F) {
7638            *p++ = ch;
7639        }
7640
7641	/* Non-ASCII characters */
7642        else {
7643            Py_UCS4 ucs = ch;
7644
7645#ifndef Py_UNICODE_WIDE
7646            Py_UNICODE ch2 = 0;
7647            /* Get code point from surrogate pair */
7648            if (size > 0) {
7649                ch2 = *s;
7650                if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
7651                            && ch2 <= 0xDFFF) {
7652                    ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
7653                            + 0x00010000;
7654                    s++;
7655                    size--;
7656                }
7657            }
7658#endif
7659            /* Map Unicode whitespace and control characters
7660               (categories Z* and C* except ASCII space)
7661            */
7662            if (!Py_UNICODE_ISPRINTABLE(ucs)) {
7663                /* Map 8-bit characters to '\xhh' */
7664                if (ucs <= 0xff) {
7665                    *p++ = '\\';
7666                    *p++ = 'x';
7667                    *p++ = hexdigits[(ch >> 4) & 0x000F];
7668                    *p++ = hexdigits[ch & 0x000F];
7669                }
7670                /* Map 21-bit characters to '\U00xxxxxx' */
7671                else if (ucs >= 0x10000) {
7672                    *p++ = '\\';
7673                    *p++ = 'U';
7674                    *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7675                    *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7676                    *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7677                    *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7678                    *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7679                    *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7680                    *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7681                    *p++ = hexdigits[ucs & 0x0000000F];
7682                }
7683                /* Map 16-bit characters to '\uxxxx' */
7684                else {
7685                    *p++ = '\\';
7686                    *p++ = 'u';
7687                    *p++ = hexdigits[(ucs >> 12) & 0x000F];
7688                    *p++ = hexdigits[(ucs >> 8) & 0x000F];
7689                    *p++ = hexdigits[(ucs >> 4) & 0x000F];
7690                    *p++ = hexdigits[ucs & 0x000F];
7691                }
7692            }
7693            /* Copy characters as-is */
7694            else {
7695                *p++ = ch;
7696#ifndef Py_UNICODE_WIDE
7697                if (ucs >= 0x10000)
7698                    *p++ = ch2;
7699#endif
7700            }
7701        }
7702    }
7703    /* Add quote */
7704    *p++ = PyUnicode_AS_UNICODE(repr)[0];
7705
7706    *p = '\0';
7707    _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
7708    return repr;
7709}
7710
7711PyDoc_STRVAR(rfind__doc__,
7712"S.rfind(sub[, start[, end]]) -> int\n\
7713\n\
7714Return the highest index in S where substring sub is found,\n\
7715such that sub is contained within s[start:end].  Optional\n\
7716arguments start and end are interpreted as in slice notation.\n\
7717\n\
7718Return -1 on failure.");
7719
7720static PyObject *
7721unicode_rfind(PyUnicodeObject *self, PyObject *args)
7722{
7723    PyObject *substring;
7724    Py_ssize_t start;
7725    Py_ssize_t end;
7726    Py_ssize_t result;
7727
7728    if (!_ParseTupleFinds(args, &substring, &start, &end))
7729	    return NULL;
7730
7731    result = stringlib_rfind_slice(
7732        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7733        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7734        start, end
7735        );
7736
7737    Py_DECREF(substring);
7738
7739    return PyLong_FromSsize_t(result);
7740}
7741
7742PyDoc_STRVAR(rindex__doc__,
7743"S.rindex(sub[, start[, end]]) -> int\n\
7744\n\
7745Like S.rfind() but raise ValueError when the substring is not found.");
7746
7747static PyObject *
7748unicode_rindex(PyUnicodeObject *self, PyObject *args)
7749{
7750    PyObject *substring;
7751    Py_ssize_t start;
7752    Py_ssize_t end;
7753    Py_ssize_t result;
7754
7755    if (!_ParseTupleFinds(args, &substring, &start, &end))
7756	    return NULL;
7757
7758    result = stringlib_rfind_slice(
7759        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7760        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7761        start, end
7762        );
7763
7764    Py_DECREF(substring);
7765
7766    if (result < 0) {
7767        PyErr_SetString(PyExc_ValueError, "substring not found");
7768        return NULL;
7769    }
7770    return PyLong_FromSsize_t(result);
7771}
7772
7773PyDoc_STRVAR(rjust__doc__,
7774"S.rjust(width[, fillchar]) -> str\n\
7775\n\
7776Return S right justified in a string of length width. Padding is\n\
7777done using the specified fill character (default is a space).");
7778
7779static PyObject *
7780unicode_rjust(PyUnicodeObject *self, PyObject *args)
7781{
7782    Py_ssize_t width;
7783    Py_UNICODE fillchar = ' ';
7784
7785    if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
7786        return NULL;
7787
7788    if (self->length >= width && PyUnicode_CheckExact(self)) {
7789        Py_INCREF(self);
7790        return (PyObject*) self;
7791    }
7792
7793    return (PyObject*) pad(self, width - self->length, 0, fillchar);
7794}
7795
7796PyObject *PyUnicode_Split(PyObject *s,
7797			  PyObject *sep,
7798			  Py_ssize_t maxsplit)
7799{
7800    PyObject *result;
7801
7802    s = PyUnicode_FromObject(s);
7803    if (s == NULL)
7804	return NULL;
7805    if (sep != NULL) {
7806	sep = PyUnicode_FromObject(sep);
7807	if (sep == NULL) {
7808	    Py_DECREF(s);
7809	    return NULL;
7810	}
7811    }
7812
7813    result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7814
7815    Py_DECREF(s);
7816    Py_XDECREF(sep);
7817    return result;
7818}
7819
7820PyDoc_STRVAR(split__doc__,
7821"S.split([sep[, maxsplit]]) -> list of strings\n\
7822\n\
7823Return a list of the words in S, using sep as the\n\
7824delimiter string.  If maxsplit is given, at most maxsplit\n\
7825splits are done. If sep is not specified or is None, any\n\
7826whitespace string is a separator and empty strings are\n\
7827removed from the result.");
7828
7829static PyObject*
7830unicode_split(PyUnicodeObject *self, PyObject *args)
7831{
7832    PyObject *substring = Py_None;
7833    Py_ssize_t maxcount = -1;
7834
7835    if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
7836        return NULL;
7837
7838    if (substring == Py_None)
7839	return split(self, NULL, maxcount);
7840    else if (PyUnicode_Check(substring))
7841	return split(self, (PyUnicodeObject *)substring, maxcount);
7842    else
7843	return PyUnicode_Split((PyObject *)self, substring, maxcount);
7844}
7845
7846PyObject *
7847PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7848{
7849    PyObject* str_obj;
7850    PyObject* sep_obj;
7851    PyObject* out;
7852
7853    str_obj = PyUnicode_FromObject(str_in);
7854    if (!str_obj)
7855	return NULL;
7856    sep_obj = PyUnicode_FromObject(sep_in);
7857    if (!sep_obj) {
7858        Py_DECREF(str_obj);
7859        return NULL;
7860    }
7861
7862    out = stringlib_partition(
7863        str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7864        sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7865        );
7866
7867    Py_DECREF(sep_obj);
7868    Py_DECREF(str_obj);
7869
7870    return out;
7871}
7872
7873
7874PyObject *
7875PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7876{
7877    PyObject* str_obj;
7878    PyObject* sep_obj;
7879    PyObject* out;
7880
7881    str_obj = PyUnicode_FromObject(str_in);
7882    if (!str_obj)
7883	return NULL;
7884    sep_obj = PyUnicode_FromObject(sep_in);
7885    if (!sep_obj) {
7886        Py_DECREF(str_obj);
7887        return NULL;
7888    }
7889
7890    out = stringlib_rpartition(
7891        str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7892        sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7893        );
7894
7895    Py_DECREF(sep_obj);
7896    Py_DECREF(str_obj);
7897
7898    return out;
7899}
7900
7901PyDoc_STRVAR(partition__doc__,
7902"S.partition(sep) -> (head, sep, tail)\n\
7903\n\
7904Search for the separator sep in S, and return the part before it,\n\
7905the separator itself, and the part after it.  If the separator is not\n\
7906found, returns S and two empty strings.");
7907
7908static PyObject*
7909unicode_partition(PyUnicodeObject *self, PyObject *separator)
7910{
7911    return PyUnicode_Partition((PyObject *)self, separator);
7912}
7913
7914PyDoc_STRVAR(rpartition__doc__,
7915"S.rpartition(sep) -> (tail, sep, head)\n\
7916\n\
7917Search for the separator sep in S, starting at the end of S, and return\n\
7918the part before it, the separator itself, and the part after it.  If the\n\
7919separator is not found, returns two empty strings and S.");
7920
7921static PyObject*
7922unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7923{
7924    return PyUnicode_RPartition((PyObject *)self, separator);
7925}
7926
7927PyObject *PyUnicode_RSplit(PyObject *s,
7928			   PyObject *sep,
7929			   Py_ssize_t maxsplit)
7930{
7931    PyObject *result;
7932
7933    s = PyUnicode_FromObject(s);
7934    if (s == NULL)
7935	return NULL;
7936    if (sep != NULL) {
7937	sep = PyUnicode_FromObject(sep);
7938	if (sep == NULL) {
7939	    Py_DECREF(s);
7940	    return NULL;
7941	}
7942    }
7943
7944    result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7945
7946    Py_DECREF(s);
7947    Py_XDECREF(sep);
7948    return result;
7949}
7950
7951PyDoc_STRVAR(rsplit__doc__,
7952"S.rsplit([sep[, maxsplit]]) -> list of strings\n\
7953\n\
7954Return a list of the words in S, using sep as the\n\
7955delimiter string, starting at the end of the string and\n\
7956working to the front.  If maxsplit is given, at most maxsplit\n\
7957splits are done. If sep is not specified, any whitespace string\n\
7958is a separator.");
7959
7960static PyObject*
7961unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7962{
7963    PyObject *substring = Py_None;
7964    Py_ssize_t maxcount = -1;
7965
7966    if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
7967        return NULL;
7968
7969    if (substring == Py_None)
7970	return rsplit(self, NULL, maxcount);
7971    else if (PyUnicode_Check(substring))
7972	return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7973    else
7974	return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7975}
7976
7977PyDoc_STRVAR(splitlines__doc__,
7978"S.splitlines([keepends]]) -> list of strings\n\
7979\n\
7980Return a list of the lines in S, breaking at line boundaries.\n\
7981Line breaks are not included in the resulting list unless keepends\n\
7982is given and true.");
7983
7984static PyObject*
7985unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7986{
7987    int keepends = 0;
7988
7989    if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
7990        return NULL;
7991
7992    return PyUnicode_Splitlines((PyObject *)self, keepends);
7993}
7994
7995static
7996PyObject *unicode_str(PyObject *self)
7997{
7998    if (PyUnicode_CheckExact(self)) {
7999        Py_INCREF(self);
8000        return self;
8001    } else
8002        /* Subtype -- return genuine unicode string with the same value. */
8003        return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8004                                     PyUnicode_GET_SIZE(self));
8005}
8006
8007PyDoc_STRVAR(swapcase__doc__,
8008"S.swapcase() -> str\n\
8009\n\
8010Return a copy of S with uppercase characters converted to lowercase\n\
8011and vice versa.");
8012
8013static PyObject*
8014unicode_swapcase(PyUnicodeObject *self)
8015{
8016    return fixup(self, fixswapcase);
8017}
8018
8019PyDoc_STRVAR(maketrans__doc__,
8020"str.maketrans(x[, y[, z]]) -> dict (static method)\n\
8021\n\
8022Return a translation table usable for str.translate().\n\
8023If there is only one argument, it must be a dictionary mapping Unicode\n\
8024ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
8025Character keys will be then converted to ordinals.\n\
8026If there are two arguments, they must be strings of equal length, and\n\
8027in the resulting dictionary, each character in x will be mapped to the\n\
8028character at the same position in y. If there is a third argument, it\n\
8029must be a string, whose characters will be mapped to None in the result.");
8030
8031static PyObject*
8032unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8033{
8034    PyObject *x, *y = NULL, *z = NULL;
8035    PyObject *new = NULL, *key, *value;
8036    Py_ssize_t i = 0;
8037    int res;
8038
8039    if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8040        return NULL;
8041    new = PyDict_New();
8042    if (!new)
8043        return NULL;
8044    if (y != NULL) {
8045        /* x must be a string too, of equal length */
8046        Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8047        if (!PyUnicode_Check(x)) {
8048            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8049                            "be a string if there is a second argument");
8050            goto err;
8051        }
8052        if (PyUnicode_GET_SIZE(x) != ylen) {
8053            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8054                            "arguments must have equal length");
8055            goto err;
8056        }
8057        /* create entries for translating chars in x to those in y */
8058        for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
8059            key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8060            value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
8061            if (!key || !value)
8062                goto err;
8063            res = PyDict_SetItem(new, key, value);
8064            Py_DECREF(key);
8065            Py_DECREF(value);
8066            if (res < 0)
8067                goto err;
8068        }
8069        /* create entries for deleting chars in z */
8070        if (z != NULL) {
8071            for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
8072                key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
8073                if (!key)
8074                    goto err;
8075                res = PyDict_SetItem(new, key, Py_None);
8076                Py_DECREF(key);
8077                if (res < 0)
8078                    goto err;
8079            }
8080        }
8081    } else {
8082        /* x must be a dict */
8083        if (!PyDict_Check(x)) {
8084            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8085                            "to maketrans it must be a dict");
8086            goto err;
8087        }
8088        /* copy entries into the new dict, converting string keys to int keys */
8089        while (PyDict_Next(x, &i, &key, &value)) {
8090            if (PyUnicode_Check(key)) {
8091                /* convert string keys to integer keys */
8092                PyObject *newkey;
8093                if (PyUnicode_GET_SIZE(key) != 1) {
8094                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
8095                                    "table must be of length 1");
8096                    goto err;
8097                }
8098                newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
8099                if (!newkey)
8100                    goto err;
8101                res = PyDict_SetItem(new, newkey, value);
8102                Py_DECREF(newkey);
8103                if (res < 0)
8104                    goto err;
8105            } else if (PyLong_Check(key)) {
8106                /* just keep integer keys */
8107                if (PyDict_SetItem(new, key, value) < 0)
8108                    goto err;
8109            } else {
8110                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8111                                "be strings or integers");
8112                goto err;
8113            }
8114        }
8115    }
8116    return new;
8117  err:
8118    Py_DECREF(new);
8119    return NULL;
8120}
8121
8122PyDoc_STRVAR(translate__doc__,
8123"S.translate(table) -> str\n\
8124\n\
8125Return a copy of the string S, where all characters have been mapped\n\
8126through the given translation table, which must be a mapping of\n\
8127Unicode ordinals to Unicode ordinals, strings, or None.\n\
8128Unmapped characters are left untouched. Characters mapped to None\n\
8129are deleted.");
8130
8131static PyObject*
8132unicode_translate(PyUnicodeObject *self, PyObject *table)
8133{
8134    return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
8135}
8136
8137PyDoc_STRVAR(upper__doc__,
8138"S.upper() -> str\n\
8139\n\
8140Return a copy of S converted to uppercase.");
8141
8142static PyObject*
8143unicode_upper(PyUnicodeObject *self)
8144{
8145    return fixup(self, fixupper);
8146}
8147
8148PyDoc_STRVAR(zfill__doc__,
8149"S.zfill(width) -> str\n\
8150\n\
8151Pad a numeric string x with zeros on the left, to fill a field\n\
8152of the specified width. The string x is never truncated.");
8153
8154static PyObject *
8155unicode_zfill(PyUnicodeObject *self, PyObject *args)
8156{
8157    Py_ssize_t fill;
8158    PyUnicodeObject *u;
8159
8160    Py_ssize_t width;
8161    if (!PyArg_ParseTuple(args, "n:zfill", &width))
8162        return NULL;
8163
8164    if (self->length >= width) {
8165        if (PyUnicode_CheckExact(self)) {
8166            Py_INCREF(self);
8167            return (PyObject*) self;
8168        }
8169        else
8170            return PyUnicode_FromUnicode(
8171                PyUnicode_AS_UNICODE(self),
8172                PyUnicode_GET_SIZE(self)
8173            );
8174    }
8175
8176    fill = width - self->length;
8177
8178    u = pad(self, fill, 0, '0');
8179
8180    if (u == NULL)
8181        return NULL;
8182
8183    if (u->str[fill] == '+' || u->str[fill] == '-') {
8184        /* move sign to beginning of string */
8185        u->str[0] = u->str[fill];
8186        u->str[fill] = '0';
8187    }
8188
8189    return (PyObject*) u;
8190}
8191
#if 0
/* Compiled out: returns the value of `numfree` as an int object. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyLong_FromLong(count);
}
#endif
8199
8200PyDoc_STRVAR(startswith__doc__,
8201"S.startswith(prefix[, start[, end]]) -> bool\n\
8202\n\
8203Return True if S starts with the specified prefix, False otherwise.\n\
8204With optional start, test S beginning at that position.\n\
8205With optional end, stop comparing S at that position.\n\
8206prefix can also be a tuple of strings to try.");
8207
8208static PyObject *
8209unicode_startswith(PyUnicodeObject *self,
8210		   PyObject *args)
8211{
8212    PyObject *subobj;
8213    PyUnicodeObject *substring;
8214    Py_ssize_t start = 0;
8215    Py_ssize_t end = PY_SSIZE_T_MAX;
8216    int result;
8217
8218    if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
8219		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8220	return NULL;
8221    if (PyTuple_Check(subobj)) {
8222        Py_ssize_t i;
8223        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8224            substring = (PyUnicodeObject *)PyUnicode_FromObject(
8225                            PyTuple_GET_ITEM(subobj, i));
8226            if (substring == NULL)
8227                return NULL;
8228            result = tailmatch(self, substring, start, end, -1);
8229            Py_DECREF(substring);
8230            if (result) {
8231                Py_RETURN_TRUE;
8232            }
8233        }
8234        /* nothing matched */
8235        Py_RETURN_FALSE;
8236    }
8237    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
8238    if (substring == NULL)
8239         return NULL;
8240    result = tailmatch(self, substring, start, end, -1);
8241    Py_DECREF(substring);
8242    return PyBool_FromLong(result);
8243}
8244
8245
8246PyDoc_STRVAR(endswith__doc__,
8247"S.endswith(suffix[, start[, end]]) -> bool\n\
8248\n\
8249Return True if S ends with the specified suffix, False otherwise.\n\
8250With optional start, test S beginning at that position.\n\
8251With optional end, stop comparing S at that position.\n\
8252suffix can also be a tuple of strings to try.");
8253
8254static PyObject *
8255unicode_endswith(PyUnicodeObject *self,
8256		 PyObject *args)
8257{
8258    PyObject *subobj;
8259    PyUnicodeObject *substring;
8260    Py_ssize_t start = 0;
8261    Py_ssize_t end = PY_SSIZE_T_MAX;
8262    int result;
8263
8264    if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
8265        _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8266	return NULL;
8267    if (PyTuple_Check(subobj)) {
8268        Py_ssize_t i;
8269        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8270            substring = (PyUnicodeObject *)PyUnicode_FromObject(
8271                            PyTuple_GET_ITEM(subobj, i));
8272            if (substring == NULL)
8273            return NULL;
8274            result = tailmatch(self, substring, start, end, +1);
8275            Py_DECREF(substring);
8276            if (result) {
8277                Py_RETURN_TRUE;
8278            }
8279        }
8280        Py_RETURN_FALSE;
8281    }
8282    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
8283    if (substring == NULL)
8284    return NULL;
8285
8286    result = tailmatch(self, substring, start, end, +1);
8287    Py_DECREF(substring);
8288    return PyBool_FromLong(result);
8289}
8290
8291#include "stringlib/string_format.h"
8292
8293PyDoc_STRVAR(format__doc__,
8294"S.format(*args, **kwargs) -> str\n\
8295\n\
8296");
8297
8298static PyObject *
8299unicode__format__(PyObject* self, PyObject* args)
8300{
8301    PyObject *format_spec;
8302
8303    if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8304        return NULL;
8305
8306    return _PyUnicode_FormatAdvanced(self,
8307                                     PyUnicode_AS_UNICODE(format_spec),
8308                                     PyUnicode_GET_SIZE(format_spec));
8309}
8310
8311PyDoc_STRVAR(p_format__doc__,
8312"S.__format__(format_spec) -> str\n\
8313\n\
8314");
8315
8316static PyObject *
8317unicode__sizeof__(PyUnicodeObject *v)
8318{
8319    return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8320                              sizeof(Py_UNICODE) * (v->length + 1));
8321}
8322
8323PyDoc_STRVAR(sizeof__doc__,
8324"S.__sizeof__() -> size of S in memory, in bytes");
8325
8326static PyObject *
8327unicode_getnewargs(PyUnicodeObject *v)
8328{
8329	return Py_BuildValue("(u#)", v->str, v->length);
8330}
8331
8332
8333static PyMethodDef unicode_methods[] = {
8334
8335    /* Order is according to common usage: often used methods should
8336       appear first, since lookup is done sequentially. */
8337
8338    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
8339    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8340    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
8341    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
8342    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8343    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8344    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8345    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8346    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8347    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8348    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
8349    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
8350    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8351    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8352    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
8353    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
8354    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8355    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8356    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
8357    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
8358    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
8359    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
8360    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
8361    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8362    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8363    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8364    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8365    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8366    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8367    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8368    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8369    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8370    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8371    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8372    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8373    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8374    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
8375    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
8376    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
8377    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
8378    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
8379    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
8380    {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8381    {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
8382    {"maketrans", (PyCFunction) unicode_maketrans,
8383     METH_VARARGS | METH_STATIC, maketrans__doc__},
8384    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
8385#if 0
8386    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
8387#endif
8388
8389#if 0
8390    /* This one is just used for debugging the implementation. */
8391    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
8392#endif
8393
8394    {"__getnewargs__",	(PyCFunction)unicode_getnewargs, METH_NOARGS},
8395    {NULL, NULL}
8396};
8397
8398static PyObject *
8399unicode_mod(PyObject *v, PyObject *w)
8400{
8401       if (!PyUnicode_Check(v)) {
8402               Py_INCREF(Py_NotImplemented);
8403               return Py_NotImplemented;
8404       }
8405       return PyUnicode_Format(v, w);
8406}
8407
8408static PyNumberMethods unicode_as_number = {
8409	0,				/*nb_add*/
8410	0,				/*nb_subtract*/
8411	0,				/*nb_multiply*/
8412	unicode_mod,			/*nb_remainder*/
8413};
8414
8415static PySequenceMethods unicode_as_sequence = {
8416    (lenfunc) unicode_length, 		/* sq_length */
8417    PyUnicode_Concat,		 	/* sq_concat */
8418    (ssizeargfunc) unicode_repeat, 	/* sq_repeat */
8419    (ssizeargfunc) unicode_getitem, 	/* sq_item */
8420    0,				 	/* sq_slice */
8421    0, 					/* sq_ass_item */
8422    0, 					/* sq_ass_slice */
8423    PyUnicode_Contains, 		/* sq_contains */
8424};
8425
8426static PyObject*
8427unicode_subscript(PyUnicodeObject* self, PyObject* item)
8428{
8429    if (PyIndex_Check(item)) {
8430        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
8431        if (i == -1 && PyErr_Occurred())
8432            return NULL;
8433        if (i < 0)
8434            i += PyUnicode_GET_SIZE(self);
8435        return unicode_getitem(self, i);
8436    } else if (PySlice_Check(item)) {
8437        Py_ssize_t start, stop, step, slicelength, cur, i;
8438        Py_UNICODE* source_buf;
8439        Py_UNICODE* result_buf;
8440        PyObject* result;
8441
8442        if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
8443				 &start, &stop, &step, &slicelength) < 0) {
8444            return NULL;
8445        }
8446
8447        if (slicelength <= 0) {
8448            return PyUnicode_FromUnicode(NULL, 0);
8449        } else if (start == 0 && step == 1 && slicelength == self->length &&
8450                   PyUnicode_CheckExact(self)) {
8451            Py_INCREF(self);
8452            return (PyObject *)self;
8453        } else if (step == 1) {
8454            return PyUnicode_FromUnicode(self->str + start, slicelength);
8455        } else {
8456            source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
8457            result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8458                                                       sizeof(Py_UNICODE));
8459
8460	    if (result_buf == NULL)
8461		    return PyErr_NoMemory();
8462
8463            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8464                result_buf[i] = source_buf[cur];
8465            }
8466
8467            result = PyUnicode_FromUnicode(result_buf, slicelength);
8468            PyObject_FREE(result_buf);
8469            return result;
8470        }
8471    } else {
8472        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8473        return NULL;
8474    }
8475}
8476
8477static PyMappingMethods unicode_as_mapping = {
8478    (lenfunc)unicode_length,		/* mp_length */
8479    (binaryfunc)unicode_subscript,	/* mp_subscript */
8480    (objobjargproc)0,			/* mp_ass_subscript */
8481};
8482
8483
8484/* Helpers for PyUnicode_Format() */
8485
8486static PyObject *
8487getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
8488{
8489    Py_ssize_t argidx = *p_argidx;
8490    if (argidx < arglen) {
8491	(*p_argidx)++;
8492	if (arglen < 0)
8493	    return args;
8494	else
8495	    return PyTuple_GetItem(args, argidx);
8496    }
8497    PyErr_SetString(PyExc_TypeError,
8498		    "not enough arguments for format string");
8499    return NULL;
8500}
8501
8502static Py_ssize_t
8503strtounicode(Py_UNICODE *buffer, const char *charbuffer)
8504{
8505    register Py_ssize_t i;
8506    Py_ssize_t len = strlen(charbuffer);
8507    for (i = len - 1; i >= 0; i--)
8508	buffer[i] = (Py_UNICODE) charbuffer[i];
8509
8510    return len;
8511}
8512
8513static int
8514doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8515{
8516    Py_ssize_t result;
8517
8518    PyOS_ascii_formatd((char *)buffer, len, format, x);
8519    result = strtounicode(buffer, (char *)buffer);
8520    return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8521}
8522
#if 0
/* Currently compiled out.  Integer counterpart of doubletounicode():
   formats x as narrow chars with PyOS_snprintf() in buffer's storage,
   widens the text in place and returns the character count. */
static int
longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
{
    char *narrow = (char *)buffer;
    Py_ssize_t nchars;

    PyOS_snprintf(narrow, len, format, x);
    nchars = strtounicode(buffer, narrow);
    return Py_SAFE_DOWNCAST(nchars, Py_ssize_t, int);
}
#endif
8534
8535/* XXX To save some code duplication, formatfloat/long/int could have been
8536   shared with stringobject.c, converting from 8-bit to Unicode after the
8537   formatting is done. */
8538
8539static int
8540formatfloat(Py_UNICODE *buf,
8541	    size_t buflen,
8542	    int flags,
8543	    int prec,
8544	    int type,
8545	    PyObject *v)
8546{
8547    /* fmt = '%#.' + `prec` + `type`
8548       worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
8549    char fmt[20];
8550    double x;
8551
8552    x = PyFloat_AsDouble(v);
8553    if (x == -1.0 && PyErr_Occurred())
8554	return -1;
8555    if (prec < 0)
8556	prec = 6;
8557    if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8558	type = 'g';
8559    /* Worst case length calc to ensure no buffer overrun:
8560
8561       'g' formats:
8562	 fmt = %#.<prec>g
8563	 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8564	    for any double rep.)
8565	 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8566
8567       'f' formats:
8568	 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8569	 len = 1 + 50 + 1 + prec = 52 + prec
8570
8571       If prec=0 the effective precision is 1 (the leading digit is
8572       always given), therefore increase the length by one.
8573
8574    */
8575    if (((type == 'g' || type == 'G') &&
8576          buflen <= (size_t)10 + (size_t)prec) ||
8577	(type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
8578	PyErr_SetString(PyExc_OverflowError,
8579			"formatted float is too long (precision too large?)");
8580	return -1;
8581    }
8582    PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8583		  (flags&F_ALT) ? "#" : "",
8584		  prec, type);
8585    return doubletounicode(buf, buflen, fmt, x);
8586}
8587
8588static PyObject*
8589formatlong(PyObject *val, int flags, int prec, int type)
8590{
8591	char *buf;
8592	int len;
8593	PyObject *str; /* temporary string object. */
8594	PyObject *result;
8595
8596	str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
8597	if (!str)
8598		return NULL;
8599	result = PyUnicode_FromStringAndSize(buf, len);
8600	Py_DECREF(str);
8601	return result;
8602}
8603
#if 0
/* Currently compiled out.  Formats a C-long-sized integer for the
   d/u/o/x/X conversions into buf and returns the character count, or -1
   with an exception set. */
static int
formatint(Py_UNICODE *buf,
          size_t buflen,
          int flags,
          int prec,
          int type,
          PyObject *v)
{
    /* fmt = '%#.' + `prec` + 'l' + `type`
     * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
     *                     + 1 + 1
     *                   = 24
     */
    char fmt[64]; /* plenty big enough! */
    char *sign = "";
    int is_based;
    long x;

    x = PyLong_AsLong(v);
    if (x == -1 && PyErr_Occurred())
        return -1;

    /* A negative value under %u is shown as a signed decimal. */
    if (x < 0 && type == 'u')
        type = 'd';

    /* For hex/octal the minus sign is emitted by hand and the magnitude
       is formatted instead. */
    is_based = (type == 'x' || type == 'X' || type == 'o');
    if (x < 0 && is_based)
        sign = "-";

    if (prec < 0)
        prec = 1;

    /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
     * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
     */
    if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
        PyErr_SetString(PyExc_OverflowError,
                        "formatted integer is too long (precision too large?)");
        return -1;
    }

    if ((flags & F_ALT) && is_based) {
        /* %#o, %#x and %#X are troublesome when left to the C library:
         * - %#o needs a base marker different from C's
         * - for the value 0 the C standard omits '0x'/'0X', which is
         *   inconsistent with the other %#x/%#X conversions and with
         *   Python's hex() function
         * - some platforms violate the standard and do emit '0x'/'0X'
         *   for 0 (Metrowerks, Compaq Tru64)
         * - some platforms emit '0x' under %#X but handle 0 per the
         *   standard (OS/2 EMX)
         *
         * Consistency is obtained by writing our own '0x'/'0X' prefix
         * and using plain %x/%X instead of %#x/%#X.
         *
         * This mirrors the approach of formatint() in stringobject.c
         */
        PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
                      sign, type, prec, type);
    }
    else {
        PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
                      sign, (flags&F_ALT) ? "#" : "",
                      prec, type);
    }

    return longtounicode(buf, buflen, fmt, sign[0] ? -x : x);
}
#endif
8681
8682static int
8683formatchar(Py_UNICODE *buf,
8684           size_t buflen,
8685           PyObject *v)
8686{
8687    /* presume that the buffer is at least 3 characters long */
8688    if (PyUnicode_Check(v)) {
8689	if (PyUnicode_GET_SIZE(v) == 1) {
8690	    buf[0] = PyUnicode_AS_UNICODE(v)[0];
8691	    buf[1] = '\0';
8692	    return 1;
8693	}
8694#ifndef Py_UNICODE_WIDE
8695	if (PyUnicode_GET_SIZE(v) == 2) {
8696	    /* Decode a valid surrogate pair */
8697	    int c0 = PyUnicode_AS_UNICODE(v)[0];
8698	    int c1 = PyUnicode_AS_UNICODE(v)[1];
8699	    if (0xD800 <= c0 && c0 <= 0xDBFF &&
8700		0xDC00 <= c1 && c1 <= 0xDFFF) {
8701		buf[0] = c0;
8702		buf[1] = c1;
8703		buf[2] = '\0';
8704		return 2;
8705	    }
8706	}
8707#endif
8708	goto onError;
8709    }
8710    else {
8711	/* Integer input truncated to a character */
8712        long x;
8713	x = PyLong_AsLong(v);
8714	if (x == -1 && PyErr_Occurred())
8715	    goto onError;
8716
8717	if (x < 0 || x > 0x10ffff) {
8718	    PyErr_SetString(PyExc_OverflowError,
8719			    "%c arg not in range(0x110000)");
8720	    return -1;
8721	}
8722
8723#ifndef Py_UNICODE_WIDE
8724	if (x > 0xffff) {
8725	    x -= 0x10000;
8726	    buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
8727	    buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
8728	    return 2;
8729	}
8730#endif
8731	buf[0] = (Py_UNICODE) x;
8732	buf[1] = '\0';
8733	return 1;
8734    }
8735
8736 onError:
8737    PyErr_SetString(PyExc_TypeError,
8738		    "%c requires int or char");
8739    return -1;
8740}
8741
8742/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8743
8744   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
8745   chars are formatted. XXX This is a magic number. Each formatting
8746   routine does bounds checking to ensure no overflow, but a better
8747   solution may be to malloc a buffer of appropriate size for each
8748   format. For now, the current solution is sufficient.
8749*/
8750#define FORMATBUFLEN (size_t)120
8751
8752PyObject *PyUnicode_Format(PyObject *format,
8753			   PyObject *args)
8754{
8755    Py_UNICODE *fmt, *res;
8756    Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
8757    int args_owned = 0;
8758    PyUnicodeObject *result = NULL;
8759    PyObject *dict = NULL;
8760    PyObject *uformat;
8761
8762    if (format == NULL || args == NULL) {
8763	PyErr_BadInternalCall();
8764	return NULL;
8765    }
8766    uformat = PyUnicode_FromObject(format);
8767    if (uformat == NULL)
8768	return NULL;
8769    fmt = PyUnicode_AS_UNICODE(uformat);
8770    fmtcnt = PyUnicode_GET_SIZE(uformat);
8771
8772    reslen = rescnt = fmtcnt + 100;
8773    result = _PyUnicode_New(reslen);
8774    if (result == NULL)
8775	goto onError;
8776    res = PyUnicode_AS_UNICODE(result);
8777
8778    if (PyTuple_Check(args)) {
8779	arglen = PyTuple_Size(args);
8780	argidx = 0;
8781    }
8782    else {
8783	arglen = -1;
8784	argidx = -2;
8785    }
8786    if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
8787        !PyUnicode_Check(args))
8788	dict = args;
8789
8790    while (--fmtcnt >= 0) {
8791	if (*fmt != '%') {
8792	    if (--rescnt < 0) {
8793		rescnt = fmtcnt + 100;
8794		reslen += rescnt;
8795		if (_PyUnicode_Resize(&result, reslen) < 0)
8796		    goto onError;
8797		res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8798		--rescnt;
8799	    }
8800	    *res++ = *fmt++;
8801	}
8802	else {
8803	    /* Got a format specifier */
8804	    int flags = 0;
8805	    Py_ssize_t width = -1;
8806	    int prec = -1;
8807	    Py_UNICODE c = '\0';
8808	    Py_UNICODE fill;
8809	    int isnumok;
8810	    PyObject *v = NULL;
8811	    PyObject *temp = NULL;
8812	    Py_UNICODE *pbuf;
8813	    Py_UNICODE sign;
8814	    Py_ssize_t len;
8815	    Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
8816
8817	    fmt++;
8818	    if (*fmt == '(') {
8819		Py_UNICODE *keystart;
8820		Py_ssize_t keylen;
8821		PyObject *key;
8822		int pcount = 1;
8823
8824		if (dict == NULL) {
8825		    PyErr_SetString(PyExc_TypeError,
8826				    "format requires a mapping");
8827		    goto onError;
8828		}
8829		++fmt;
8830		--fmtcnt;
8831		keystart = fmt;
8832		/* Skip over balanced parentheses */
8833		while (pcount > 0 && --fmtcnt >= 0) {
8834		    if (*fmt == ')')
8835			--pcount;
8836		    else if (*fmt == '(')
8837			++pcount;
8838		    fmt++;
8839		}
8840		keylen = fmt - keystart - 1;
8841		if (fmtcnt < 0 || pcount > 0) {
8842		    PyErr_SetString(PyExc_ValueError,
8843				    "incomplete format key");
8844		    goto onError;
8845		}
8846#if 0
8847		/* keys are converted to strings using UTF-8 and
8848		   then looked up since Python uses strings to hold
8849		   variables names etc. in its namespaces and we
8850		   wouldn't want to break common idioms. */
8851		key = PyUnicode_EncodeUTF8(keystart,
8852					   keylen,
8853					   NULL);
8854#else
8855		key = PyUnicode_FromUnicode(keystart, keylen);
8856#endif
8857		if (key == NULL)
8858		    goto onError;
8859		if (args_owned) {
8860		    Py_DECREF(args);
8861		    args_owned = 0;
8862		}
8863		args = PyObject_GetItem(dict, key);
8864		Py_DECREF(key);
8865		if (args == NULL) {
8866		    goto onError;
8867		}
8868		args_owned = 1;
8869		arglen = -1;
8870		argidx = -2;
8871	    }
8872	    while (--fmtcnt >= 0) {
8873		switch (c = *fmt++) {
8874		case '-': flags |= F_LJUST; continue;
8875		case '+': flags |= F_SIGN; continue;
8876		case ' ': flags |= F_BLANK; continue;
8877		case '#': flags |= F_ALT; continue;
8878		case '0': flags |= F_ZERO; continue;
8879		}
8880		break;
8881	    }
8882	    if (c == '*') {
8883		v = getnextarg(args, arglen, &argidx);
8884		if (v == NULL)
8885		    goto onError;
8886		if (!PyLong_Check(v)) {
8887		    PyErr_SetString(PyExc_TypeError,
8888				    "* wants int");
8889		    goto onError;
8890		}
8891		width = PyLong_AsLong(v);
8892		if (width == -1 && PyErr_Occurred())
8893			goto onError;
8894		if (width < 0) {
8895		    flags |= F_LJUST;
8896		    width = -width;
8897		}
8898		if (--fmtcnt >= 0)
8899		    c = *fmt++;
8900	    }
8901	    else if (c >= '0' && c <= '9') {
8902		width = c - '0';
8903		while (--fmtcnt >= 0) {
8904		    c = *fmt++;
8905		    if (c < '0' || c > '9')
8906			break;
8907		    if ((width*10) / 10 != width) {
8908			PyErr_SetString(PyExc_ValueError,
8909					"width too big");
8910			goto onError;
8911		    }
8912		    width = width*10 + (c - '0');
8913		}
8914	    }
8915	    if (c == '.') {
8916		prec = 0;
8917		if (--fmtcnt >= 0)
8918		    c = *fmt++;
8919		if (c == '*') {
8920		    v = getnextarg(args, arglen, &argidx);
8921		    if (v == NULL)
8922			goto onError;
8923		    if (!PyLong_Check(v)) {
8924			PyErr_SetString(PyExc_TypeError,
8925					"* wants int");
8926			goto onError;
8927		    }
8928		    prec = PyLong_AsLong(v);
8929		    if (prec == -1 && PyErr_Occurred())
8930			goto onError;
8931		    if (prec < 0)
8932			prec = 0;
8933		    if (--fmtcnt >= 0)
8934			c = *fmt++;
8935		}
8936		else if (c >= '0' && c <= '9') {
8937		    prec = c - '0';
8938		    while (--fmtcnt >= 0) {
8939			c = Py_CHARMASK(*fmt++);
8940			if (c < '0' || c > '9')
8941			    break;
8942			if ((prec*10) / 10 != prec) {
8943			    PyErr_SetString(PyExc_ValueError,
8944					    "prec too big");
8945			    goto onError;
8946			}
8947			prec = prec*10 + (c - '0');
8948		    }
8949		}
8950	    } /* prec */
8951	    if (fmtcnt >= 0) {
8952		if (c == 'h' || c == 'l' || c == 'L') {
8953		    if (--fmtcnt >= 0)
8954			c = *fmt++;
8955		}
8956	    }
8957	    if (fmtcnt < 0) {
8958		PyErr_SetString(PyExc_ValueError,
8959				"incomplete format");
8960		goto onError;
8961	    }
8962	    if (c != '%') {
8963		v = getnextarg(args, arglen, &argidx);
8964		if (v == NULL)
8965		    goto onError;
8966	    }
8967	    sign = 0;
8968	    fill = ' ';
8969	    switch (c) {
8970
8971	    case '%':
8972		pbuf = formatbuf;
8973		/* presume that buffer length is at least 1 */
8974		pbuf[0] = '%';
8975		len = 1;
8976		break;
8977
8978	    case 's':
8979	    case 'r':
8980	    case 'a':
8981		if (PyUnicode_Check(v) && c == 's') {
8982		    temp = v;
8983		    Py_INCREF(temp);
8984		}
8985		else {
8986		    if (c == 's')
8987			temp = PyObject_Str(v);
8988		    else if (c == 'r')
8989			temp = PyObject_Repr(v);
8990		    else
8991			temp = PyObject_ASCII(v);
8992		    if (temp == NULL)
8993			goto onError;
8994                    if (PyUnicode_Check(temp))
8995                        /* nothing to do */;
8996		    else {
8997			Py_DECREF(temp);
8998			PyErr_SetString(PyExc_TypeError,
8999					"%s argument has non-string str()");
9000			goto onError;
9001		    }
9002		}
9003		pbuf = PyUnicode_AS_UNICODE(temp);
9004		len = PyUnicode_GET_SIZE(temp);
9005		if (prec >= 0 && len > prec)
9006		    len = prec;
9007		break;
9008
9009	    case 'i':
9010	    case 'd':
9011	    case 'u':
9012	    case 'o':
9013	    case 'x':
9014	    case 'X':
9015		if (c == 'i')
9016		    c = 'd';
9017		isnumok = 0;
9018		if (PyNumber_Check(v)) {
9019			PyObject *iobj=NULL;
9020
9021			if (PyLong_Check(v)) {
9022				iobj = v;
9023				Py_INCREF(iobj);
9024			}
9025			else {
9026				iobj = PyNumber_Long(v);
9027			}
9028			if (iobj!=NULL) {
9029				if (PyLong_Check(iobj)) {
9030					isnumok = 1;
9031					temp = formatlong(iobj, flags, prec, c);
9032					Py_DECREF(iobj);
9033					if (!temp)
9034					    goto onError;
9035					pbuf = PyUnicode_AS_UNICODE(temp);
9036					len = PyUnicode_GET_SIZE(temp);
9037					sign = 1;
9038				}
9039				else {
9040					Py_DECREF(iobj);
9041				}
9042			}
9043		}
9044		if (!isnumok) {
9045			PyErr_Format(PyExc_TypeError,
9046			    "%%%c format: a number is required, "
9047                                     "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9048			goto onError;
9049		}
9050		if (flags & F_ZERO)
9051		    fill = '0';
9052		break;
9053
9054	    case 'e':
9055	    case 'E':
9056	    case 'f':
9057	    case 'F':
9058	    case 'g':
9059	    case 'G':
9060		if (c == 'F')
9061			c = 'f';
9062		pbuf = formatbuf;
9063		len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
9064			flags, prec, c, v);
9065		if (len < 0)
9066		    goto onError;
9067		sign = 1;
9068		if (flags & F_ZERO)
9069		    fill = '0';
9070		break;
9071
9072	    case 'c':
9073		pbuf = formatbuf;
9074		len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9075		if (len < 0)
9076		    goto onError;
9077		break;
9078
9079	    default:
9080		PyErr_Format(PyExc_ValueError,
9081			     "unsupported format character '%c' (0x%x) "
9082			     "at index %zd",
9083			     (31<=c && c<=126) ? (char)c : '?',
9084                             (int)c,
9085			     (Py_ssize_t)(fmt - 1 -
9086					  PyUnicode_AS_UNICODE(uformat)));
9087		goto onError;
9088	    }
9089	    if (sign) {
9090		if (*pbuf == '-' || *pbuf == '+') {
9091		    sign = *pbuf++;
9092		    len--;
9093		}
9094		else if (flags & F_SIGN)
9095		    sign = '+';
9096		else if (flags & F_BLANK)
9097		    sign = ' ';
9098		else
9099		    sign = 0;
9100	    }
9101	    if (width < len)
9102		width = len;
9103	    if (rescnt - (sign != 0) < width) {
9104		reslen -= rescnt;
9105		rescnt = width + fmtcnt + 100;
9106		reslen += rescnt;
9107		if (reslen < 0) {
9108		    Py_XDECREF(temp);
9109		    PyErr_NoMemory();
9110		    goto onError;
9111		}
9112		if (_PyUnicode_Resize(&result, reslen) < 0) {
9113		    Py_XDECREF(temp);
9114		    goto onError;
9115		}
9116		res = PyUnicode_AS_UNICODE(result)
9117		    + reslen - rescnt;
9118	    }
9119	    if (sign) {
9120		if (fill != ' ')
9121		    *res++ = sign;
9122		rescnt--;
9123		if (width > len)
9124		    width--;
9125	    }
9126	    if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9127		assert(pbuf[0] == '0');
9128		assert(pbuf[1] == c);
9129		if (fill != ' ') {
9130		    *res++ = *pbuf++;
9131		    *res++ = *pbuf++;
9132		}
9133		rescnt -= 2;
9134		width -= 2;
9135		if (width < 0)
9136		    width = 0;
9137		len -= 2;
9138	    }
9139	    if (width > len && !(flags & F_LJUST)) {
9140		do {
9141		    --rescnt;
9142		    *res++ = fill;
9143		} while (--width > len);
9144	    }
9145	    if (fill == ' ') {
9146		if (sign)
9147		    *res++ = sign;
9148		if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9149		    assert(pbuf[0] == '0');
9150		    assert(pbuf[1] == c);
9151		    *res++ = *pbuf++;
9152		    *res++ = *pbuf++;
9153		}
9154	    }
9155	    Py_UNICODE_COPY(res, pbuf, len);
9156	    res += len;
9157	    rescnt -= len;
9158	    while (--width >= len) {
9159		--rescnt;
9160		*res++ = ' ';
9161	    }
9162	    if (dict && (argidx < arglen) && c != '%') {
9163		PyErr_SetString(PyExc_TypeError,
9164				"not all arguments converted during string formatting");
9165                Py_XDECREF(temp);
9166		goto onError;
9167	    }
9168	    Py_XDECREF(temp);
9169	} /* '%' */
9170    } /* until end */
9171    if (argidx < arglen && !dict) {
9172	PyErr_SetString(PyExc_TypeError,
9173			"not all arguments converted during string formatting");
9174	goto onError;
9175    }
9176
9177    if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
9178	goto onError;
9179    if (args_owned) {
9180	Py_DECREF(args);
9181    }
9182    Py_DECREF(uformat);
9183    return (PyObject *)result;
9184
9185 onError:
9186    Py_XDECREF(result);
9187    Py_DECREF(uformat);
9188    if (args_owned) {
9189	Py_DECREF(args);
9190    }
9191    return NULL;
9192}
9193
9194static PyObject *
9195unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9196
9197static PyObject *
9198unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9199{
9200        PyObject *x = NULL;
9201	static char *kwlist[] = {"object", "encoding", "errors", 0};
9202	char *encoding = NULL;
9203	char *errors = NULL;
9204
9205	if (type != &PyUnicode_Type)
9206		return unicode_subtype_new(type, args, kwds);
9207	if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
9208					  kwlist, &x, &encoding, &errors))
9209	    return NULL;
9210	if (x == NULL)
9211		return (PyObject *)_PyUnicode_New(0);
9212	if (encoding == NULL && errors == NULL)
9213	    return PyObject_Str(x);
9214	else
9215	return PyUnicode_FromEncodedObject(x, encoding, errors);
9216}
9217
9218static PyObject *
9219unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9220{
9221	PyUnicodeObject *tmp, *pnew;
9222	Py_ssize_t n;
9223
9224	assert(PyType_IsSubtype(type, &PyUnicode_Type));
9225	tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9226	if (tmp == NULL)
9227		return NULL;
9228	assert(PyUnicode_Check(tmp));
9229	pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9230	if (pnew == NULL) {
9231		Py_DECREF(tmp);
9232		return NULL;
9233	}
9234	pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9235	if (pnew->str == NULL) {
9236		_Py_ForgetReference((PyObject *)pnew);
9237		PyObject_Del(pnew);
9238		Py_DECREF(tmp);
9239		return PyErr_NoMemory();
9240	}
9241	Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9242	pnew->length = n;
9243	pnew->hash = tmp->hash;
9244	Py_DECREF(tmp);
9245	return (PyObject *)pnew;
9246}
9247
9248PyDoc_STRVAR(unicode_doc,
9249"str(string[, encoding[, errors]]) -> str\n\
9250\n\
9251Create a new string object from the given encoded string.\n\
9252encoding defaults to the current default string encoding.\n\
9253errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
9254
9255static PyObject *unicode_iter(PyObject *seq);
9256
9257PyTypeObject PyUnicode_Type = {
9258    PyVarObject_HEAD_INIT(&PyType_Type, 0)
9259    "str", 				/* tp_name */
9260    sizeof(PyUnicodeObject), 		/* tp_size */
9261    0, 					/* tp_itemsize */
9262    /* Slots */
9263    (destructor)unicode_dealloc, 	/* tp_dealloc */
9264    0, 					/* tp_print */
9265    0,				 	/* tp_getattr */
9266    0, 					/* tp_setattr */
9267    0, 					/* tp_compare */
9268    unicode_repr, 			/* tp_repr */
9269    &unicode_as_number, 		/* tp_as_number */
9270    &unicode_as_sequence, 		/* tp_as_sequence */
9271    &unicode_as_mapping, 		/* tp_as_mapping */
9272    (hashfunc) unicode_hash, 		/* tp_hash*/
9273    0, 					/* tp_call*/
9274    (reprfunc) unicode_str,	 	/* tp_str */
9275    PyObject_GenericGetAttr, 		/* tp_getattro */
9276    0,			 		/* tp_setattro */
9277    0, 					/* tp_as_buffer */
9278    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
9279        Py_TPFLAGS_UNICODE_SUBCLASS,	/* tp_flags */
9280    unicode_doc,			/* tp_doc */
9281    0,					/* tp_traverse */
9282    0,					/* tp_clear */
9283    PyUnicode_RichCompare,		/* tp_richcompare */
9284    0,					/* tp_weaklistoffset */
9285    unicode_iter,			/* tp_iter */
9286    0,					/* tp_iternext */
9287    unicode_methods,			/* tp_methods */
9288    0,					/* tp_members */
9289    0,					/* tp_getset */
9290    &PyBaseObject_Type,			/* tp_base */
9291    0,					/* tp_dict */
9292    0,					/* tp_descr_get */
9293    0,					/* tp_descr_set */
9294    0,					/* tp_dictoffset */
9295    0,					/* tp_init */
9296    0,					/* tp_alloc */
9297    unicode_new,			/* tp_new */
9298    PyObject_Del,      		/* tp_free */
9299};
9300
9301/* Initialize the Unicode implementation */
9302
9303void _PyUnicode_Init(void)
9304{
9305    int i;
9306
9307    /* XXX - move this array to unicodectype.c ? */
9308    Py_UNICODE linebreak[] = {
9309        0x000A, /* LINE FEED */
9310        0x000D, /* CARRIAGE RETURN */
9311        0x001C, /* FILE SEPARATOR */
9312        0x001D, /* GROUP SEPARATOR */
9313        0x001E, /* RECORD SEPARATOR */
9314        0x0085, /* NEXT LINE */
9315        0x2028, /* LINE SEPARATOR */
9316        0x2029, /* PARAGRAPH SEPARATOR */
9317    };
9318
9319    /* Init the implementation */
9320    free_list = NULL;
9321    numfree = 0;
9322    unicode_empty = _PyUnicode_New(0);
9323    if (!unicode_empty)
9324	return;
9325
9326    for (i = 0; i < 256; i++)
9327	unicode_latin1[i] = NULL;
9328    if (PyType_Ready(&PyUnicode_Type) < 0)
9329	Py_FatalError("Can't initialize 'unicode'");
9330
9331    /* initialize the linebreak bloom filter */
9332    bloom_linebreak = make_bloom_mask(
9333        linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9334        );
9335
9336    PyType_Ready(&EncodingMapType);
9337}
9338
9339/* Finalize the Unicode implementation */
9340
9341int
9342PyUnicode_ClearFreeList(void)
9343{
9344    int freelist_size = numfree;
9345    PyUnicodeObject *u;
9346
9347    for (u = free_list; u != NULL;) {
9348	PyUnicodeObject *v = u;
9349	u = *(PyUnicodeObject **)u;
9350	if (v->str)
9351	    PyObject_DEL(v->str);
9352	Py_XDECREF(v->defenc);
9353	PyObject_Del(v);
9354	numfree--;
9355    }
9356    free_list = NULL;
9357    assert(numfree == 0);
9358    return freelist_size;
9359}
9360
9361void
9362_PyUnicode_Fini(void)
9363{
9364    int i;
9365
9366    Py_XDECREF(unicode_empty);
9367    unicode_empty = NULL;
9368
9369    for (i = 0; i < 256; i++) {
9370	if (unicode_latin1[i]) {
9371	    Py_DECREF(unicode_latin1[i]);
9372	    unicode_latin1[i] = NULL;
9373	}
9374    }
9375    (void)PyUnicode_ClearFreeList();
9376}
9377
9378void
9379PyUnicode_InternInPlace(PyObject **p)
9380{
9381	register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9382	PyObject *t;
9383	if (s == NULL || !PyUnicode_Check(s))
9384		Py_FatalError(
9385		    "PyUnicode_InternInPlace: unicode strings only please!");
9386	/* If it's a subclass, we don't really know what putting
9387	   it in the interned dict might do. */
9388	if (!PyUnicode_CheckExact(s))
9389		return;
9390	if (PyUnicode_CHECK_INTERNED(s))
9391		return;
9392	if (interned == NULL) {
9393		interned = PyDict_New();
9394		if (interned == NULL) {
9395			PyErr_Clear(); /* Don't leave an exception */
9396			return;
9397		}
9398	}
9399	/* It might be that the GetItem call fails even
9400	   though the key is present in the dictionary,
9401	   namely when this happens during a stack overflow. */
9402	Py_ALLOW_RECURSION
9403	t = PyDict_GetItem(interned, (PyObject *)s);
9404	Py_END_ALLOW_RECURSION
9405
9406	if (t) {
9407		Py_INCREF(t);
9408		Py_DECREF(*p);
9409		*p = t;
9410		return;
9411	}
9412
9413	PyThreadState_GET()->recursion_critical = 1;
9414	if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9415		PyErr_Clear();
9416		PyThreadState_GET()->recursion_critical = 0;
9417		return;
9418	}
9419	PyThreadState_GET()->recursion_critical = 0;
9420	/* The two references in interned are not counted by refcnt.
9421	   The deallocator will take care of this */
9422	Py_REFCNT(s) -= 2;
9423	PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
9424}
9425
9426void
9427PyUnicode_InternImmortal(PyObject **p)
9428{
9429	PyUnicode_InternInPlace(p);
9430	if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9431		PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9432		Py_INCREF(*p);
9433	}
9434}
9435
9436PyObject *
9437PyUnicode_InternFromString(const char *cp)
9438{
9439	PyObject *s = PyUnicode_FromString(cp);
9440	if (s == NULL)
9441		return NULL;
9442	PyUnicode_InternInPlace(&s);
9443	return s;
9444}
9445
9446void _Py_ReleaseInternedUnicodeStrings(void)
9447{
9448	PyObject *keys;
9449	PyUnicodeObject *s;
9450	Py_ssize_t i, n;
9451	Py_ssize_t immortal_size = 0, mortal_size = 0;
9452
9453	if (interned == NULL || !PyDict_Check(interned))
9454		return;
9455	keys = PyDict_Keys(interned);
9456	if (keys == NULL || !PyList_Check(keys)) {
9457		PyErr_Clear();
9458		return;
9459	}
9460
9461	/* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9462	   detector, interned unicode strings are not forcibly deallocated;
9463	   rather, we give them their stolen references back, and then clear
9464	   and DECREF the interned dict. */
9465
9466	n = PyList_GET_SIZE(keys);
9467	fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9468		n);
9469	for (i = 0; i < n; i++) {
9470		s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9471		switch (s->state) {
9472		case SSTATE_NOT_INTERNED:
9473			/* XXX Shouldn't happen */
9474			break;
9475		case SSTATE_INTERNED_IMMORTAL:
9476			Py_REFCNT(s) += 1;
9477			immortal_size += s->length;
9478			break;
9479		case SSTATE_INTERNED_MORTAL:
9480			Py_REFCNT(s) += 2;
9481			mortal_size += s->length;
9482			break;
9483		default:
9484			Py_FatalError("Inconsistent interned string state.");
9485		}
9486		s->state = SSTATE_NOT_INTERNED;
9487	}
9488	fprintf(stderr, "total size of all interned strings: "
9489			"%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9490			"mortal/immortal\n", mortal_size, immortal_size);
9491	Py_DECREF(keys);
9492	PyDict_Clear(interned);
9493	Py_DECREF(interned);
9494	interned = NULL;
9495}
9496
9497
9498/********************* Unicode Iterator **************************/
9499
/* Instance layout of the iterator returned by unicode_iter(): a strong
   reference to the string being iterated plus the position of the next
   character to hand out. */
9500typedef struct {
9501	PyObject_HEAD
9502	Py_ssize_t it_index;	/* index into it_seq of the next item to return */
9503	PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
9504} unicodeiterobject;
9505
9506static void
9507unicodeiter_dealloc(unicodeiterobject *it)
9508{
9509	_PyObject_GC_UNTRACK(it);
9510	Py_XDECREF(it->it_seq);
9511	PyObject_GC_Del(it);
9512}
9513
9514static int
9515unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9516{
9517	Py_VISIT(it->it_seq);
9518	return 0;
9519}
9520
9521static PyObject *
9522unicodeiter_next(unicodeiterobject *it)
9523{
9524	PyUnicodeObject *seq;
9525	PyObject *item;
9526
9527	assert(it != NULL);
9528	seq = it->it_seq;
9529	if (seq == NULL)
9530		return NULL;
9531	assert(PyUnicode_Check(seq));
9532
9533	if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9534		item = PyUnicode_FromUnicode(
9535                    PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
9536		if (item != NULL)
9537			++it->it_index;
9538		return item;
9539	}
9540
9541	Py_DECREF(seq);
9542	it->it_seq = NULL;
9543	return NULL;
9544}
9545
9546static PyObject *
9547unicodeiter_len(unicodeiterobject *it)
9548{
9549	Py_ssize_t len = 0;
9550	if (it->it_seq)
9551		len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9552	return PyLong_FromSsize_t(len);
9553}
9554
/* Docstring shared with the method table entry below. */
9555PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9556
/* Method table of the string iterator type; it exposes only
   __length_hint__, implemented by unicodeiter_len(). */
9557static PyMethodDef unicodeiter_methods[] = {
9558	{"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
9559         length_hint_doc},
9560 	{NULL,		NULL}		/* sentinel */
9561};
9562
/* Type object of the iterator created by unicode_iter().  Instances are
   GC-tracked (Py_TPFLAGS_HAVE_GC with unicodeiter_traverse), are their own
   iterator (PyObject_SelfIter) and produce items via unicodeiter_next().
   Slots after the last one listed are left zero-initialized. */
9563PyTypeObject PyUnicodeIter_Type = {
9564	PyVarObject_HEAD_INIT(&PyType_Type, 0)
9565	"str_iterator",			/* tp_name */
9566	sizeof(unicodeiterobject),		/* tp_basicsize */
9567	0,					/* tp_itemsize */
9568	/* methods */
9569	(destructor)unicodeiter_dealloc,	/* tp_dealloc */
9570	0,					/* tp_print */
9571	0,					/* tp_getattr */
9572	0,					/* tp_setattr */
9573	0,					/* tp_compare */
9574	0,					/* tp_repr */
9575	0,					/* tp_as_number */
9576	0,					/* tp_as_sequence */
9577	0,					/* tp_as_mapping */
9578	0,					/* tp_hash */
9579	0,					/* tp_call */
9580	0,					/* tp_str */
9581	PyObject_GenericGetAttr,		/* tp_getattro */
9582	0,					/* tp_setattro */
9583	0,					/* tp_as_buffer */
9584	Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9585	0,					/* tp_doc */
9586	(traverseproc)unicodeiter_traverse,	/* tp_traverse */
9587	0,					/* tp_clear */
9588	0,					/* tp_richcompare */
9589	0,					/* tp_weaklistoffset */
9590	PyObject_SelfIter,			/* tp_iter */
9591	(iternextfunc)unicodeiter_next,		/* tp_iternext */
9592	unicodeiter_methods,			/* tp_methods */
9593	0,					/* tp_members */
9594};
9595
9596static PyObject *
9597unicode_iter(PyObject *seq)
9598{
9599	unicodeiterobject *it;
9600
9601	if (!PyUnicode_Check(seq)) {
9602		PyErr_BadInternalCall();
9603		return NULL;
9604	}
9605	it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9606	if (it == NULL)
9607		return NULL;
9608	it->it_index = 0;
9609	Py_INCREF(seq);
9610	it->it_seq = (PyUnicodeObject *)seq;
9611	_PyObject_GC_TRACK(it);
9612	return (PyObject *)it;
9613}
9614
9615size_t
9616Py_UNICODE_strlen(const Py_UNICODE *u)
9617{
9618    int res = 0;
9619    while(*u++)
9620        res++;
9621    return res;
9622}
9623
9624Py_UNICODE*
9625Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9626{
9627    Py_UNICODE *u = s1;
9628    while ((*u++ = *s2++));
9629    return s1;
9630}
9631
9632Py_UNICODE*
9633Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9634{
9635    Py_UNICODE *u = s1;
9636    while ((*u++ = *s2++))
9637        if (n-- == 0)
9638            break;
9639    return s1;
9640}
9641
9642int
9643Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9644{
9645    while (*s1 && *s2 && *s1 == *s2)
9646        s1++, s2++;
9647    if (*s1 && *s2)
9648        return (*s1 < *s2) ? -1 : +1;
9649    if (*s1)
9650        return 1;
9651    if (*s2)
9652        return -1;
9653    return 0;
9654}
9655
9656Py_UNICODE*
9657Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9658{
9659    const Py_UNICODE *p;
9660    for (p = s; *p; p++)
9661        if (*p == c)
9662            return (Py_UNICODE*)p;
9663    return NULL;
9664}
9665
9666
9667#ifdef __cplusplus
9668}
9669#endif
9670
9671
9672/*
9673Local variables:
9674c-basic-offset: 4
9675indent-tabs-mode: nil
9676End:
9677*/
9678