unicodeobject.c revision 63a28be01693584afcadc39ca650efc5fa8f2880
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5Unicode Integration Proposal (see file Misc/unicode.txt).
6
7Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
10Copyright (c) Corporation for National Research Initiatives.
11
12--------------------------------------------------------------------
13The original string type implementation is:
14
15    Copyright (c) 1999 by Secret Labs AB
16    Copyright (c) 1999 by Fredrik Lundh
17
18By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
41
42#define PY_SSIZE_T_CLEAN
43#include "Python.h"
44
45#include "unicodeobject.h"
46#include "ucnhash.h"
47
48#ifdef MS_WINDOWS
49#include <windows.h>
50#endif
51
/* Maximum number of Unicode objects kept on the free list */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Threshold for the free list "stay alive" optimization.

   Objects placed on the free list keep their character buffer
   allocated as long as their size is below this limit, which saves
   malloc() calls for small Unicode objects.

   The worst case cost is MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   A limit of 0 effectively disables the feature.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order switches; little endian unless WORDS_BIGENDIAN is defined */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
82
83/* --- Globals ------------------------------------------------------------
84
85   The globals are initialized by the _PyUnicode_Init() API and should
86   not be used before calling that API.
87
88*/
89
90
91#ifdef __cplusplus
92extern "C" {
93#endif
94
95/* This dictionary holds all interned unicode strings.  Note that references
96   to strings in this dictionary are *not* counted in the string's ob_refcnt.
97   When the interned string reaches a refcnt of 0 the string deallocation
98   function will delete the reference from this dictionary.
99
100   Another way to look at this is that to say that the actual reference
101   count of a string is:  s->ob_refcnt + (s->ob_sstate?2:0)
102*/
103static PyObject *interned;
104
105/* Free list for Unicode objects */
106static PyUnicodeObject *unicode_freelist;
107static int unicode_freelist_size;
108
109/* The empty Unicode object is shared to improve performance. */
110static PyUnicodeObject *unicode_empty;
111
112/* Single character Unicode strings in the Latin-1 range are being
113   shared as well. */
114static PyUnicodeObject *unicode_latin1[256];
115
116/* Default encoding to use and assume when NULL is passed as encoding
117   parameter; it is fixed to "utf-8".  Always use the
118   PyUnicode_GetDefaultEncoding() API to access this global. */
119static const char unicode_default_encoding[] = "utf-8";
120
121Py_UNICODE
122PyUnicode_GetMax(void)
123{
124#ifdef Py_UNICODE_WIDE
125	return 0x10FFFF;
126#else
127	/* This is actually an illegal character, so it should
128	   not be passed to unichr. */
129	return 0xFFFF;
130#endif
131}
132
/* --- Bloom Filters ----------------------------------------------------- */

/* Support for simple "bloom filters" over Unicode characters.
   To keep things simple, a single bitmask is used; the least
   significant 5 bits of each Unicode character select the bit index. */

/* the linebreak mask is set up by the Unicode initialization code below */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by ch is set in mask.
   The shift is performed on an unsigned long so that bit 31 can be
   reached without overflowing a signed int, and both arguments are
   parenthesized so that arbitrary expressions may be passed. */
#define BLOOM(mask, ch) (((mask) & (1UL << ((ch) & 0x1F))))

/* Cheap pre-filter on the bloom mask, followed by the exact test */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
149
150Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
151{
152    /* calculate simple bloom-style bitmask for a given unicode string */
153
154    long mask;
155    Py_ssize_t i;
156
157    mask = 0;
158    for (i = 0; i < len; i++)
159        mask |= (1 << (ptr[i] & 0x1F));
160
161    return mask;
162}
163
164Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
165{
166    Py_ssize_t i;
167
168    for (i = 0; i < setlen; i++)
169        if (set[i] == chr)
170            return 1;
171
172    return 0;
173}
174
/* Membership test for chr in set: cheap bloom pre-filter first, exact
   linear search second.  The expansion is wrapped in parentheses so it
   behaves as a single operand next to !, || and similar operators. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
177
178/* --- Unicode Object ----------------------------------------------------- */
179
180static
181int unicode_resize(register PyUnicodeObject *unicode,
182                      Py_ssize_t length)
183{
184    void *oldstr;
185
186    /* Shortcut if there's nothing much to do. */
187    if (unicode->length == length)
188	goto reset;
189
190    /* Resizing shared object (unicode_empty or single character
191       objects) in-place is not allowed. Use PyUnicode_Resize()
192       instead ! */
193
194    if (unicode == unicode_empty ||
195	(unicode->length == 1 &&
196	 unicode->str[0] < 256U &&
197	 unicode_latin1[unicode->str[0]] == unicode)) {
198        PyErr_SetString(PyExc_SystemError,
199                        "can't resize shared unicode objects");
200        return -1;
201    }
202
203    /* We allocate one more byte to make sure the string is Ux0000 terminated.
204       The overallocation is also used by fastsearch, which assumes that it's
205       safe to look at str[length] (without making any assumptions about what
206       it contains). */
207
208    oldstr = unicode->str;
209    PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
210    if (!unicode->str) {
211	unicode->str = (Py_UNICODE *)oldstr;
212        PyErr_NoMemory();
213        return -1;
214    }
215    unicode->str[length] = 0;
216    unicode->length = length;
217
218 reset:
219    /* Reset the object caches */
220    if (unicode->defenc) {
221        Py_DECREF(unicode->defenc);
222        unicode->defenc = NULL;
223    }
224    unicode->hash = -1;
225
226    return 0;
227}
228
229/* We allocate one more byte to make sure the string is
230   Ux0000 terminated -- XXX is this needed ?
231
232   XXX This allocator could further be enhanced by assuring that the
233       free list never reduces its size below 1.
234
235*/
236
237static
238PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
239{
240    register PyUnicodeObject *unicode;
241
242    /* Optimization for empty strings */
243    if (length == 0 && unicode_empty != NULL) {
244        Py_INCREF(unicode_empty);
245        return unicode_empty;
246    }
247
248    /* Unicode freelist & memory allocation */
249    if (unicode_freelist) {
250        unicode = unicode_freelist;
251        unicode_freelist = *(PyUnicodeObject **)unicode;
252        unicode_freelist_size--;
253	if (unicode->str) {
254	    /* Keep-Alive optimization: we only upsize the buffer,
255	       never downsize it. */
256	    if ((unicode->length < length) &&
257                unicode_resize(unicode, length) < 0) {
258		PyMem_DEL(unicode->str);
259		goto onError;
260	    }
261	}
262        else {
263	    unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
264        }
265        PyObject_INIT(unicode, &PyUnicode_Type);
266    }
267    else {
268        unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
269        if (unicode == NULL)
270            return NULL;
271	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
272    }
273
274    if (!unicode->str) {
275	PyErr_NoMemory();
276	goto onError;
277    }
278    /* Initialize the first element to guard against cases where
279     * the caller fails before initializing str -- unicode_resize()
280     * reads str[0], and the Keep-Alive optimization can keep memory
281     * allocated for str alive across a call to unicode_dealloc(unicode).
282     * We don't want unicode_resize to read uninitialized memory in
283     * that case.
284     */
285    unicode->str[0] = 0;
286    unicode->str[length] = 0;
287    unicode->length = length;
288    unicode->hash = -1;
289    unicode->state = 0;
290    unicode->defenc = NULL;
291    return unicode;
292
293 onError:
294    _Py_ForgetReference((PyObject *)unicode);
295    PyObject_Del(unicode);
296    return NULL;
297}
298
299static
300void unicode_dealloc(register PyUnicodeObject *unicode)
301{
302    switch (PyUnicode_CHECK_INTERNED(unicode)) {
303        case SSTATE_NOT_INTERNED:
304            break;
305
306        case SSTATE_INTERNED_MORTAL:
307            /* revive dead object temporarily for DelItem */
308            unicode->ob_refcnt = 3;
309            if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
310                Py_FatalError(
311                    "deletion of interned unicode string failed");
312            break;
313
314        case SSTATE_INTERNED_IMMORTAL:
315            Py_FatalError("Immortal interned unicode string died.");
316
317        default:
318            Py_FatalError("Inconsistent interned unicode string state.");
319    }
320
321    if (PyUnicode_CheckExact(unicode) &&
322	unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
323        /* Keep-Alive optimization */
324	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
325	    PyMem_DEL(unicode->str);
326	    unicode->str = NULL;
327	    unicode->length = 0;
328	}
329	if (unicode->defenc) {
330	    Py_DECREF(unicode->defenc);
331	    unicode->defenc = NULL;
332	}
333	/* Add to free list */
334        *(PyUnicodeObject **)unicode = unicode_freelist;
335        unicode_freelist = unicode;
336        unicode_freelist_size++;
337    }
338    else {
339	PyMem_DEL(unicode->str);
340	Py_XDECREF(unicode->defenc);
341	unicode->ob_type->tp_free((PyObject *)unicode);
342    }
343}
344
345int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
346{
347    register PyUnicodeObject *v;
348
349    /* Argument checks */
350    if (unicode == NULL) {
351	PyErr_BadInternalCall();
352	return -1;
353    }
354    v = (PyUnicodeObject *)*unicode;
355    if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
356	PyErr_BadInternalCall();
357	return -1;
358    }
359
360    /* Resizing unicode_empty and single character objects is not
361       possible since these are being shared. We simply return a fresh
362       copy with the same Unicode content. */
363    if (v->length != length &&
364	(v == unicode_empty || v->length == 1)) {
365	PyUnicodeObject *w = _PyUnicode_New(length);
366	if (w == NULL)
367	    return -1;
368	Py_UNICODE_COPY(w->str, v->str,
369			length < v->length ? length : v->length);
370	Py_DECREF(*unicode);
371	*unicode = (PyObject *)w;
372	return 0;
373    }
374
375    /* Note that we don't have to modify *unicode for unshared Unicode
376       objects, since we can modify them in-place. */
377    return unicode_resize(v, length);
378}
379
/* Internal API for use in unicodeobject.c only !
   Casts the address of a PyUnicodeObject* variable to PyObject** and
   forwards to PyUnicode_Resize().  Both arguments are parenthesized so
   that arbitrary expressions can be passed as the length. */
#define _PyUnicode_Resize(unicodevar, length) \
        PyUnicode_Resize(((PyObject **)(unicodevar)), (length))
383
384PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
385				Py_ssize_t size)
386{
387    PyUnicodeObject *unicode;
388
389    /* If the Unicode data is known at construction time, we can apply
390       some optimizations which share commonly used objects. */
391    if (u != NULL) {
392
393	/* Optimization for empty strings */
394	if (size == 0 && unicode_empty != NULL) {
395	    Py_INCREF(unicode_empty);
396	    return (PyObject *)unicode_empty;
397	}
398
399	/* Single character Unicode objects in the Latin-1 range are
400	   shared when using this constructor */
401	if (size == 1 && *u < 256) {
402	    unicode = unicode_latin1[*u];
403	    if (!unicode) {
404		unicode = _PyUnicode_New(1);
405		if (!unicode)
406		    return NULL;
407		unicode->str[0] = *u;
408		unicode_latin1[*u] = unicode;
409	    }
410	    Py_INCREF(unicode);
411	    return (PyObject *)unicode;
412	}
413    }
414
415    unicode = _PyUnicode_New(size);
416    if (!unicode)
417        return NULL;
418
419    /* Copy the Unicode data into the new object */
420    if (u != NULL)
421	Py_UNICODE_COPY(unicode->str, u, size);
422
423    return (PyObject *)unicode;
424}
425
426PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
427{
428    PyUnicodeObject *unicode;
429    /* If the Unicode data is known at construction time, we can apply
430       some optimizations which share commonly used objects. */
431    if (u != NULL) {
432
433	/* Optimization for empty strings */
434	if (size == 0 && unicode_empty != NULL) {
435	    Py_INCREF(unicode_empty);
436	    return (PyObject *)unicode_empty;
437	}
438
439	/* Single characters are shared when using this constructor */
440	if (size == 1) {
441	    unicode = unicode_latin1[(int)*u];
442	    if (!unicode) {
443		unicode = _PyUnicode_New(1);
444		if (!unicode)
445		    return NULL;
446		unicode->str[0] = *u;
447		unicode_latin1[(int)*u] = unicode;
448	    }
449	    Py_INCREF(unicode);
450	    return (PyObject *)unicode;
451	}
452    }
453
454    unicode = _PyUnicode_New(size);
455    if (!unicode)
456        return NULL;
457
458    /* Copy the Unicode data into the new object */
459    if (u != NULL) {
460        Py_UNICODE *p = unicode->str;
461        while (size--)
462            *p++ = *u++;
463        /* Don't need to write trailing 0 because
464           that's already done by _PyUnicode_New */
465    }
466
467    return (PyObject *)unicode;
468}
469
470PyObject *PyUnicode_FromString(const char *u)
471{
472    size_t size = strlen(u);
473    if (size > PY_SSIZE_T_MAX) {
474        PyErr_SetString(PyExc_OverflowError, "input too long");
475        return NULL;
476    }
477
478    return PyUnicode_FromStringAndSize(u, size);
479}
480
481#ifdef HAVE_WCHAR_H
482
483PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
484				 Py_ssize_t size)
485{
486    PyUnicodeObject *unicode;
487
488    if (w == NULL) {
489	PyErr_BadInternalCall();
490	return NULL;
491    }
492
493    unicode = _PyUnicode_New(size);
494    if (!unicode)
495        return NULL;
496
497    /* Copy the wchar_t data into the new object */
498#ifdef HAVE_USABLE_WCHAR_T
499    memcpy(unicode->str, w, size * sizeof(wchar_t));
500#else
501    {
502	register Py_UNICODE *u;
503	register Py_ssize_t i;
504	u = PyUnicode_AS_UNICODE(unicode);
505	for (i = size; i > 0; i--)
506	    *u++ = *w++;
507    }
508#endif
509
510    return (PyObject *)unicode;
511}
512
513static void
514makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
515{
516	*fmt++ = '%';
517	if (width) {
518		if (zeropad)
519			*fmt++ = '0';
520		fmt += sprintf(fmt, "%d", width);
521	}
522	if (precision)
523		fmt += sprintf(fmt, ".%d", precision);
524	if (longflag)
525		*fmt++ = 'l';
526	else if (size_tflag) {
527		char *f = PY_FORMAT_SIZE_T;
528		while (*f)
529			*fmt++ = *f++;
530	}
531	*fmt++ = c;
532	*fmt = '\0';
533}
534
535#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
536
537PyObject *
538PyUnicode_FromFormatV(const char *format, va_list vargs)
539{
540	va_list count;
541	Py_ssize_t callcount = 0;
542	PyObject **callresults = NULL;
543	PyObject **callresult = NULL;
544	Py_ssize_t n = 0;
545	int width = 0;
546	int precision = 0;
547	int zeropad;
548	const char* f;
549	Py_UNICODE *s;
550	PyObject *string;
551	/* used by sprintf */
552	char buffer[21];
553	/* use abuffer instead of buffer, if we need more space
554	 * (which can happen if there's a format specifier with width). */
555	char *abuffer = NULL;
556	char *realbuffer;
557	Py_ssize_t abuffersize = 0;
558	char fmt[60]; /* should be enough for %0width.precisionld */
559	const char *copy;
560
561#ifdef VA_LIST_IS_ARRAY
562	Py_MEMCPY(count, vargs, sizeof(va_list));
563#else
564#ifdef  __va_copy
565	__va_copy(count, vargs);
566#else
567	count = vargs;
568#endif
569#endif
570	/* step 1: count the number of %S/%R format specifications
571	 * (we call PyObject_Unicode()/PyObject_Repr() for these objects
572	 * once during step 3 and put the result in an array) */
573	for (f = format; *f; f++) {
574		if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
575			++callcount;
576	}
577	/* step 2: allocate memory for the results of
578	 * PyObject_Unicode()/PyObject_Repr() calls */
579	if (callcount) {
580		callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
581		if (!callresults) {
582			PyErr_NoMemory();
583			return NULL;
584		}
585		callresult = callresults;
586	}
587	/* step 3: figure out how large a buffer we need */
588	for (f = format; *f; f++) {
589		if (*f == '%') {
590			const char* p = f;
591			width = 0;
592			while (isdigit(Py_CHARMASK(*f)))
593				width = (width*10) + *f++ - '0';
594			while (*++f && *f != '%' && !isalpha(Py_CHARMASK(*f)))
595				;
596
597			/* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
598			 * they don't affect the amount of space we reserve.
599			 */
600			if ((*f == 'l' || *f == 'z') &&
601					(f[1] == 'd' || f[1] == 'u'))
602				++f;
603
604			switch (*f) {
605			case 'c':
606				(void)va_arg(count, int);
607				/* fall through... */
608			case '%':
609				n++;
610				break;
611			case 'd': case 'u': case 'i': case 'x':
612				(void) va_arg(count, int);
613				/* 20 bytes is enough to hold a 64-bit
614				   integer.  Decimal takes the most space.
615				   This isn't enough for octal.
616				   If a width is specified we need more
617				   (which we allocate later). */
618				if (width < 20)
619					width = 20;
620				n += width;
621				if (abuffersize < width)
622					abuffersize = width;
623				break;
624			case 's':
625				n += strlen(va_arg(count, char*));
626				break;
627			case 'U':
628			{
629				PyObject *obj = va_arg(count, PyObject *);
630				assert(obj && PyUnicode_Check(obj));
631				n += PyUnicode_GET_SIZE(obj);
632				break;
633			}
634			case 'V':
635			{
636				PyObject *obj = va_arg(count, PyObject *);
637				const char *str = va_arg(count, const char *);
638				assert(obj || str);
639				assert(!obj || PyUnicode_Check(obj));
640				if (obj)
641					n += PyUnicode_GET_SIZE(obj);
642				else
643					n += strlen(str);
644				break;
645			}
646			case 'S':
647			{
648				PyObject *obj = va_arg(count, PyObject *);
649				PyObject *str;
650				assert(obj);
651				str = PyObject_Unicode(obj);
652				if (!str)
653					goto fail;
654				n += PyUnicode_GET_SIZE(str);
655				/* Remember the str and switch to the next slot */
656				*callresult++ = str;
657				break;
658			}
659			case 'R':
660			{
661				PyObject *obj = va_arg(count, PyObject *);
662				PyObject *repr;
663				assert(obj);
664				repr = PyObject_Repr(obj);
665				if (!repr)
666					goto fail;
667				n += PyUnicode_GET_SIZE(repr);
668				/* Remember the repr and switch to the next slot */
669				*callresult++ = repr;
670				break;
671			}
672			case 'p':
673				(void) va_arg(count, int);
674				/* maximum 64-bit pointer representation:
675				 * 0xffffffffffffffff
676				 * so 19 characters is enough.
677				 * XXX I count 18 -- what's the extra for?
678				 */
679				n += 19;
680				break;
681			default:
682				/* if we stumble upon an unknown
683				   formatting code, copy the rest of
684				   the format string to the output
685				   string. (we cannot just skip the
686				   code, since there's no way to know
687				   what's in the argument list) */
688				n += strlen(p);
689				goto expand;
690			}
691		} else
692			n++;
693	}
694 expand:
695	if (abuffersize > 20) {
696		abuffer = PyMem_Malloc(abuffersize);
697		if (!abuffer) {
698			PyErr_NoMemory();
699			goto fail;
700		}
701		realbuffer = abuffer;
702	}
703	else
704		realbuffer = buffer;
705	/* step 4: fill the buffer */
706	/* Since we've analyzed how much space we need for the worst case,
707	   we don't have to resize the string.
708	   There can be no errors beyond this point. */
709	string = PyUnicode_FromUnicode(NULL, n);
710	if (!string)
711		goto fail;
712
713	s = PyUnicode_AS_UNICODE(string);
714	callresult = callresults;
715
716	for (f = format; *f; f++) {
717		if (*f == '%') {
718			const char* p = f++;
719			int longflag = 0;
720			int size_tflag = 0;
721			zeropad = (*f == '0');
722			/* parse the width.precision part */
723			width = 0;
724			while (isdigit(Py_CHARMASK(*f)))
725				width = (width*10) + *f++ - '0';
726			precision = 0;
727			if (*f == '.') {
728				f++;
729				while (isdigit(Py_CHARMASK(*f)))
730					precision = (precision*10) + *f++ - '0';
731			}
732			/* handle the long flag, but only for %ld and %lu.
733			   others can be added when necessary. */
734			if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
735				longflag = 1;
736				++f;
737			}
738			/* handle the size_t flag. */
739			if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
740				size_tflag = 1;
741				++f;
742			}
743
744			switch (*f) {
745			case 'c':
746				*s++ = va_arg(vargs, int);
747				break;
748			case 'd':
749				makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
750				if (longflag)
751					sprintf(realbuffer, fmt, va_arg(vargs, long));
752				else if (size_tflag)
753					sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
754				else
755					sprintf(realbuffer, fmt, va_arg(vargs, int));
756				appendstring(realbuffer);
757				break;
758			case 'u':
759				makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
760				if (longflag)
761					sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
762				else if (size_tflag)
763					sprintf(realbuffer, fmt, va_arg(vargs, size_t));
764				else
765					sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
766				appendstring(realbuffer);
767				break;
768			case 'i':
769				makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
770				sprintf(realbuffer, fmt, va_arg(vargs, int));
771				appendstring(realbuffer);
772				break;
773			case 'x':
774				makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
775				sprintf(realbuffer, fmt, va_arg(vargs, int));
776				appendstring(realbuffer);
777				break;
778			case 's':
779				p = va_arg(vargs, char*);
780				appendstring(p);
781				break;
782			case 'U':
783			{
784				PyObject *obj = va_arg(vargs, PyObject *);
785				Py_ssize_t size = PyUnicode_GET_SIZE(obj);
786				Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
787				s += size;
788				break;
789			}
790			case 'V':
791			{
792				PyObject *obj = va_arg(vargs, PyObject *);
793				const char *str = va_arg(vargs, const char *);
794				if (obj) {
795					Py_ssize_t size = PyUnicode_GET_SIZE(obj);
796					Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
797					s += size;
798				} else {
799					appendstring(str);
800				}
801				break;
802			}
803			case 'S':
804			case 'R':
805			{
806				Py_UNICODE *ucopy;
807				Py_ssize_t usize;
808				Py_ssize_t upos;
809				/* unused, since we already have the result */
810				(void) va_arg(vargs, PyObject *);
811				ucopy = PyUnicode_AS_UNICODE(*callresult);
812				usize = PyUnicode_GET_SIZE(*callresult);
813				for (upos = 0; upos<usize;)
814					*s++ = ucopy[upos++];
815				/* We're done with the unicode()/repr() => forget it */
816				Py_DECREF(*callresult);
817				/* switch to next unicode()/repr() result */
818				++callresult;
819				break;
820			}
821			case 'p':
822				sprintf(buffer, "%p", va_arg(vargs, void*));
823				/* %p is ill-defined:  ensure leading 0x. */
824				if (buffer[1] == 'X')
825					buffer[1] = 'x';
826				else if (buffer[1] != 'x') {
827					memmove(buffer+2, buffer, strlen(buffer)+1);
828					buffer[0] = '0';
829					buffer[1] = 'x';
830				}
831				appendstring(buffer);
832				break;
833			case '%':
834				*s++ = '%';
835				break;
836			default:
837				appendstring(p);
838				goto end;
839			}
840		} else
841			*s++ = *f;
842	}
843
844 end:
845	if (callresults)
846		PyMem_Free(callresults);
847	if (abuffer)
848		PyMem_Free(abuffer);
849	_PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
850	return string;
851 fail:
852	if (callresults) {
853		PyObject **callresult2 = callresults;
854		while (callresult2 <= callresult) {
855			Py_DECREF(*callresult2);
856			++callresult2;
857		}
858		PyMem_Free(callresults);
859	}
860	if (abuffer)
861		PyMem_Free(abuffer);
862	return NULL;
863}
864
865#undef appendstring
866
867PyObject *
868PyUnicode_FromFormat(const char *format, ...)
869{
870	PyObject* ret;
871	va_list vargs;
872
873#ifdef HAVE_STDARG_PROTOTYPES
874	va_start(vargs, format);
875#else
876	va_start(vargs);
877#endif
878	ret = PyUnicode_FromFormatV(format, vargs);
879	va_end(vargs);
880	return ret;
881}
882
883Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
884				wchar_t *w,
885				Py_ssize_t size)
886{
887    if (unicode == NULL) {
888	PyErr_BadInternalCall();
889	return -1;
890    }
891
892    /* If possible, try to copy the 0-termination as well */
893    if (size > PyUnicode_GET_SIZE(unicode))
894	size = PyUnicode_GET_SIZE(unicode) + 1;
895
896#ifdef HAVE_USABLE_WCHAR_T
897    memcpy(w, unicode->str, size * sizeof(wchar_t));
898#else
899    {
900	register Py_UNICODE *u;
901	register Py_ssize_t i;
902	u = PyUnicode_AS_UNICODE(unicode);
903	for (i = size; i > 0; i--)
904	    *w++ = *u++;
905    }
906#endif
907
908    if (size > PyUnicode_GET_SIZE(unicode))
909        return PyUnicode_GET_SIZE(unicode);
910    else
911    return size;
912}
913
914#endif
915
916PyObject *PyUnicode_FromOrdinal(int ordinal)
917{
918    Py_UNICODE s[1];
919
920#ifdef Py_UNICODE_WIDE
921    if (ordinal < 0 || ordinal > 0x10ffff) {
922	PyErr_SetString(PyExc_ValueError,
923			"chr() arg not in range(0x110000) "
924			"(wide Python build)");
925	return NULL;
926    }
927#else
928    if (ordinal < 0 || ordinal > 0xffff) {
929	PyErr_SetString(PyExc_ValueError,
930			"chr() arg not in range(0x10000) "
931			"(narrow Python build)");
932	return NULL;
933    }
934#endif
935
936    s[0] = (Py_UNICODE)ordinal;
937    return PyUnicode_FromUnicode(s, 1);
938}
939
940PyObject *PyUnicode_FromObject(register PyObject *obj)
941{
942    /* XXX Perhaps we should make this API an alias of
943           PyObject_Unicode() instead ?! */
944    if (PyUnicode_CheckExact(obj)) {
945	Py_INCREF(obj);
946	return obj;
947    }
948    if (PyUnicode_Check(obj)) {
949	/* For a Unicode subtype that's not a Unicode object,
950	   return a true Unicode object with the same data. */
951	return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
952				     PyUnicode_GET_SIZE(obj));
953    }
954    return PyUnicode_FromEncodedObject(obj, NULL, "strict");
955}
956
957PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
958				      const char *encoding,
959				      const char *errors)
960{
961    const char *s = NULL;
962    Py_ssize_t len;
963    PyObject *v;
964
965    if (obj == NULL) {
966	PyErr_BadInternalCall();
967	return NULL;
968    }
969
970#if 0
971    /* For b/w compatibility we also accept Unicode objects provided
972       that no encodings is given and then redirect to
973       PyObject_Unicode() which then applies the additional logic for
974       Unicode subclasses.
975
976       NOTE: This API should really only be used for object which
977             represent *encoded* Unicode !
978
979    */
980	if (PyUnicode_Check(obj)) {
981	    if (encoding) {
982		PyErr_SetString(PyExc_TypeError,
983				"decoding Unicode is not supported");
984	    return NULL;
985	    }
986	return PyObject_Unicode(obj);
987	    }
988#else
989    if (PyUnicode_Check(obj)) {
990	PyErr_SetString(PyExc_TypeError,
991			"decoding Unicode is not supported");
992	return NULL;
993	}
994#endif
995
996    /* Coerce object */
997    if (PyString_Check(obj)) {
998	    s = PyString_AS_STRING(obj);
999	    len = PyString_GET_SIZE(obj);
1000	    }
1001    else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1002	/* Overwrite the error message with something more useful in
1003	   case of a TypeError. */
1004	if (PyErr_ExceptionMatches(PyExc_TypeError))
1005	PyErr_Format(PyExc_TypeError,
1006			 "coercing to Unicode: need string or buffer, "
1007			 "%.80s found",
1008		     obj->ob_type->tp_name);
1009	goto onError;
1010    }
1011
1012    /* Convert to Unicode */
1013    if (len == 0) {
1014	Py_INCREF(unicode_empty);
1015	v = (PyObject *)unicode_empty;
1016    }
1017    else
1018	v = PyUnicode_Decode(s, len, encoding, errors);
1019
1020    return v;
1021
1022 onError:
1023    return NULL;
1024}
1025
1026PyObject *PyUnicode_Decode(const char *s,
1027			   Py_ssize_t size,
1028			   const char *encoding,
1029			   const char *errors)
1030{
1031    PyObject *buffer = NULL, *unicode;
1032
1033    if (encoding == NULL)
1034	encoding = PyUnicode_GetDefaultEncoding();
1035
1036    /* Shortcuts for common default encodings */
1037    if (strcmp(encoding, "utf-8") == 0)
1038        return PyUnicode_DecodeUTF8(s, size, errors);
1039    else if (strcmp(encoding, "latin-1") == 0)
1040        return PyUnicode_DecodeLatin1(s, size, errors);
1041#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1042    else if (strcmp(encoding, "mbcs") == 0)
1043        return PyUnicode_DecodeMBCS(s, size, errors);
1044#endif
1045    else if (strcmp(encoding, "ascii") == 0)
1046        return PyUnicode_DecodeASCII(s, size, errors);
1047
1048    /* Decode via the codec registry */
1049    buffer = PyBuffer_FromMemory((void *)s, size);
1050    if (buffer == NULL)
1051        goto onError;
1052    unicode = PyCodec_Decode(buffer, encoding, errors);
1053    if (unicode == NULL)
1054        goto onError;
1055    if (!PyUnicode_Check(unicode)) {
1056        PyErr_Format(PyExc_TypeError,
1057                     "decoder did not return an unicode object (type=%.400s)",
1058                     unicode->ob_type->tp_name);
1059        Py_DECREF(unicode);
1060        goto onError;
1061    }
1062    Py_DECREF(buffer);
1063    return unicode;
1064
1065 onError:
1066    Py_XDECREF(buffer);
1067    return NULL;
1068}
1069
1070PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1071                                    const char *encoding,
1072                                    const char *errors)
1073{
1074    PyObject *v;
1075
1076    if (!PyUnicode_Check(unicode)) {
1077        PyErr_BadArgument();
1078        goto onError;
1079    }
1080
1081    if (encoding == NULL)
1082	encoding = PyUnicode_GetDefaultEncoding();
1083
1084    /* Decode via the codec registry */
1085    v = PyCodec_Decode(unicode, encoding, errors);
1086    if (v == NULL)
1087        goto onError;
1088    return v;
1089
1090 onError:
1091    return NULL;
1092}
1093
1094PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1095			   Py_ssize_t size,
1096			   const char *encoding,
1097			   const char *errors)
1098{
1099    PyObject *v, *unicode;
1100
1101    unicode = PyUnicode_FromUnicode(s, size);
1102    if (unicode == NULL)
1103	return NULL;
1104    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1105    Py_DECREF(unicode);
1106    return v;
1107}
1108
1109PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1110                                    const char *encoding,
1111                                    const char *errors)
1112{
1113    PyObject *v;
1114
1115    if (!PyUnicode_Check(unicode)) {
1116        PyErr_BadArgument();
1117        goto onError;
1118    }
1119
1120    if (encoding == NULL)
1121	encoding = PyUnicode_GetDefaultEncoding();
1122
1123    /* Encode via the codec registry */
1124    v = PyCodec_Encode(unicode, encoding, errors);
1125    if (v == NULL)
1126        goto onError;
1127    return v;
1128
1129 onError:
1130    return NULL;
1131}
1132
1133PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1134                                    const char *encoding,
1135                                    const char *errors)
1136{
1137    PyObject *v;
1138
1139    if (!PyUnicode_Check(unicode)) {
1140        PyErr_BadArgument();
1141        goto onError;
1142    }
1143
1144    if (encoding == NULL)
1145	encoding = PyUnicode_GetDefaultEncoding();
1146
1147    /* Shortcuts for common default encodings */
1148    if (errors == NULL) {
1149	if (strcmp(encoding, "utf-8") == 0)
1150	    return PyUnicode_AsUTF8String(unicode);
1151	else if (strcmp(encoding, "latin-1") == 0)
1152	    return PyUnicode_AsLatin1String(unicode);
1153#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1154	else if (strcmp(encoding, "mbcs") == 0)
1155	    return PyUnicode_AsMBCSString(unicode);
1156#endif
1157	else if (strcmp(encoding, "ascii") == 0)
1158	    return PyUnicode_AsASCIIString(unicode);
1159    }
1160
1161    /* Encode via the codec registry */
1162    v = PyCodec_Encode(unicode, encoding, errors);
1163    if (v == NULL)
1164        goto onError;
1165    if (!PyBytes_Check(v)) {
1166        if (PyString_Check(v)) {
1167            /* Old codec, turn it into bytes */
1168            PyObject *b = PyBytes_FromObject(v);
1169            Py_DECREF(v);
1170            return b;
1171        }
1172        PyErr_Format(PyExc_TypeError,
1173                     "encoder did not return a bytes object "
1174                     "(type=%.400s, encoding=%.20s, errors=%.20s)",
1175                     v->ob_type->tp_name,
1176                     encoding ? encoding : "NULL",
1177                     errors ? errors : "NULL");
1178        Py_DECREF(v);
1179        goto onError;
1180    }
1181    return v;
1182
1183 onError:
1184    return NULL;
1185}
1186
1187PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1188					    const char *errors)
1189{
1190    PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1191    PyObject *b;
1192    if (v)
1193        return v;
1194    if (errors != NULL)
1195        Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
1196    if (errors == NULL) {
1197        b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1198                                 PyUnicode_GET_SIZE(unicode),
1199                                 NULL);
1200    }
1201    else {
1202        b = PyUnicode_AsEncodedString(unicode, NULL, errors);
1203    }
1204    if (!b)
1205        return NULL;
1206    v = PyString_FromStringAndSize(PyBytes_AsString(b),
1207                                   PyBytes_Size(b));
1208    Py_DECREF(b);
1209    if (!errors) {
1210        Py_XINCREF(v);
1211        ((PyUnicodeObject *)unicode)->defenc = v;
1212    }
1213    return v;
1214}
1215
1216char*
1217PyUnicode_AsString(PyObject *unicode)
1218{
1219    assert(PyUnicode_Check(unicode));
1220    unicode = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1221    if (!unicode)
1222        return NULL;
1223    return PyString_AsString(unicode);
1224}
1225
1226Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1227{
1228    if (!PyUnicode_Check(unicode)) {
1229        PyErr_BadArgument();
1230        goto onError;
1231    }
1232    return PyUnicode_AS_UNICODE(unicode);
1233
1234 onError:
1235    return NULL;
1236}
1237
1238Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1239{
1240    if (!PyUnicode_Check(unicode)) {
1241        PyErr_BadArgument();
1242        goto onError;
1243    }
1244    return PyUnicode_GET_SIZE(unicode);
1245
1246 onError:
1247    return -1;
1248}
1249
1250const char *PyUnicode_GetDefaultEncoding(void)
1251{
1252    return unicode_default_encoding;
1253}
1254
1255int PyUnicode_SetDefaultEncoding(const char *encoding)
1256{
1257    if (strcmp(encoding, unicode_default_encoding) != 0) {
1258        PyErr_Format(PyExc_ValueError,
1259                     "Can only set default encoding to %s",
1260                     unicode_default_encoding);
1261        return -1;
1262    }
1263    return 0;
1264}
1265
1266/* error handling callback helper:
1267   build arguments, call the callback and check the arguments,
1268   if no exception occurred, copy the replacement to the output
1269   and adjust various state variables.
1270   return 0 on success, -1 on error
1271*/
1272
1273static
1274int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1275                 const char *encoding, const char *reason,
1276                 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1277                 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
1278{
1279    static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
1280
1281    PyObject *restuple = NULL;
1282    PyObject *repunicode = NULL;
1283    Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1284    Py_ssize_t requiredsize;
1285    Py_ssize_t newpos;
1286    Py_UNICODE *repptr;
1287    Py_ssize_t repsize;
1288    int res = -1;
1289
1290    if (*errorHandler == NULL) {
1291	*errorHandler = PyCodec_LookupError(errors);
1292	if (*errorHandler == NULL)
1293	   goto onError;
1294    }
1295
1296    if (*exceptionObject == NULL) {
1297    	*exceptionObject = PyUnicodeDecodeError_Create(
1298	    encoding, input, insize, *startinpos, *endinpos, reason);
1299	if (*exceptionObject == NULL)
1300	   goto onError;
1301    }
1302    else {
1303	if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1304	    goto onError;
1305	if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1306	    goto onError;
1307	if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1308	    goto onError;
1309    }
1310
1311    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1312    if (restuple == NULL)
1313	goto onError;
1314    if (!PyTuple_Check(restuple)) {
1315	PyErr_Format(PyExc_TypeError, &argparse[4]);
1316	goto onError;
1317    }
1318    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1319	goto onError;
1320    if (newpos<0)
1321	newpos = insize+newpos;
1322    if (newpos<0 || newpos>insize) {
1323	PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1324	goto onError;
1325    }
1326
1327    /* need more space? (at least enough for what we
1328       have+the replacement+the rest of the string (starting
1329       at the new input position), so we won't have to check space
1330       when there are no errors in the rest of the string) */
1331    repptr = PyUnicode_AS_UNICODE(repunicode);
1332    repsize = PyUnicode_GET_SIZE(repunicode);
1333    requiredsize = *outpos + repsize + insize-newpos;
1334    if (requiredsize > outsize) {
1335	if (requiredsize<2*outsize)
1336	    requiredsize = 2*outsize;
1337	if (PyUnicode_Resize(output, requiredsize) < 0)
1338	    goto onError;
1339	*outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1340    }
1341    *endinpos = newpos;
1342    *inptr = input + newpos;
1343    Py_UNICODE_COPY(*outptr, repptr, repsize);
1344    *outptr += repsize;
1345    *outpos += repsize;
1346    /* we made it! */
1347    res = 0;
1348
1349    onError:
1350    Py_XDECREF(restuple);
1351    return res;
1352}
1353
1354/* --- UTF-7 Codec -------------------------------------------------------- */
1355
1356/* see RFC2152 for details */
1357
/* Classification of the 128 ASCII code points for UTF-7: indicates
   whether a character is special, i.e. cannot be directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static
char utf7_special[128] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,   /* 0x00 - 0x0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x10 - 0x1F */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,   /* 0x20 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,   /* 0x30 - 0x3F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,   /* 0x50 - 0x5F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1    /* 0x70 - 0x7F */
};
1376
/* SPECIAL(c, encodeO, encodeWS): true when code unit c cannot be written
   directly in UTF-7, i.e. it is outside ASCII, always-special, or in an
   optional class (whitespace / Set O) that the caller asked to encode.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true  */

#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 alphabet character for the low 6 bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true when c belongs to the base64 alphabet.  The range
   test comes first because isalnum() is only defined for values
   representable as unsigned char (or EOF), and base64 characters are
   all ASCII anyway; c may be a Py_UNICODE well above that range. */
#define B64CHAR(c) \
    (((unsigned long)(c)) < 128 && \
     (isalnum((int)(c)) || (c) == '+' || (c) == '/'))

/* UB64(c): 6-bit value of base64 character c (only meaningful when
   B64CHAR(c) is true). */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least 6 bits are pending in ch, emit
   the topmost pending 6 bits as a base64 character through out and
   reduce bits accordingly.  out and bits are modified in place. */
#define ENCODE(out, ch, bits)                           \
    do {                                                \
        while ((bits) >= 6) {                           \
            *(out)++ = B64((ch) >> ((bits)-6));         \
            (bits) -= 6;                                \
        }                                               \
    } while (0)

/* DECODE(out, ch, bits, surrogate): while at least 16 bits are pending
   in ch, extract the topmost pending 16-bit unit and store it through
   out.  A unit in 0xDC00..0xDFFF is not stored: it sets surrogate,
   sets errmsg and jumps to utf7Error (both names must exist at the
   expansion site); while surrogate is set, the next unit is dropped
   silently and the flag is cleared, since an error has already been
   reported for that pair.
   NOTE(review): the range tested is the low-surrogate range although
   the original remarks speak of the high surrogate -- confirm intent. */
#define DECODE(out, ch, bits, surrogate)                                \
    do {                                                                \
        while ((bits) >= 16) {                                          \
            Py_UNICODE outCh =                                          \
                (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff);          \
            (bits) -= 16;                                               \
            if (surrogate) {                                            \
                (surrogate) = 0;                                        \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {            \
                (surrogate) = 1;                                        \
                errmsg = "code pairs are not supported";                \
                goto utf7Error;                                         \
            } else {                                                    \
                *(out)++ = outCh;                                       \
            }                                                           \
        }                                                               \
    } while (0)
1419
1420PyObject *PyUnicode_DecodeUTF7(const char *s,
1421			       Py_ssize_t size,
1422			       const char *errors)
1423{
1424    const char *starts = s;
1425    Py_ssize_t startinpos;
1426    Py_ssize_t endinpos;
1427    Py_ssize_t outpos;
1428    const char *e;
1429    PyUnicodeObject *unicode;
1430    Py_UNICODE *p;
1431    const char *errmsg = "";
1432    int inShift = 0;
1433    unsigned int bitsleft = 0;
1434    unsigned long charsleft = 0;
1435    int surrogate = 0;
1436    PyObject *errorHandler = NULL;
1437    PyObject *exc = NULL;
1438
1439    unicode = _PyUnicode_New(size);
1440    if (!unicode)
1441        return NULL;
1442    if (size == 0)
1443        return (PyObject *)unicode;
1444
1445    p = unicode->str;
1446    e = s + size;
1447
1448    while (s < e) {
1449        Py_UNICODE ch;
1450        restart:
1451        ch = *s;
1452
1453        if (inShift) {
1454            if ((ch == '-') || !B64CHAR(ch)) {
1455                inShift = 0;
1456                s++;
1457
1458                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1459                if (bitsleft >= 6) {
1460                    /* The shift sequence has a partial character in it. If
1461                       bitsleft < 6 then we could just classify it as padding
1462                       but that is not the case here */
1463
1464                    errmsg = "partial character in shift sequence";
1465                    goto utf7Error;
1466                }
1467                /* According to RFC2152 the remaining bits should be zero. We
1468                   choose to signal an error/insert a replacement character
1469                   here so indicate the potential of a misencoded character. */
1470
1471                /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1472                if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1473                    errmsg = "non-zero padding bits in shift sequence";
1474                    goto utf7Error;
1475                }
1476
1477                if (ch == '-') {
1478                    if ((s < e) && (*(s) == '-')) {
1479                        *p++ = '-';
1480                        inShift = 1;
1481                    }
1482                } else if (SPECIAL(ch,0,0)) {
1483                    errmsg = "unexpected special character";
1484	                goto utf7Error;
1485                } else  {
1486                    *p++ = ch;
1487                }
1488            } else {
1489                charsleft = (charsleft << 6) | UB64(ch);
1490                bitsleft += 6;
1491                s++;
1492                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1493            }
1494        }
1495        else if ( ch == '+' ) {
1496            startinpos = s-starts;
1497            s++;
1498            if (s < e && *s == '-') {
1499                s++;
1500                *p++ = '+';
1501            } else
1502            {
1503                inShift = 1;
1504                bitsleft = 0;
1505            }
1506        }
1507        else if (SPECIAL(ch,0,0)) {
1508            errmsg = "unexpected special character";
1509            s++;
1510	        goto utf7Error;
1511        }
1512        else {
1513            *p++ = ch;
1514            s++;
1515        }
1516        continue;
1517    utf7Error:
1518        outpos = p-PyUnicode_AS_UNICODE(unicode);
1519        endinpos = s-starts;
1520        if (unicode_decode_call_errorhandler(
1521             errors, &errorHandler,
1522             "utf7", errmsg,
1523             starts, size, &startinpos, &endinpos, &exc, &s,
1524             (PyObject **)&unicode, &outpos, &p))
1525        goto onError;
1526    }
1527
1528    if (inShift) {
1529        outpos = p-PyUnicode_AS_UNICODE(unicode);
1530        endinpos = size;
1531        if (unicode_decode_call_errorhandler(
1532             errors, &errorHandler,
1533             "utf7", "unterminated shift sequence",
1534             starts, size, &startinpos, &endinpos, &exc, &s,
1535             (PyObject **)&unicode, &outpos, &p))
1536            goto onError;
1537        if (s < e)
1538           goto restart;
1539    }
1540
1541    if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
1542        goto onError;
1543
1544    Py_XDECREF(errorHandler);
1545    Py_XDECREF(exc);
1546    return (PyObject *)unicode;
1547
1548onError:
1549    Py_XDECREF(errorHandler);
1550    Py_XDECREF(exc);
1551    Py_DECREF(unicode);
1552    return NULL;
1553}
1554
1555
1556PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1557                   Py_ssize_t size,
1558                   int encodeSetO,
1559                   int encodeWhiteSpace,
1560                   const char *errors)
1561{
1562    PyObject *v;
1563    /* It might be possible to tighten this worst case */
1564    Py_ssize_t cbAllocated = 5 * size;
1565    int inShift = 0;
1566    Py_ssize_t i = 0;
1567    unsigned int bitsleft = 0;
1568    unsigned long charsleft = 0;
1569    char * out;
1570    char * start;
1571
1572    if (size == 0)
1573	return PyBytes_FromStringAndSize(NULL, 0);
1574
1575    v = PyBytes_FromStringAndSize(NULL, cbAllocated);
1576    if (v == NULL)
1577        return NULL;
1578
1579    start = out = PyBytes_AS_STRING(v);
1580    for (;i < size; ++i) {
1581        Py_UNICODE ch = s[i];
1582
1583        if (!inShift) {
1584            if (ch == '+') {
1585                *out++ = '+';
1586                *out++ = '-';
1587            } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1588                charsleft = ch;
1589                bitsleft = 16;
1590                *out++ = '+';
1591                /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1592                inShift = bitsleft > 0;
1593            } else {
1594                *out++ = (char) ch;
1595            }
1596        } else {
1597            if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1598                *out++ = B64(charsleft << (6-bitsleft));
1599                charsleft = 0;
1600                bitsleft = 0;
1601                /* Characters not in the BASE64 set implicitly unshift the sequence
1602                   so no '-' is required, except if the character is itself a '-' */
1603                if (B64CHAR(ch) || ch == '-') {
1604                    *out++ = '-';
1605                }
1606                inShift = 0;
1607                *out++ = (char) ch;
1608            } else {
1609                bitsleft += 16;
1610                charsleft = (charsleft << 16) | ch;
1611                /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1612
1613                /* If the next character is special then we dont' need to terminate
1614                   the shift sequence. If the next character is not a BASE64 character
1615                   or '-' then the shift sequence will be terminated implicitly and we
1616                   don't have to insert a '-'. */
1617
1618                if (bitsleft == 0) {
1619                    if (i + 1 < size) {
1620                        Py_UNICODE ch2 = s[i+1];
1621
1622                        if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
1623
1624                        } else if (B64CHAR(ch2) || ch2 == '-') {
1625                            *out++ = '-';
1626                            inShift = 0;
1627                        } else {
1628                            inShift = 0;
1629                        }
1630
1631                    }
1632                    else {
1633                        *out++ = '-';
1634                        inShift = 0;
1635                    }
1636                }
1637            }
1638        }
1639    }
1640    if (bitsleft) {
1641        *out++= B64(charsleft << (6-bitsleft) );
1642        *out++ = '-';
1643    }
1644
1645    if (PyBytes_Resize(v, out - start)) {
1646        Py_DECREF(v);
1647        return NULL;
1648    }
1649    return v;
1650}
1651
1652#undef SPECIAL
1653#undef B64
1654#undef B64CHAR
1655#undef UB64
1656#undef ENCODE
1657#undef DECODE
1658
1659/* --- UTF-8 Codec -------------------------------------------------------- */
1660
/* Map UTF-8 encoded prefix byte to sequence length.  zero means
   illegal prefix.  see RFC 2279 for details */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: not valid as a prefix byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF: four, five and six byte sequences; 0xFE/0xFF illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1682
1683PyObject *PyUnicode_DecodeUTF8(const char *s,
1684			       Py_ssize_t size,
1685			       const char *errors)
1686{
1687    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1688}
1689
1690PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1691			                Py_ssize_t size,
1692			                const char *errors,
1693			                Py_ssize_t *consumed)
1694{
1695    const char *starts = s;
1696    int n;
1697    Py_ssize_t startinpos;
1698    Py_ssize_t endinpos;
1699    Py_ssize_t outpos;
1700    const char *e;
1701    PyUnicodeObject *unicode;
1702    Py_UNICODE *p;
1703    const char *errmsg = "";
1704    PyObject *errorHandler = NULL;
1705    PyObject *exc = NULL;
1706
1707    /* Note: size will always be longer than the resulting Unicode
1708       character count */
1709    unicode = _PyUnicode_New(size);
1710    if (!unicode)
1711        return NULL;
1712    if (size == 0) {
1713        if (consumed)
1714            *consumed = 0;
1715        return (PyObject *)unicode;
1716    }
1717
1718    /* Unpack UTF-8 encoded data */
1719    p = unicode->str;
1720    e = s + size;
1721
1722    while (s < e) {
1723        Py_UCS4 ch = (unsigned char)*s;
1724
1725        if (ch < 0x80) {
1726            *p++ = (Py_UNICODE)ch;
1727            s++;
1728            continue;
1729        }
1730
1731        n = utf8_code_length[ch];
1732
1733        if (s + n > e) {
1734	    if (consumed)
1735		break;
1736	    else {
1737		errmsg = "unexpected end of data";
1738		startinpos = s-starts;
1739		endinpos = size;
1740		goto utf8Error;
1741	    }
1742	}
1743
1744        switch (n) {
1745
1746        case 0:
1747            errmsg = "unexpected code byte";
1748	    startinpos = s-starts;
1749	    endinpos = startinpos+1;
1750	    goto utf8Error;
1751
1752        case 1:
1753            errmsg = "internal error";
1754	    startinpos = s-starts;
1755	    endinpos = startinpos+1;
1756	    goto utf8Error;
1757
1758        case 2:
1759            if ((s[1] & 0xc0) != 0x80) {
1760                errmsg = "invalid data";
1761		startinpos = s-starts;
1762		endinpos = startinpos+2;
1763		goto utf8Error;
1764	    }
1765            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1766            if (ch < 0x80) {
1767		startinpos = s-starts;
1768		endinpos = startinpos+2;
1769                errmsg = "illegal encoding";
1770		goto utf8Error;
1771	    }
1772	    else
1773		*p++ = (Py_UNICODE)ch;
1774            break;
1775
1776        case 3:
1777            if ((s[1] & 0xc0) != 0x80 ||
1778                (s[2] & 0xc0) != 0x80) {
1779                errmsg = "invalid data";
1780		startinpos = s-starts;
1781		endinpos = startinpos+3;
1782		goto utf8Error;
1783	    }
1784            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
1785            if (ch < 0x0800) {
1786		/* Note: UTF-8 encodings of surrogates are considered
1787		   legal UTF-8 sequences;
1788
1789		   XXX For wide builds (UCS-4) we should probably try
1790		       to recombine the surrogates into a single code
1791		       unit.
1792		*/
1793                errmsg = "illegal encoding";
1794		startinpos = s-starts;
1795		endinpos = startinpos+3;
1796		goto utf8Error;
1797	    }
1798	    else
1799		*p++ = (Py_UNICODE)ch;
1800            break;
1801
1802        case 4:
1803            if ((s[1] & 0xc0) != 0x80 ||
1804                (s[2] & 0xc0) != 0x80 ||
1805                (s[3] & 0xc0) != 0x80) {
1806                errmsg = "invalid data";
1807		startinpos = s-starts;
1808		endinpos = startinpos+4;
1809		goto utf8Error;
1810	    }
1811            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1812                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1813            /* validate and convert to UTF-16 */
1814            if ((ch < 0x10000)        /* minimum value allowed for 4
1815					 byte encoding */
1816                || (ch > 0x10ffff))   /* maximum value allowed for
1817					 UTF-16 */
1818	    {
1819                errmsg = "illegal encoding";
1820		startinpos = s-starts;
1821		endinpos = startinpos+4;
1822		goto utf8Error;
1823	    }
1824#ifdef Py_UNICODE_WIDE
1825	    *p++ = (Py_UNICODE)ch;
1826#else
1827            /*  compute and append the two surrogates: */
1828
1829            /*  translate from 10000..10FFFF to 0..FFFF */
1830            ch -= 0x10000;
1831
1832            /*  high surrogate = top 10 bits added to D800 */
1833            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1834
1835            /*  low surrogate = bottom 10 bits added to DC00 */
1836            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
1837#endif
1838            break;
1839
1840        default:
1841            /* Other sizes are only needed for UCS-4 */
1842            errmsg = "unsupported Unicode code range";
1843	    startinpos = s-starts;
1844	    endinpos = startinpos+n;
1845	    goto utf8Error;
1846        }
1847        s += n;
1848	continue;
1849
1850    utf8Error:
1851    outpos = p-PyUnicode_AS_UNICODE(unicode);
1852    if (unicode_decode_call_errorhandler(
1853	     errors, &errorHandler,
1854	     "utf8", errmsg,
1855	     starts, size, &startinpos, &endinpos, &exc, &s,
1856	     (PyObject **)&unicode, &outpos, &p))
1857	goto onError;
1858    }
1859    if (consumed)
1860	*consumed = s-starts;
1861
1862    /* Adjust length */
1863    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
1864        goto onError;
1865
1866    Py_XDECREF(errorHandler);
1867    Py_XDECREF(exc);
1868    return (PyObject *)unicode;
1869
1870onError:
1871    Py_XDECREF(errorHandler);
1872    Py_XDECREF(exc);
1873    Py_DECREF(unicode);
1874    return NULL;
1875}
1876
1877/* Allocation strategy:  if the string is short, convert into a stack buffer
1878   and allocate exactly as much space needed at the end.  Else allocate the
1879   maximum possible needed (4 result bytes per Unicode character), and return
1880   the excess memory at the end.
1881*/
1882PyObject *
1883PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1884		     Py_ssize_t size,
1885		     const char *errors)
1886{
1887#define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
1888
1889    Py_ssize_t i;           /* index into s of next input byte */
1890    PyObject *v;        /* result string object */
1891    char *p;            /* next free byte in output buffer */
1892    Py_ssize_t nallocated;  /* number of result bytes allocated */
1893    Py_ssize_t nneeded;        /* number of result bytes needed */
1894    char stackbuf[MAX_SHORT_UNICHARS * 4];
1895
1896    assert(s != NULL);
1897    assert(size >= 0);
1898
1899    if (size <= MAX_SHORT_UNICHARS) {
1900        /* Write into the stack buffer; nallocated can't overflow.
1901         * At the end, we'll allocate exactly as much heap space as it
1902         * turns out we need.
1903         */
1904        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1905        v = NULL;   /* will allocate after we're done */
1906        p = stackbuf;
1907    }
1908    else {
1909        /* Overallocate on the heap, and give the excess back at the end. */
1910        nallocated = size * 4;
1911        if (nallocated / 4 != size)  /* overflow! */
1912            return PyErr_NoMemory();
1913        v = PyBytes_FromStringAndSize(NULL, nallocated);
1914        if (v == NULL)
1915            return NULL;
1916        p = PyBytes_AS_STRING(v);
1917    }
1918
1919    for (i = 0; i < size;) {
1920        Py_UCS4 ch = s[i++];
1921
1922        if (ch < 0x80)
1923            /* Encode ASCII */
1924            *p++ = (char) ch;
1925
1926        else if (ch < 0x0800) {
1927            /* Encode Latin-1 */
1928            *p++ = (char)(0xc0 | (ch >> 6));
1929            *p++ = (char)(0x80 | (ch & 0x3f));
1930        }
1931        else {
1932            /* Encode UCS2 Unicode ordinals */
1933            if (ch < 0x10000) {
1934                /* Special case: check for high surrogate */
1935                if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1936                    Py_UCS4 ch2 = s[i];
1937                    /* Check for low surrogate and combine the two to
1938                       form a UCS4 value */
1939                    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1940                        ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
1941                        i++;
1942                        goto encodeUCS4;
1943                    }
1944                    /* Fall through: handles isolated high surrogates */
1945                }
1946                *p++ = (char)(0xe0 | (ch >> 12));
1947                *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1948                *p++ = (char)(0x80 | (ch & 0x3f));
1949                continue;
1950    	    }
1951encodeUCS4:
1952            /* Encode UCS4 Unicode ordinals */
1953            *p++ = (char)(0xf0 | (ch >> 18));
1954            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1955            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1956            *p++ = (char)(0x80 | (ch & 0x3f));
1957        }
1958    }
1959
1960    if (v == NULL) {
1961        /* This was stack allocated. */
1962        nneeded = p - stackbuf;
1963        assert(nneeded <= nallocated);
1964        v = PyBytes_FromStringAndSize(stackbuf, nneeded);
1965    }
1966    else {
1967    	/* Cut back to size actually needed. */
1968        nneeded = p - PyBytes_AS_STRING(v);
1969        assert(nneeded <= nallocated);
1970        PyBytes_Resize(v, nneeded);
1971    }
1972    return v;
1973
1974#undef MAX_SHORT_UNICHARS
1975}
1976
1977PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1978{
1979    if (!PyUnicode_Check(unicode)) {
1980        PyErr_BadArgument();
1981        return NULL;
1982    }
1983    return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1984				PyUnicode_GET_SIZE(unicode),
1985				NULL);
1986}
1987
1988/* --- UTF-16 Codec ------------------------------------------------------- */
1989
1990PyObject *
1991PyUnicode_DecodeUTF16(const char *s,
1992		      Py_ssize_t size,
1993		      const char *errors,
1994		      int *byteorder)
1995{
1996    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1997}
1998
1999PyObject *
2000PyUnicode_DecodeUTF16Stateful(const char *s,
2001			      Py_ssize_t size,
2002			      const char *errors,
2003			      int *byteorder,
2004			      Py_ssize_t *consumed)
2005{
2006    const char *starts = s;
2007    Py_ssize_t startinpos;
2008    Py_ssize_t endinpos;
2009    Py_ssize_t outpos;
2010    PyUnicodeObject *unicode;
2011    Py_UNICODE *p;
2012    const unsigned char *q, *e;
2013    int bo = 0;       /* assume native ordering by default */
2014    const char *errmsg = "";
2015    /* Offsets from q for retrieving byte pairs in the right order. */
2016#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2017    int ihi = 1, ilo = 0;
2018#else
2019    int ihi = 0, ilo = 1;
2020#endif
2021    PyObject *errorHandler = NULL;
2022    PyObject *exc = NULL;
2023
2024    /* Note: size will always be longer than the resulting Unicode
2025       character count */
2026    unicode = _PyUnicode_New(size);
2027    if (!unicode)
2028        return NULL;
2029    if (size == 0)
2030        return (PyObject *)unicode;
2031
2032    /* Unpack UTF-16 encoded data */
2033    p = unicode->str;
2034    q = (unsigned char *)s;
2035    e = q + size;
2036
2037    if (byteorder)
2038        bo = *byteorder;
2039
2040    /* Check for BOM marks (U+FEFF) in the input and adjust current
2041       byte order setting accordingly. In native mode, the leading BOM
2042       mark is skipped, in all other modes, it is copied to the output
2043       stream as-is (giving a ZWNBSP character). */
2044    if (bo == 0) {
2045        if (size >= 2) {
2046            const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
2047#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2048	    if (bom == 0xFEFF) {
2049		q += 2;
2050		bo = -1;
2051	    }
2052	    else if (bom == 0xFFFE) {
2053		q += 2;
2054		bo = 1;
2055	    }
2056#else
2057	    if (bom == 0xFEFF) {
2058		q += 2;
2059		bo = 1;
2060	    }
2061	    else if (bom == 0xFFFE) {
2062		q += 2;
2063		bo = -1;
2064	    }
2065#endif
2066	}
2067    }
2068
2069    if (bo == -1) {
2070        /* force LE */
2071        ihi = 1;
2072        ilo = 0;
2073    }
2074    else if (bo == 1) {
2075        /* force BE */
2076        ihi = 0;
2077        ilo = 1;
2078    }
2079
2080    while (q < e) {
2081	Py_UNICODE ch;
2082	/* remaining bytes at the end? (size should be even) */
2083	if (e-q<2) {
2084	    if (consumed)
2085		break;
2086	    errmsg = "truncated data";
2087	    startinpos = ((const char *)q)-starts;
2088	    endinpos = ((const char *)e)-starts;
2089	    goto utf16Error;
2090	    /* The remaining input chars are ignored if the callback
2091	       chooses to skip the input */
2092	}
2093	ch = (q[ihi] << 8) | q[ilo];
2094
2095	q += 2;
2096
2097	if (ch < 0xD800 || ch > 0xDFFF) {
2098	    *p++ = ch;
2099	    continue;
2100	}
2101
2102	/* UTF-16 code pair: */
2103	if (q >= e) {
2104	    errmsg = "unexpected end of data";
2105	    startinpos = (((const char *)q)-2)-starts;
2106	    endinpos = ((const char *)e)-starts;
2107	    goto utf16Error;
2108	}
2109	if (0xD800 <= ch && ch <= 0xDBFF) {
2110	    Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2111	    q += 2;
2112	    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2113#ifndef Py_UNICODE_WIDE
2114		*p++ = ch;
2115		*p++ = ch2;
2116#else
2117		*p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2118#endif
2119		continue;
2120	    }
2121	    else {
2122                errmsg = "illegal UTF-16 surrogate";
2123		startinpos = (((const char *)q)-4)-starts;
2124		endinpos = startinpos+2;
2125		goto utf16Error;
2126	    }
2127
2128	}
2129	errmsg = "illegal encoding";
2130	startinpos = (((const char *)q)-2)-starts;
2131	endinpos = startinpos+2;
2132	/* Fall through to report the error */
2133
2134    utf16Error:
2135	outpos = p-PyUnicode_AS_UNICODE(unicode);
2136	if (unicode_decode_call_errorhandler(
2137	         errors, &errorHandler,
2138	         "utf16", errmsg,
2139	         starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2140	         (PyObject **)&unicode, &outpos, &p))
2141	    goto onError;
2142    }
2143
2144    if (byteorder)
2145        *byteorder = bo;
2146
2147    if (consumed)
2148	*consumed = (const char *)q-starts;
2149
2150    /* Adjust length */
2151    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2152        goto onError;
2153
2154    Py_XDECREF(errorHandler);
2155    Py_XDECREF(exc);
2156    return (PyObject *)unicode;
2157
2158onError:
2159    Py_DECREF(unicode);
2160    Py_XDECREF(errorHandler);
2161    Py_XDECREF(exc);
2162    return NULL;
2163}
2164
2165PyObject *
2166PyUnicode_EncodeUTF16(const Py_UNICODE *s,
2167		      Py_ssize_t size,
2168		      const char *errors,
2169		      int byteorder)
2170{
2171    PyObject *v;
2172    unsigned char *p;
2173#ifdef Py_UNICODE_WIDE
2174    int i, pairs;
2175#else
2176    const int pairs = 0;
2177#endif
2178    /* Offsets from p for storing byte pairs in the right order. */
2179#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2180    int ihi = 1, ilo = 0;
2181#else
2182    int ihi = 0, ilo = 1;
2183#endif
2184
2185#define STORECHAR(CH)                   \
2186    do {                                \
2187        p[ihi] = ((CH) >> 8) & 0xff;    \
2188        p[ilo] = (CH) & 0xff;           \
2189        p += 2;                         \
2190    } while(0)
2191
2192#ifdef Py_UNICODE_WIDE
2193    for (i = pairs = 0; i < size; i++)
2194	if (s[i] >= 0x10000)
2195	    pairs++;
2196#endif
2197    v = PyBytes_FromStringAndSize(NULL,
2198		  2 * (size + pairs + (byteorder == 0)));
2199    if (v == NULL)
2200        return NULL;
2201
2202    p = (unsigned char *)PyBytes_AS_STRING(v);
2203    if (byteorder == 0)
2204	STORECHAR(0xFEFF);
2205    if (size == 0)
2206        return v;
2207
2208    if (byteorder == -1) {
2209        /* force LE */
2210        ihi = 1;
2211        ilo = 0;
2212    }
2213    else if (byteorder == 1) {
2214        /* force BE */
2215        ihi = 0;
2216        ilo = 1;
2217    }
2218
2219    while (size-- > 0) {
2220	Py_UNICODE ch = *s++;
2221	Py_UNICODE ch2 = 0;
2222#ifdef Py_UNICODE_WIDE
2223	if (ch >= 0x10000) {
2224	    ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2225	    ch  = 0xD800 | ((ch-0x10000) >> 10);
2226	}
2227#endif
2228        STORECHAR(ch);
2229        if (ch2)
2230            STORECHAR(ch2);
2231    }
2232    return v;
2233#undef STORECHAR
2234}
2235
2236PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2237{
2238    if (!PyUnicode_Check(unicode)) {
2239        PyErr_BadArgument();
2240        return NULL;
2241    }
2242    return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2243				 PyUnicode_GET_SIZE(unicode),
2244				 NULL,
2245				 0);
2246}
2247
2248/* --- Unicode Escape Codec ----------------------------------------------- */
2249
2250static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
2251
2252PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
2253					Py_ssize_t size,
2254					const char *errors)
2255{
2256    const char *starts = s;
2257    Py_ssize_t startinpos;
2258    Py_ssize_t endinpos;
2259    Py_ssize_t outpos;
2260    int i;
2261    PyUnicodeObject *v;
2262    Py_UNICODE *p;
2263    const char *end;
2264    char* message;
2265    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
2266    PyObject *errorHandler = NULL;
2267    PyObject *exc = NULL;
2268
2269    /* Escaped strings will always be longer than the resulting
2270       Unicode string, so we start with size here and then reduce the
2271       length after conversion to the true value.
2272       (but if the error callback returns a long replacement string
2273       we'll have to allocate more space) */
2274    v = _PyUnicode_New(size);
2275    if (v == NULL)
2276        goto onError;
2277    if (size == 0)
2278        return (PyObject *)v;
2279
2280    p = PyUnicode_AS_UNICODE(v);
2281    end = s + size;
2282
2283    while (s < end) {
2284        unsigned char c;
2285        Py_UNICODE x;
2286        int digits;
2287
2288        /* Non-escape characters are interpreted as Unicode ordinals */
2289        if (*s != '\\') {
2290            *p++ = (unsigned char) *s++;
2291            continue;
2292        }
2293
2294        startinpos = s-starts;
2295        /* \ - Escapes */
2296        s++;
2297        switch (*s++) {
2298
2299        /* \x escapes */
2300        case '\n': break;
2301        case '\\': *p++ = '\\'; break;
2302        case '\'': *p++ = '\''; break;
2303        case '\"': *p++ = '\"'; break;
2304        case 'b': *p++ = '\b'; break;
2305        case 'f': *p++ = '\014'; break; /* FF */
2306        case 't': *p++ = '\t'; break;
2307        case 'n': *p++ = '\n'; break;
2308        case 'r': *p++ = '\r'; break;
2309        case 'v': *p++ = '\013'; break; /* VT */
2310        case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2311
2312        /* \OOO (octal) escapes */
2313        case '0': case '1': case '2': case '3':
2314        case '4': case '5': case '6': case '7':
2315            x = s[-1] - '0';
2316            if ('0' <= *s && *s <= '7') {
2317                x = (x<<3) + *s++ - '0';
2318                if ('0' <= *s && *s <= '7')
2319                    x = (x<<3) + *s++ - '0';
2320            }
2321            *p++ = x;
2322            break;
2323
2324        /* hex escapes */
2325        /* \xXX */
2326        case 'x':
2327            digits = 2;
2328            message = "truncated \\xXX escape";
2329            goto hexescape;
2330
2331        /* \uXXXX */
2332        case 'u':
2333            digits = 4;
2334            message = "truncated \\uXXXX escape";
2335            goto hexescape;
2336
2337        /* \UXXXXXXXX */
2338        case 'U':
2339            digits = 8;
2340            message = "truncated \\UXXXXXXXX escape";
2341        hexescape:
2342            chr = 0;
2343            outpos = p-PyUnicode_AS_UNICODE(v);
2344            if (s+digits>end) {
2345                endinpos = size;
2346                if (unicode_decode_call_errorhandler(
2347                    errors, &errorHandler,
2348                    "unicodeescape", "end of string in escape sequence",
2349                    starts, size, &startinpos, &endinpos, &exc, &s,
2350                    (PyObject **)&v, &outpos, &p))
2351                    goto onError;
2352                goto nextByte;
2353            }
2354            for (i = 0; i < digits; ++i) {
2355                c = (unsigned char) s[i];
2356                if (!isxdigit(c)) {
2357                    endinpos = (s+i+1)-starts;
2358                    if (unicode_decode_call_errorhandler(
2359                        errors, &errorHandler,
2360                        "unicodeescape", message,
2361                        starts, size, &startinpos, &endinpos, &exc, &s,
2362                        (PyObject **)&v, &outpos, &p))
2363                        goto onError;
2364                    goto nextByte;
2365                }
2366                chr = (chr<<4) & ~0xF;
2367                if (c >= '0' && c <= '9')
2368                    chr += c - '0';
2369                else if (c >= 'a' && c <= 'f')
2370                    chr += 10 + c - 'a';
2371                else
2372                    chr += 10 + c - 'A';
2373            }
2374            s += i;
2375            if (chr == 0xffffffff && PyErr_Occurred())
2376                /* _decoding_error will have already written into the
2377                   target buffer. */
2378                break;
2379        store:
2380            /* when we get here, chr is a 32-bit unicode character */
2381            if (chr <= 0xffff)
2382                /* UCS-2 character */
2383                *p++ = (Py_UNICODE) chr;
2384            else if (chr <= 0x10ffff) {
2385                /* UCS-4 character. Either store directly, or as
2386                   surrogate pair. */
2387#ifdef Py_UNICODE_WIDE
2388                *p++ = chr;
2389#else
2390                chr -= 0x10000L;
2391                *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
2392                *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
2393#endif
2394            } else {
2395                endinpos = s-starts;
2396                outpos = p-PyUnicode_AS_UNICODE(v);
2397                if (unicode_decode_call_errorhandler(
2398                    errors, &errorHandler,
2399                    "unicodeescape", "illegal Unicode character",
2400                    starts, size, &startinpos, &endinpos, &exc, &s,
2401                    (PyObject **)&v, &outpos, &p))
2402                    goto onError;
2403            }
2404            break;
2405
2406        /* \N{name} */
2407        case 'N':
2408            message = "malformed \\N character escape";
2409            if (ucnhash_CAPI == NULL) {
2410                /* load the unicode data module */
2411                PyObject *m, *api;
2412                m = PyImport_ImportModule("unicodedata");
2413                if (m == NULL)
2414                    goto ucnhashError;
2415                api = PyObject_GetAttrString(m, "ucnhash_CAPI");
2416                Py_DECREF(m);
2417                if (api == NULL)
2418                    goto ucnhashError;
2419                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
2420                Py_DECREF(api);
2421                if (ucnhash_CAPI == NULL)
2422                    goto ucnhashError;
2423            }
2424            if (*s == '{') {
2425                const char *start = s+1;
2426                /* look for the closing brace */
2427                while (*s != '}' && s < end)
2428                    s++;
2429                if (s > start && s < end && *s == '}') {
2430                    /* found a name.  look it up in the unicode database */
2431                    message = "unknown Unicode character name";
2432                    s++;
2433                    if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
2434                        goto store;
2435                }
2436            }
2437            endinpos = s-starts;
2438            outpos = p-PyUnicode_AS_UNICODE(v);
2439            if (unicode_decode_call_errorhandler(
2440                errors, &errorHandler,
2441                "unicodeescape", message,
2442                starts, size, &startinpos, &endinpos, &exc, &s,
2443                (PyObject **)&v, &outpos, &p))
2444                goto onError;
2445            break;
2446
2447        default:
2448            if (s > end) {
2449                message = "\\ at end of string";
2450                s--;
2451                endinpos = s-starts;
2452                outpos = p-PyUnicode_AS_UNICODE(v);
2453                if (unicode_decode_call_errorhandler(
2454                    errors, &errorHandler,
2455                    "unicodeescape", message,
2456                    starts, size, &startinpos, &endinpos, &exc, &s,
2457                    (PyObject **)&v, &outpos, &p))
2458                    goto onError;
2459            }
2460            else {
2461                *p++ = '\\';
2462                *p++ = (unsigned char)s[-1];
2463            }
2464            break;
2465        }
2466        nextByte:
2467        ;
2468    }
2469    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2470        goto onError;
2471    Py_XDECREF(errorHandler);
2472    Py_XDECREF(exc);
2473    return (PyObject *)v;
2474
2475ucnhashError:
2476    PyErr_SetString(
2477        PyExc_UnicodeError,
2478        "\\N escapes not supported (can't load unicodedata module)"
2479        );
2480    Py_XDECREF(v);
2481    Py_XDECREF(errorHandler);
2482    Py_XDECREF(exc);
2483    return NULL;
2484
2485onError:
2486    Py_XDECREF(v);
2487    Py_XDECREF(errorHandler);
2488    Py_XDECREF(exc);
2489    return NULL;
2490}
2491
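/* Return a Unicode-Escape string version of the Unicode object.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

   NOTE: the encoder below, PyUnicode_EncodeUnicodeEscape(), takes no
   `quotes' argument, so the sentence above appears to describe an
   earlier interface.

*/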
2498
2499Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2500                                      Py_ssize_t size,
2501                                      Py_UNICODE ch)
2502{
2503    /* like wcschr, but doesn't stop at NULL characters */
2504
2505    while (size-- > 0) {
2506        if (*s == ch)
2507            return s;
2508        s++;
2509    }
2510
2511    return NULL;
2512}
2513
2514static const char *hexdigits = "0123456789abcdef";
2515
2516PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2517					Py_ssize_t size)
2518{
2519    PyObject *repr;
2520    char *p;
2521
2522    /* XXX(nnorwitz): rather than over-allocating, it would be
2523       better to choose a different scheme.  Perhaps scan the
2524       first N-chars of the string and allocate based on that size.
2525    */
2526    /* Initial allocation is based on the longest-possible unichr
2527       escape.
2528
2529       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2530       unichr, so in this case it's the longest unichr escape. In
2531       narrow (UTF-16) builds this is five chars per source unichr
2532       since there are two unichrs in the surrogate pair, so in narrow
2533       (UTF-16) builds it's not the longest unichr escape.
2534
2535       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2536       so in the narrow (UTF-16) build case it's the longest unichr
2537       escape.
2538    */
2539
2540    repr = PyBytes_FromStringAndSize(NULL,
2541#ifdef Py_UNICODE_WIDE
2542        + 10*size
2543#else
2544        + 6*size
2545#endif
2546        + 1);
2547    if (repr == NULL)
2548        return NULL;
2549
2550    p = PyBytes_AS_STRING(repr);
2551
2552    while (size-- > 0) {
2553        Py_UNICODE ch = *s++;
2554
2555        /* Escape backslashes */
2556        if (ch == '\\') {
2557            *p++ = '\\';
2558            *p++ = (char) ch;
2559            continue;
2560        }
2561
2562#ifdef Py_UNICODE_WIDE
2563        /* Map 21-bit characters to '\U00xxxxxx' */
2564        else if (ch >= 0x10000) {
2565            *p++ = '\\';
2566            *p++ = 'U';
2567            *p++ = hexdigits[(ch >> 28) & 0x0000000F];
2568            *p++ = hexdigits[(ch >> 24) & 0x0000000F];
2569            *p++ = hexdigits[(ch >> 20) & 0x0000000F];
2570            *p++ = hexdigits[(ch >> 16) & 0x0000000F];
2571            *p++ = hexdigits[(ch >> 12) & 0x0000000F];
2572            *p++ = hexdigits[(ch >> 8) & 0x0000000F];
2573            *p++ = hexdigits[(ch >> 4) & 0x0000000F];
2574            *p++ = hexdigits[ch & 0x0000000F];
2575	    continue;
2576        }
2577#else
2578	/* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
2579	else if (ch >= 0xD800 && ch < 0xDC00) {
2580	    Py_UNICODE ch2;
2581	    Py_UCS4 ucs;
2582
2583	    ch2 = *s++;
2584	    size--;
2585	    if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2586		ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2587		*p++ = '\\';
2588		*p++ = 'U';
2589		*p++ = hexdigits[(ucs >> 28) & 0x0000000F];
2590		*p++ = hexdigits[(ucs >> 24) & 0x0000000F];
2591		*p++ = hexdigits[(ucs >> 20) & 0x0000000F];
2592		*p++ = hexdigits[(ucs >> 16) & 0x0000000F];
2593		*p++ = hexdigits[(ucs >> 12) & 0x0000000F];
2594		*p++ = hexdigits[(ucs >> 8) & 0x0000000F];
2595		*p++ = hexdigits[(ucs >> 4) & 0x0000000F];
2596		*p++ = hexdigits[ucs & 0x0000000F];
2597		continue;
2598	    }
2599	    /* Fall through: isolated surrogates are copied as-is */
2600	    s--;
2601	    size++;
2602	}
2603#endif
2604
2605        /* Map 16-bit characters to '\uxxxx' */
2606        if (ch >= 256) {
2607            *p++ = '\\';
2608            *p++ = 'u';
2609            *p++ = hexdigits[(ch >> 12) & 0x000F];
2610            *p++ = hexdigits[(ch >> 8) & 0x000F];
2611            *p++ = hexdigits[(ch >> 4) & 0x000F];
2612            *p++ = hexdigits[ch & 0x000F];
2613        }
2614
2615        /* Map special whitespace to '\t', \n', '\r' */
2616        else if (ch == '\t') {
2617            *p++ = '\\';
2618            *p++ = 't';
2619        }
2620        else if (ch == '\n') {
2621            *p++ = '\\';
2622            *p++ = 'n';
2623        }
2624        else if (ch == '\r') {
2625            *p++ = '\\';
2626            *p++ = 'r';
2627        }
2628
2629        /* Map non-printable US ASCII to '\xhh' */
2630        else if (ch < ' ' || ch >= 0x7F) {
2631            *p++ = '\\';
2632            *p++ = 'x';
2633            *p++ = hexdigits[(ch >> 4) & 0x000F];
2634            *p++ = hexdigits[ch & 0x000F];
2635        }
2636
2637        /* Copy everything else as-is */
2638        else
2639            *p++ = (char) ch;
2640    }
2641
2642    *p = '\0';
2643    if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
2644        Py_DECREF(repr);
2645        return NULL;
2646    }
2647    return repr;
2648}
2649
2650PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2651{
2652    PyObject *s, *result;
2653    if (!PyUnicode_Check(unicode)) {
2654        PyErr_BadArgument();
2655        return NULL;
2656    }
2657    s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2658                                      PyUnicode_GET_SIZE(unicode));
2659
2660    if (!s)
2661        return NULL;
2662    result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2663                                        PyBytes_GET_SIZE(s));
2664    Py_DECREF(s);
2665    return result;
2666}
2667
2668/* --- Raw Unicode Escape Codec ------------------------------------------- */
2669
2670PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2671					   Py_ssize_t size,
2672					   const char *errors)
2673{
2674    const char *starts = s;
2675    Py_ssize_t startinpos;
2676    Py_ssize_t endinpos;
2677    Py_ssize_t outpos;
2678    PyUnicodeObject *v;
2679    Py_UNICODE *p;
2680    const char *end;
2681    const char *bs;
2682    PyObject *errorHandler = NULL;
2683    PyObject *exc = NULL;
2684
2685    /* Escaped strings will always be longer than the resulting
2686       Unicode string, so we start with size here and then reduce the
2687       length after conversion to the true value. (But decoding error
2688       handler might have to resize the string) */
2689    v = _PyUnicode_New(size);
2690    if (v == NULL)
2691	goto onError;
2692    if (size == 0)
2693	return (PyObject *)v;
2694    p = PyUnicode_AS_UNICODE(v);
2695    end = s + size;
2696    while (s < end) {
2697	unsigned char c;
2698	Py_UCS4 x;
2699	int i;
2700        int count;
2701
2702	/* Non-escape characters are interpreted as Unicode ordinals */
2703	if (*s != '\\') {
2704	    *p++ = (unsigned char)*s++;
2705	    continue;
2706	}
2707	startinpos = s-starts;
2708
2709	/* \u-escapes are only interpreted iff the number of leading
2710	   backslashes if odd */
2711	bs = s;
2712	for (;s < end;) {
2713	    if (*s != '\\')
2714		break;
2715	    *p++ = (unsigned char)*s++;
2716	}
2717	if (((s - bs) & 1) == 0 ||
2718	    s >= end ||
2719	    (*s != 'u' && *s != 'U')) {
2720	    continue;
2721	}
2722	p--;
2723        count = *s=='u' ? 4 : 8;
2724	s++;
2725
2726	/* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
2727	outpos = p-PyUnicode_AS_UNICODE(v);
2728	for (x = 0, i = 0; i < count; ++i, ++s) {
2729	    c = (unsigned char)*s;
2730	    if (!isxdigit(c)) {
2731		endinpos = s-starts;
2732		if (unicode_decode_call_errorhandler(
2733		    errors, &errorHandler,
2734		    "rawunicodeescape", "truncated \\uXXXX",
2735		    starts, size, &startinpos, &endinpos, &exc, &s,
2736		    (PyObject **)&v, &outpos, &p))
2737		    goto onError;
2738		goto nextByte;
2739	    }
2740	    x = (x<<4) & ~0xF;
2741	    if (c >= '0' && c <= '9')
2742		x += c - '0';
2743	    else if (c >= 'a' && c <= 'f')
2744		x += 10 + c - 'a';
2745	    else
2746		x += 10 + c - 'A';
2747	}
2748#ifndef Py_UNICODE_WIDE
2749        if (x > 0x10000) {
2750            if (unicode_decode_call_errorhandler(
2751                    errors, &errorHandler,
2752                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
2753		    starts, size, &startinpos, &endinpos, &exc, &s,
2754		    (PyObject **)&v, &outpos, &p))
2755		    goto onError;
2756        }
2757#endif
2758	*p++ = x;
2759	nextByte:
2760	;
2761    }
2762    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2763	goto onError;
2764    Py_XDECREF(errorHandler);
2765    Py_XDECREF(exc);
2766    return (PyObject *)v;
2767
2768 onError:
2769    Py_XDECREF(v);
2770    Py_XDECREF(errorHandler);
2771    Py_XDECREF(exc);
2772    return NULL;
2773}
2774
2775PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2776					   Py_ssize_t size)
2777{
2778    PyObject *repr;
2779    char *p;
2780    char *q;
2781
2782#ifdef Py_UNICODE_WIDE
2783    repr = PyBytes_FromStringAndSize(NULL, 10 * size);
2784#else
2785    repr = PyBytes_FromStringAndSize(NULL, 6 * size);
2786#endif
2787    if (repr == NULL)
2788        return NULL;
2789    if (size == 0)
2790	return repr;
2791
2792    p = q = PyBytes_AS_STRING(repr);
2793    while (size-- > 0) {
2794        Py_UNICODE ch = *s++;
2795#ifdef Py_UNICODE_WIDE
2796	/* Map 32-bit characters to '\Uxxxxxxxx' */
2797	if (ch >= 0x10000) {
2798            *p++ = '\\';
2799            *p++ = 'U';
2800            *p++ = hexdigits[(ch >> 28) & 0xf];
2801            *p++ = hexdigits[(ch >> 24) & 0xf];
2802            *p++ = hexdigits[(ch >> 20) & 0xf];
2803            *p++ = hexdigits[(ch >> 16) & 0xf];
2804            *p++ = hexdigits[(ch >> 12) & 0xf];
2805            *p++ = hexdigits[(ch >> 8) & 0xf];
2806            *p++ = hexdigits[(ch >> 4) & 0xf];
2807            *p++ = hexdigits[ch & 15];
2808        }
2809        else
2810#endif
2811	/* Map 16-bit characters to '\uxxxx' */
2812	if (ch >= 256) {
2813            *p++ = '\\';
2814            *p++ = 'u';
2815            *p++ = hexdigits[(ch >> 12) & 0xf];
2816            *p++ = hexdigits[(ch >> 8) & 0xf];
2817            *p++ = hexdigits[(ch >> 4) & 0xf];
2818            *p++ = hexdigits[ch & 15];
2819        }
2820	/* Copy everything else as-is */
2821	else
2822            *p++ = (char) ch;
2823    }
2824    *p = '\0';
2825    if (PyBytes_Resize(repr, p - q)) {
2826        Py_DECREF(repr);
2827        return NULL;
2828    }
2829    return repr;
2830}
2831
2832PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2833{
2834    PyObject *s, *result;
2835    if (!PyUnicode_Check(unicode)) {
2836        PyErr_BadArgument();
2837        return NULL;
2838    }
2839    s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2840                                         PyUnicode_GET_SIZE(unicode));
2841
2842    if (!s)
2843        return NULL;
2844    result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
2845                                        PyBytes_GET_SIZE(s));
2846    Py_DECREF(s);
2847    return result;
2848}
2849
2850/* --- Unicode Internal Codec ------------------------------------------- */
2851
2852PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
2853					   Py_ssize_t size,
2854					   const char *errors)
2855{
2856    const char *starts = s;
2857    Py_ssize_t startinpos;
2858    Py_ssize_t endinpos;
2859    Py_ssize_t outpos;
2860    PyUnicodeObject *v;
2861    Py_UNICODE *p;
2862    const char *end;
2863    const char *reason;
2864    PyObject *errorHandler = NULL;
2865    PyObject *exc = NULL;
2866
2867#ifdef Py_UNICODE_WIDE
2868    Py_UNICODE unimax = PyUnicode_GetMax();
2869#endif
2870
2871    /* XXX overflow detection missing */
2872    v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2873    if (v == NULL)
2874	goto onError;
2875    if (PyUnicode_GetSize((PyObject *)v) == 0)
2876	return (PyObject *)v;
2877    p = PyUnicode_AS_UNICODE(v);
2878    end = s + size;
2879
2880    while (s < end) {
2881        memcpy(p, s, sizeof(Py_UNICODE));
2882        /* We have to sanity check the raw data, otherwise doom looms for
2883           some malformed UCS-4 data. */
2884        if (
2885            #ifdef Py_UNICODE_WIDE
2886            *p > unimax || *p < 0 ||
2887            #endif
2888            end-s < Py_UNICODE_SIZE
2889            )
2890            {
2891            startinpos = s - starts;
2892            if (end-s < Py_UNICODE_SIZE) {
2893                endinpos = end-starts;
2894                reason = "truncated input";
2895            }
2896            else {
2897                endinpos = s - starts + Py_UNICODE_SIZE;
2898                reason = "illegal code point (> 0x10FFFF)";
2899            }
2900            outpos = p - PyUnicode_AS_UNICODE(v);
2901            if (unicode_decode_call_errorhandler(
2902                    errors, &errorHandler,
2903                    "unicode_internal", reason,
2904                    starts, size, &startinpos, &endinpos, &exc, &s,
2905                    (PyObject **)&v, &outpos, &p)) {
2906                goto onError;
2907            }
2908        }
2909        else {
2910            p++;
2911            s += Py_UNICODE_SIZE;
2912        }
2913    }
2914
2915    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2916        goto onError;
2917    Py_XDECREF(errorHandler);
2918    Py_XDECREF(exc);
2919    return (PyObject *)v;
2920
2921 onError:
2922    Py_XDECREF(v);
2923    Py_XDECREF(errorHandler);
2924    Py_XDECREF(exc);
2925    return NULL;
2926}
2927
2928/* --- Latin-1 Codec ------------------------------------------------------ */
2929
2930PyObject *PyUnicode_DecodeLatin1(const char *s,
2931				 Py_ssize_t size,
2932				 const char *errors)
2933{
2934    PyUnicodeObject *v;
2935    Py_UNICODE *p;
2936
2937    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
2938    if (size == 1) {
2939	Py_UNICODE r = *(unsigned char*)s;
2940	return PyUnicode_FromUnicode(&r, 1);
2941    }
2942
2943    v = _PyUnicode_New(size);
2944    if (v == NULL)
2945	goto onError;
2946    if (size == 0)
2947	return (PyObject *)v;
2948    p = PyUnicode_AS_UNICODE(v);
2949    while (size-- > 0)
2950	*p++ = (unsigned char)*s++;
2951    return (PyObject *)v;
2952
2953 onError:
2954    Py_XDECREF(v);
2955    return NULL;
2956}
2957
2958/* create or adjust a UnicodeEncodeError */
2959static void make_encode_exception(PyObject **exceptionObject,
2960    const char *encoding,
2961    const Py_UNICODE *unicode, Py_ssize_t size,
2962    Py_ssize_t startpos, Py_ssize_t endpos,
2963    const char *reason)
2964{
2965    if (*exceptionObject == NULL) {
2966	*exceptionObject = PyUnicodeEncodeError_Create(
2967	    encoding, unicode, size, startpos, endpos, reason);
2968    }
2969    else {
2970	if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2971	    goto onError;
2972	if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2973	    goto onError;
2974	if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2975	    goto onError;
2976	return;
2977	onError:
2978	Py_DECREF(*exceptionObject);
2979	*exceptionObject = NULL;
2980    }
2981}
2982
2983/* raises a UnicodeEncodeError */
2984static void raise_encode_exception(PyObject **exceptionObject,
2985    const char *encoding,
2986    const Py_UNICODE *unicode, Py_ssize_t size,
2987    Py_ssize_t startpos, Py_ssize_t endpos,
2988    const char *reason)
2989{
2990    make_encode_exception(exceptionObject,
2991	encoding, unicode, size, startpos, endpos, reason);
2992    if (*exceptionObject != NULL)
2993	PyCodec_StrictErrors(*exceptionObject);
2994}
2995
2996/* error handling callback helper:
2997   build arguments, call the callback and check the arguments,
2998   put the result into newpos and return the replacement string, which
2999   has to be freed by the caller */
3000static PyObject *unicode_encode_call_errorhandler(const char *errors,
3001    PyObject **errorHandler,
3002    const char *encoding, const char *reason,
3003    const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3004    Py_ssize_t startpos, Py_ssize_t endpos,
3005    Py_ssize_t *newpos)
3006{
3007    static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
3008
3009    PyObject *restuple;
3010    PyObject *resunicode;
3011
3012    if (*errorHandler == NULL) {
3013	*errorHandler = PyCodec_LookupError(errors);
3014        if (*errorHandler == NULL)
3015	    return NULL;
3016    }
3017
3018    make_encode_exception(exceptionObject,
3019	encoding, unicode, size, startpos, endpos, reason);
3020    if (*exceptionObject == NULL)
3021	return NULL;
3022
3023    restuple = PyObject_CallFunctionObjArgs(
3024	*errorHandler, *exceptionObject, NULL);
3025    if (restuple == NULL)
3026	return NULL;
3027    if (!PyTuple_Check(restuple)) {
3028	PyErr_Format(PyExc_TypeError, &argparse[4]);
3029	Py_DECREF(restuple);
3030	return NULL;
3031    }
3032    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3033	&resunicode, newpos)) {
3034	Py_DECREF(restuple);
3035	return NULL;
3036    }
3037    if (*newpos<0)
3038	*newpos = size+*newpos;
3039    if (*newpos<0 || *newpos>size) {
3040	PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3041	Py_DECREF(restuple);
3042	return NULL;
3043    }
3044    Py_INCREF(resunicode);
3045    Py_DECREF(restuple);
3046    return resunicode;
3047}
3048
3049static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
3050				 Py_ssize_t size,
3051				 const char *errors,
3052				 int limit)
3053{
3054    /* output object */
3055    PyObject *res;
3056    /* pointers to the beginning and end+1 of input */
3057    const Py_UNICODE *startp = p;
3058    const Py_UNICODE *endp = p + size;
3059    /* pointer to the beginning of the unencodable characters */
3060    /* const Py_UNICODE *badp = NULL; */
3061    /* pointer into the output */
3062    char *str;
3063    /* current output position */
3064    Py_ssize_t respos = 0;
3065    Py_ssize_t ressize;
3066    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3067    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3068    PyObject *errorHandler = NULL;
3069    PyObject *exc = NULL;
3070    /* the following variable is used for caching string comparisons
3071     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3072    int known_errorHandler = -1;
3073
3074    /* allocate enough for a simple encoding without
3075       replacements, if we need more, we'll resize */
3076    res = PyBytes_FromStringAndSize(NULL, size);
3077    if (res == NULL)
3078        goto onError;
3079    if (size == 0)
3080	return res;
3081    str = PyBytes_AS_STRING(res);
3082    ressize = size;
3083
3084    while (p<endp) {
3085	Py_UNICODE c = *p;
3086
3087	/* can we encode this? */
3088	if (c<limit) {
3089	    /* no overflow check, because we know that the space is enough */
3090	    *str++ = (char)c;
3091	    ++p;
3092	}
3093	else {
3094	    Py_ssize_t unicodepos = p-startp;
3095	    Py_ssize_t requiredsize;
3096	    PyObject *repunicode;
3097	    Py_ssize_t repsize;
3098	    Py_ssize_t newpos;
3099	    Py_ssize_t respos;
3100	    Py_UNICODE *uni2;
3101	    /* startpos for collecting unencodable chars */
3102	    const Py_UNICODE *collstart = p;
3103	    const Py_UNICODE *collend = p;
3104	    /* find all unecodable characters */
3105	    while ((collend < endp) && ((*collend)>=limit))
3106		++collend;
3107	    /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3108	    if (known_errorHandler==-1) {
3109		if ((errors==NULL) || (!strcmp(errors, "strict")))
3110		    known_errorHandler = 1;
3111		else if (!strcmp(errors, "replace"))
3112		    known_errorHandler = 2;
3113		else if (!strcmp(errors, "ignore"))
3114		    known_errorHandler = 3;
3115		else if (!strcmp(errors, "xmlcharrefreplace"))
3116		    known_errorHandler = 4;
3117		else
3118		    known_errorHandler = 0;
3119	    }
3120	    switch (known_errorHandler) {
3121		case 1: /* strict */
3122		    raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3123		    goto onError;
3124		case 2: /* replace */
3125		    while (collstart++<collend)
3126			*str++ = '?'; /* fall through */
3127		case 3: /* ignore */
3128		    p = collend;
3129		    break;
3130		case 4: /* xmlcharrefreplace */
3131		    respos = str - PyBytes_AS_STRING(res);
3132		    /* determine replacement size (temporarily (mis)uses p) */
3133		    for (p = collstart, repsize = 0; p < collend; ++p) {
3134			if (*p<10)
3135			    repsize += 2+1+1;
3136			else if (*p<100)
3137			    repsize += 2+2+1;
3138			else if (*p<1000)
3139			    repsize += 2+3+1;
3140			else if (*p<10000)
3141			    repsize += 2+4+1;
3142#ifndef Py_UNICODE_WIDE
3143			else
3144			    repsize += 2+5+1;
3145#else
3146			else if (*p<100000)
3147			    repsize += 2+5+1;
3148			else if (*p<1000000)
3149			    repsize += 2+6+1;
3150			else
3151			    repsize += 2+7+1;
3152#endif
3153		    }
3154		    requiredsize = respos+repsize+(endp-collend);
3155		    if (requiredsize > ressize) {
3156			if (requiredsize<2*ressize)
3157			    requiredsize = 2*ressize;
3158			if (PyBytes_Resize(res, requiredsize))
3159			    goto onError;
3160			str = PyBytes_AS_STRING(res) + respos;
3161			ressize = requiredsize;
3162		    }
3163		    /* generate replacement (temporarily (mis)uses p) */
3164		    for (p = collstart; p < collend; ++p) {
3165			str += sprintf(str, "&#%d;", (int)*p);
3166		    }
3167		    p = collend;
3168		    break;
3169		default:
3170		    repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3171			encoding, reason, startp, size, &exc,
3172			collstart-startp, collend-startp, &newpos);
3173		    if (repunicode == NULL)
3174			goto onError;
3175		    /* need more space? (at least enough for what we
3176		       have+the replacement+the rest of the string, so
3177		       we won't have to check space for encodable characters) */
3178		    respos = str - PyBytes_AS_STRING(res);
3179		    repsize = PyUnicode_GET_SIZE(repunicode);
3180		    requiredsize = respos+repsize+(endp-collend);
3181		    if (requiredsize > ressize) {
3182			if (requiredsize<2*ressize)
3183			    requiredsize = 2*ressize;
3184			if (PyBytes_Resize(res, requiredsize)) {
3185			    Py_DECREF(repunicode);
3186			    goto onError;
3187			}
3188			str = PyBytes_AS_STRING(res) + respos;
3189			ressize = requiredsize;
3190		    }
3191		    /* check if there is anything unencodable in the replacement
3192		       and copy it to the output */
3193		    for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3194			c = *uni2;
3195			if (c >= limit) {
3196			    raise_encode_exception(&exc, encoding, startp, size,
3197				unicodepos, unicodepos+1, reason);
3198			    Py_DECREF(repunicode);
3199			    goto onError;
3200			}
3201			*str = (char)c;
3202		    }
3203		    p = startp + newpos;
3204		    Py_DECREF(repunicode);
3205	    }
3206	}
3207    }
3208    /* Resize if we allocated to much */
3209    respos = str - PyBytes_AS_STRING(res);
3210    if (respos<ressize)
3211       /* If this falls res will be NULL */
3212	PyBytes_Resize(res, respos);
3213    Py_XDECREF(errorHandler);
3214    Py_XDECREF(exc);
3215    return res;
3216
3217    onError:
3218    Py_XDECREF(res);
3219    Py_XDECREF(errorHandler);
3220    Py_XDECREF(exc);
3221    return NULL;
3222}
3223
3224PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
3225				 Py_ssize_t size,
3226				 const char *errors)
3227{
3228    return unicode_encode_ucs1(p, size, errors, 256);
3229}
3230
3231PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3232{
3233    if (!PyUnicode_Check(unicode)) {
3234	PyErr_BadArgument();
3235	return NULL;
3236    }
3237    return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3238				  PyUnicode_GET_SIZE(unicode),
3239				  NULL);
3240}
3241
3242/* --- 7-bit ASCII Codec -------------------------------------------------- */
3243
3244PyObject *PyUnicode_DecodeASCII(const char *s,
3245				Py_ssize_t size,
3246				const char *errors)
3247{
3248    const char *starts = s;
3249    PyUnicodeObject *v;
3250    Py_UNICODE *p;
3251    Py_ssize_t startinpos;
3252    Py_ssize_t endinpos;
3253    Py_ssize_t outpos;
3254    const char *e;
3255    PyObject *errorHandler = NULL;
3256    PyObject *exc = NULL;
3257
3258    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3259    if (size == 1 && *(unsigned char*)s < 128) {
3260	Py_UNICODE r = *(unsigned char*)s;
3261	return PyUnicode_FromUnicode(&r, 1);
3262    }
3263
3264    v = _PyUnicode_New(size);
3265    if (v == NULL)
3266	goto onError;
3267    if (size == 0)
3268	return (PyObject *)v;
3269    p = PyUnicode_AS_UNICODE(v);
3270    e = s + size;
3271    while (s < e) {
3272	register unsigned char c = (unsigned char)*s;
3273	if (c < 128) {
3274	    *p++ = c;
3275	    ++s;
3276	}
3277	else {
3278	    startinpos = s-starts;
3279	    endinpos = startinpos + 1;
3280	    outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3281	    if (unicode_decode_call_errorhandler(
3282		 errors, &errorHandler,
3283		 "ascii", "ordinal not in range(128)",
3284		 starts, size, &startinpos, &endinpos, &exc, &s,
3285		 (PyObject **)&v, &outpos, &p))
3286		goto onError;
3287	}
3288    }
3289    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
3290	if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3291	    goto onError;
3292    Py_XDECREF(errorHandler);
3293    Py_XDECREF(exc);
3294    return (PyObject *)v;
3295
3296 onError:
3297    Py_XDECREF(v);
3298    Py_XDECREF(errorHandler);
3299    Py_XDECREF(exc);
3300    return NULL;
3301}
3302
3303PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
3304				Py_ssize_t size,
3305				const char *errors)
3306{
3307    return unicode_encode_ucs1(p, size, errors, 128);
3308}
3309
3310PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3311{
3312    if (!PyUnicode_Check(unicode)) {
3313	PyErr_BadArgument();
3314	return NULL;
3315    }
3316    return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3317				 PyUnicode_GET_SIZE(unicode),
3318				 NULL);
3319}
3320
3321#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3322
3323/* --- MBCS codecs for Windows -------------------------------------------- */
3324
3325#if SIZEOF_INT < SIZEOF_SSIZE_T
3326#define NEED_RETRY
3327#endif
3328
3329/* XXX This code is limited to "true" double-byte encodings, as
3330   a) it assumes an incomplete character consists of a single byte, and
3331   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3332      encodings, see IsDBCSLeadByteEx documentation. */
3333
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.

   The byte has to satisfy IsDBCSLeadByte().  In addition the character
   that CharPrev() finds before it must either not exist (CharPrev
   returns the same pointer), not start with a lead byte itself, or be
   exactly two bytes long. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return (curr - prev == 2);
}
3344
3345/*
3346 * Decode MBCS string into unicode object. If 'final' is set, converts
3347 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3348 */
3349static int decode_mbcs(PyUnicodeObject **v,
3350			const char *s, /* MBCS string */
3351			int size, /* sizeof MBCS string */
3352			int final)
3353{
3354    Py_UNICODE *p;
3355    Py_ssize_t n = 0;
3356    int usize = 0;
3357
3358    assert(size >= 0);
3359
3360    /* Skip trailing lead-byte unless 'final' is set */
3361    if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3362	--size;
3363
3364    /* First get the size of the result */
3365    if (size > 0) {
3366	usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3367	if (usize == 0) {
3368	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
3369	    return -1;
3370	}
3371    }
3372
3373    if (*v == NULL) {
3374	/* Create unicode object */
3375	*v = _PyUnicode_New(usize);
3376	if (*v == NULL)
3377	    return -1;
3378    }
3379    else {
3380	/* Extend unicode object */
3381	n = PyUnicode_GET_SIZE(*v);
3382	if (_PyUnicode_Resize(v, n + usize) < 0)
3383	    return -1;
3384    }
3385
3386    /* Do the conversion */
3387    if (size > 0) {
3388	p = PyUnicode_AS_UNICODE(*v) + n;
3389	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3390	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
3391	    return -1;
3392	}
3393    }
3394
3395    return size;
3396}
3397
3398PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3399					Py_ssize_t size,
3400					const char *errors,
3401					Py_ssize_t *consumed)
3402{
3403    PyUnicodeObject *v = NULL;
3404    int done;
3405
3406    if (consumed)
3407	*consumed = 0;
3408
3409#ifdef NEED_RETRY
3410  retry:
3411    if (size > INT_MAX)
3412	done = decode_mbcs(&v, s, INT_MAX, 0);
3413    else
3414#endif
3415	done = decode_mbcs(&v, s, (int)size, !consumed);
3416
3417    if (done < 0) {
3418        Py_XDECREF(v);
3419	return NULL;
3420    }
3421
3422    if (consumed)
3423	*consumed += done;
3424
3425#ifdef NEED_RETRY
3426    if (size > INT_MAX) {
3427	s += done;
3428	size -= done;
3429	goto retry;
3430    }
3431#endif
3432
3433    return (PyObject *)v;
3434}
3435
3436PyObject *PyUnicode_DecodeMBCS(const char *s,
3437				Py_ssize_t size,
3438				const char *errors)
3439{
3440    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3441}
3442
3443/*
3444 * Convert unicode into string object (MBCS).
3445 * Returns 0 if succeed, -1 otherwise.
3446 */
3447static int encode_mbcs(PyObject **repr,
3448			const Py_UNICODE *p, /* unicode */
3449			int size) /* size of unicode */
3450{
3451    int mbcssize = 0;
3452    Py_ssize_t n = 0;
3453
3454    assert(size >= 0);
3455
3456    /* First get the size of the result */
3457    if (size > 0) {
3458	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3459	if (mbcssize == 0) {
3460	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
3461	    return -1;
3462	}
3463    }
3464
3465    if (*repr == NULL) {
3466	/* Create string object */
3467	*repr = PyBytes_FromStringAndSize(NULL, mbcssize);
3468	if (*repr == NULL)
3469	    return -1;
3470    }
3471    else {
3472	/* Extend string object */
3473	n = PyBytes_Size(*repr);
3474	if (PyBytes_Resize(*repr, n + mbcssize) < 0)
3475	    return -1;
3476    }
3477
3478    /* Do the conversion */
3479    if (size > 0) {
3480	char *s = PyBytes_AS_STRING(*repr) + n;
3481	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3482	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
3483	    return -1;
3484	}
3485    }
3486
3487    return 0;
3488}
3489
3490PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
3491				Py_ssize_t size,
3492				const char *errors)
3493{
3494    PyObject *repr = NULL;
3495    int ret;
3496
3497#ifdef NEED_RETRY
3498 retry:
3499    if (size > INT_MAX)
3500	ret = encode_mbcs(&repr, p, INT_MAX);
3501    else
3502#endif
3503	ret = encode_mbcs(&repr, p, (int)size);
3504
3505    if (ret < 0) {
3506	Py_XDECREF(repr);
3507	return NULL;
3508    }
3509
3510#ifdef NEED_RETRY
3511    if (size > INT_MAX) {
3512	p += INT_MAX;
3513	size -= INT_MAX;
3514	goto retry;
3515    }
3516#endif
3517
3518    return repr;
3519}
3520
3521PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3522{
3523    if (!PyUnicode_Check(unicode)) {
3524        PyErr_BadArgument();
3525        return NULL;
3526    }
3527    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3528				PyUnicode_GET_SIZE(unicode),
3529				NULL);
3530}
3531
3532#undef NEED_RETRY
3533
3534#endif /* MS_WINDOWS */
3535
3536/* --- Character Mapping Codec -------------------------------------------- */
3537
3538PyObject *PyUnicode_DecodeCharmap(const char *s,
3539				  Py_ssize_t size,
3540				  PyObject *mapping,
3541				  const char *errors)
3542{
3543    const char *starts = s;
3544    Py_ssize_t startinpos;
3545    Py_ssize_t endinpos;
3546    Py_ssize_t outpos;
3547    const char *e;
3548    PyUnicodeObject *v;
3549    Py_UNICODE *p;
3550    Py_ssize_t extrachars = 0;
3551    PyObject *errorHandler = NULL;
3552    PyObject *exc = NULL;
3553    Py_UNICODE *mapstring = NULL;
3554    Py_ssize_t maplen = 0;
3555
3556    /* Default to Latin-1 */
3557    if (mapping == NULL)
3558	return PyUnicode_DecodeLatin1(s, size, errors);
3559
3560    v = _PyUnicode_New(size);
3561    if (v == NULL)
3562	goto onError;
3563    if (size == 0)
3564	return (PyObject *)v;
3565    p = PyUnicode_AS_UNICODE(v);
3566    e = s + size;
3567    if (PyUnicode_CheckExact(mapping)) {
3568	mapstring = PyUnicode_AS_UNICODE(mapping);
3569	maplen = PyUnicode_GET_SIZE(mapping);
3570	while (s < e) {
3571	    unsigned char ch = *s;
3572	    Py_UNICODE x = 0xfffe; /* illegal value */
3573
3574	    if (ch < maplen)
3575		x = mapstring[ch];
3576
3577	    if (x == 0xfffe) {
3578		/* undefined mapping */
3579		outpos = p-PyUnicode_AS_UNICODE(v);
3580		startinpos = s-starts;
3581		endinpos = startinpos+1;
3582		if (unicode_decode_call_errorhandler(
3583		     errors, &errorHandler,
3584		     "charmap", "character maps to <undefined>",
3585		     starts, size, &startinpos, &endinpos, &exc, &s,
3586		     (PyObject **)&v, &outpos, &p)) {
3587		    goto onError;
3588		}
3589		continue;
3590	    }
3591	    *p++ = x;
3592	    ++s;
3593	}
3594    }
3595    else {
3596	while (s < e) {
3597	    unsigned char ch = *s;
3598	    PyObject *w, *x;
3599
3600	    /* Get mapping (char ordinal -> integer, Unicode char or None) */
3601	    w = PyInt_FromLong((long)ch);
3602	    if (w == NULL)
3603		goto onError;
3604	    x = PyObject_GetItem(mapping, w);
3605	    Py_DECREF(w);
3606	    if (x == NULL) {
3607		if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3608		    /* No mapping found means: mapping is undefined. */
3609		    PyErr_Clear();
3610		    x = Py_None;
3611		    Py_INCREF(x);
3612		} else
3613		    goto onError;
3614	    }
3615
3616	    /* Apply mapping */
3617	    if (PyInt_Check(x)) {
3618		long value = PyInt_AS_LONG(x);
3619		if (value < 0 || value > 65535) {
3620		    PyErr_SetString(PyExc_TypeError,
3621				    "character mapping must be in range(65536)");
3622		    Py_DECREF(x);
3623		    goto onError;
3624		}
3625		*p++ = (Py_UNICODE)value;
3626	    }
3627	    else if (x == Py_None) {
3628		/* undefined mapping */
3629		outpos = p-PyUnicode_AS_UNICODE(v);
3630		startinpos = s-starts;
3631		endinpos = startinpos+1;
3632		if (unicode_decode_call_errorhandler(
3633		     errors, &errorHandler,
3634		     "charmap", "character maps to <undefined>",
3635		     starts, size, &startinpos, &endinpos, &exc, &s,
3636		     (PyObject **)&v, &outpos, &p)) {
3637		    Py_DECREF(x);
3638		    goto onError;
3639		}
3640		Py_DECREF(x);
3641		continue;
3642	    }
3643	    else if (PyUnicode_Check(x)) {
3644		Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
3645
3646		if (targetsize == 1)
3647		    /* 1-1 mapping */
3648		    *p++ = *PyUnicode_AS_UNICODE(x);
3649
3650		else if (targetsize > 1) {
3651		    /* 1-n mapping */
3652		    if (targetsize > extrachars) {
3653			/* resize first */
3654			Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3655			Py_ssize_t needed = (targetsize - extrachars) + \
3656				     (targetsize << 2);
3657			extrachars += needed;
3658			/* XXX overflow detection missing */
3659			if (_PyUnicode_Resize(&v,
3660					     PyUnicode_GET_SIZE(v) + needed) < 0) {
3661			    Py_DECREF(x);
3662			    goto onError;
3663			}
3664			p = PyUnicode_AS_UNICODE(v) + oldpos;
3665		    }
3666		    Py_UNICODE_COPY(p,
3667				    PyUnicode_AS_UNICODE(x),
3668				    targetsize);
3669		    p += targetsize;
3670		    extrachars -= targetsize;
3671		}
3672		/* 1-0 mapping: skip the character */
3673	    }
3674	    else {
3675		/* wrong return value */
3676		PyErr_SetString(PyExc_TypeError,
3677		      "character mapping must return integer, None or unicode");
3678		Py_DECREF(x);
3679		goto onError;
3680	    }
3681	    Py_DECREF(x);
3682	    ++s;
3683	}
3684    }
3685    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
3686	if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3687	    goto onError;
3688    Py_XDECREF(errorHandler);
3689    Py_XDECREF(exc);
3690    return (PyObject *)v;
3691
3692 onError:
3693    Py_XDECREF(errorHandler);
3694    Py_XDECREF(exc);
3695    Py_XDECREF(v);
3696    return NULL;
3697}
3698
3699/* Charmap encoding: the lookup table */
3700
3701struct encoding_map{
3702  PyObject_HEAD
3703  unsigned char level1[32];
3704  int count2, count3;
3705  unsigned char level23[1];
3706};
3707
3708static PyObject*
3709encoding_map_size(PyObject *obj, PyObject* args)
3710{
3711    struct encoding_map *map = (struct encoding_map*)obj;
3712    return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3713                          128*map->count3);
3714}
3715
3716static PyMethodDef encoding_map_methods[] = {
3717	{"size", encoding_map_size, METH_NOARGS,
3718         PyDoc_STR("Return the size (in bytes) of this object") },
3719        { 0 }
3720};
3721
3722static void
3723encoding_map_dealloc(PyObject* o)
3724{
3725	PyObject_FREE(o);
3726}
3727
/* Type object for struct encoding_map instances.  Only the name, the
   basic size, the deallocator, the default flags and the method table
   are filled in; every other slot is left at 0. */
static PyTypeObject EncodingMapType = {
	PyObject_HEAD_INIT(NULL)
        0,                      /*ob_size*/
        "EncodingMap",          /*tp_name*/
        sizeof(struct encoding_map),   /*tp_basicsize*/
        0,                      /*tp_itemsize*/
        /* methods */
        encoding_map_dealloc,   /*tp_dealloc*/
        0,                      /*tp_print*/
        0,                      /*tp_getattr*/
        0,                      /*tp_setattr*/
        0,                      /*tp_compare*/
        0,                      /*tp_repr*/
        0,                      /*tp_as_number*/
        0,                      /*tp_as_sequence*/
        0,                      /*tp_as_mapping*/
        0,                      /*tp_hash*/
        0,                      /*tp_call*/
        0,                      /*tp_str*/
        0,                      /*tp_getattro*/
        0,                      /*tp_setattro*/
        0,                      /*tp_as_buffer*/
        Py_TPFLAGS_DEFAULT,     /*tp_flags*/
        0,                      /*tp_doc*/
        0,                      /*tp_traverse*/
        0,                      /*tp_clear*/
        0,                      /*tp_richcompare*/
        0,                      /*tp_weaklistoffset*/
        0,                      /*tp_iter*/
        0,                      /*tp_iternext*/
        encoding_map_methods,   /*tp_methods*/
        0,                      /*tp_members*/
        0,                      /*tp_getset*/
        0,                      /*tp_base*/
        0,                      /*tp_dict*/
        0,                      /*tp_descr_get*/
        0,                      /*tp_descr_set*/
        0,                      /*tp_dictoffset*/
        0,                      /*tp_init*/
        0,                      /*tp_alloc*/
        0,                      /*tp_new*/
        0,                      /*tp_free*/
        0,                      /*tp_is_gc*/
};
3772
3773PyObject*
3774PyUnicode_BuildEncodingMap(PyObject* string)
3775{
3776    Py_UNICODE *decode;
3777    PyObject *result;
3778    struct encoding_map *mresult;
3779    int i;
3780    int need_dict = 0;
3781    unsigned char level1[32];
3782    unsigned char level2[512];
3783    unsigned char *mlevel1, *mlevel2, *mlevel3;
3784    int count2 = 0, count3 = 0;
3785
3786    if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3787        PyErr_BadArgument();
3788        return NULL;
3789    }
3790    decode = PyUnicode_AS_UNICODE(string);
3791    memset(level1, 0xFF, sizeof level1);
3792    memset(level2, 0xFF, sizeof level2);
3793
3794    /* If there isn't a one-to-one mapping of NULL to \0,
3795       or if there are non-BMP characters, we need to use
3796       a mapping dictionary. */
3797    if (decode[0] != 0)
3798        need_dict = 1;
3799    for (i = 1; i < 256; i++) {
3800        int l1, l2;
3801        if (decode[i] == 0
3802            #ifdef Py_UNICODE_WIDE
3803            || decode[i] > 0xFFFF
3804            #endif
3805        ) {
3806            need_dict = 1;
3807            break;
3808        }
3809        if (decode[i] == 0xFFFE)
3810            /* unmapped character */
3811            continue;
3812        l1 = decode[i] >> 11;
3813        l2 = decode[i] >> 7;
3814        if (level1[l1] == 0xFF)
3815            level1[l1] = count2++;
3816        if (level2[l2] == 0xFF)
3817            level2[l2] = count3++;
3818    }
3819
3820    if (count2 >= 0xFF || count3 >= 0xFF)
3821        need_dict = 1;
3822
3823    if (need_dict) {
3824        PyObject *result = PyDict_New();
3825        PyObject *key, *value;
3826        if (!result)
3827            return NULL;
3828        for (i = 0; i < 256; i++) {
3829            key = value = NULL;
3830            key = PyInt_FromLong(decode[i]);
3831            value = PyInt_FromLong(i);
3832            if (!key || !value)
3833                goto failed1;
3834            if (PyDict_SetItem(result, key, value) == -1)
3835                goto failed1;
3836            Py_DECREF(key);
3837            Py_DECREF(value);
3838        }
3839        return result;
3840      failed1:
3841        Py_XDECREF(key);
3842        Py_XDECREF(value);
3843        Py_DECREF(result);
3844        return NULL;
3845    }
3846
3847    /* Create a three-level trie */
3848    result = PyObject_MALLOC(sizeof(struct encoding_map) +
3849                             16*count2 + 128*count3 - 1);
3850    if (!result)
3851        return PyErr_NoMemory();
3852    PyObject_Init(result, &EncodingMapType);
3853    mresult = (struct encoding_map*)result;
3854    mresult->count2 = count2;
3855    mresult->count3 = count3;
3856    mlevel1 = mresult->level1;
3857    mlevel2 = mresult->level23;
3858    mlevel3 = mresult->level23 + 16*count2;
3859    memcpy(mlevel1, level1, 32);
3860    memset(mlevel2, 0xFF, 16*count2);
3861    memset(mlevel3, 0, 128*count3);
3862    count3 = 0;
3863    for (i = 1; i < 256; i++) {
3864        int o1, o2, o3, i2, i3;
3865        if (decode[i] == 0xFFFE)
3866            /* unmapped character */
3867            continue;
3868        o1 = decode[i]>>11;
3869        o2 = (decode[i]>>7) & 0xF;
3870        i2 = 16*mlevel1[o1] + o2;
3871        if (mlevel2[i2] == 0xFF)
3872            mlevel2[i2] = count3++;
3873        o3 = decode[i] & 0x7F;
3874        i3 = 128*mlevel2[i2] + o3;
3875        mlevel3[i3] = i;
3876    }
3877    return result;
3878}
3879
3880static int
3881encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3882{
3883    struct encoding_map *map = (struct encoding_map*)mapping;
3884    int l1 = c>>11;
3885    int l2 = (c>>7) & 0xF;
3886    int l3 = c & 0x7F;
3887    int i;
3888
3889#ifdef Py_UNICODE_WIDE
3890    if (c > 0xFFFF) {
3891	return -1;
3892    }
3893#endif
3894    if (c == 0)
3895        return 0;
3896    /* level 1*/
3897    i = map->level1[l1];
3898    if (i == 0xFF) {
3899        return -1;
3900    }
3901    /* level 2*/
3902    i = map->level23[16*i+l2];
3903    if (i == 0xFF) {
3904        return -1;
3905    }
3906    /* level 3 */
3907    i = map->level23[16*map->count2 + 128*i + l3];
3908    if (i == 0) {
3909        return -1;
3910    }
3911    return i;
3912}
3913
3914/* Lookup the character ch in the mapping. If the character
3915   can't be found, Py_None is returned (or NULL, if another
3916   error occurred). */
3917static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
3918{
3919    PyObject *w = PyInt_FromLong((long)c);
3920    PyObject *x;
3921
3922    if (w == NULL)
3923	 return NULL;
3924    x = PyObject_GetItem(mapping, w);
3925    Py_DECREF(w);
3926    if (x == NULL) {
3927	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3928	    /* No mapping found means: mapping is undefined. */
3929	    PyErr_Clear();
3930	    x = Py_None;
3931	    Py_INCREF(x);
3932	    return x;
3933	} else
3934	    return NULL;
3935    }
3936    else if (x == Py_None)
3937	return x;
3938    else if (PyInt_Check(x)) {
3939	long value = PyInt_AS_LONG(x);
3940	if (value < 0 || value > 255) {
3941	    PyErr_SetString(PyExc_TypeError,
3942			     "character mapping must be in range(256)");
3943	    Py_DECREF(x);
3944	    return NULL;
3945	}
3946	return x;
3947    }
3948    else if (PyString_Check(x))
3949	return x;
3950    else {
3951	/* wrong return value */
3952	PyErr_Format(PyExc_TypeError,
3953                "character mapping must return integer, None or str8, not %.400s",
3954                x->ob_type->tp_name);
3955	Py_DECREF(x);
3956	return NULL;
3957    }
3958}
3959
3960static int
3961charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
3962{
3963	Py_ssize_t outsize = PyBytes_GET_SIZE(  outobj);
3964	/* exponentially overallocate to minimize reallocations */
3965	if (requiredsize < 2*outsize)
3966	    requiredsize = 2*outsize;
3967	if (PyBytes_Resize(outobj, requiredsize)) {
3968	    Py_DECREF(outobj);
3969	    return -1;
3970	}
3971	return 0;
3972}
3973
/* Outcome of encoding one character through a character mapping. */
typedef enum charmapencode_result {
    enc_SUCCESS,     /* the character was encoded and written */
    enc_FAILED,      /* the mapping has no entry for the character */
    enc_EXCEPTION    /* an error occurred */
} charmapencode_result;
3977/* lookup the character, put the result in the output string and adjust
3978   various state variables. Resize the output bytes object if not enough
3979   space is available. Return a new reference to the object that
3980   was put in the output buffer, or Py_None, if the mapping was undefined
3981   (in which case no character was written) or NULL, if a
3982   reallocation error occurred. The caller must decref the result */
3983static
3984charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
3985    PyObject *outobj, Py_ssize_t *outpos)
3986{
3987    PyObject *rep;
3988    char *outstart;
3989    Py_ssize_t outsize = PyBytes_GET_SIZE(outobj);
3990
3991    if (mapping->ob_type == &EncodingMapType) {
3992        int res = encoding_map_lookup(c, mapping);
3993	Py_ssize_t requiredsize = *outpos+1;
3994        if (res == -1)
3995            return enc_FAILED;
3996	if (outsize<requiredsize)
3997	    if (charmapencode_resize(outobj, outpos, requiredsize))
3998		return enc_EXCEPTION;
3999        outstart = PyBytes_AS_STRING(outobj);
4000	outstart[(*outpos)++] = (char)res;
4001	return enc_SUCCESS;
4002    }
4003
4004    rep = charmapencode_lookup(c, mapping);
4005    if (rep==NULL)
4006	return enc_EXCEPTION;
4007    else if (rep==Py_None) {
4008	Py_DECREF(rep);
4009	return enc_FAILED;
4010    } else {
4011	if (PyInt_Check(rep)) {
4012	    Py_ssize_t requiredsize = *outpos+1;
4013	    if (outsize<requiredsize)
4014		if (charmapencode_resize(outobj, outpos, requiredsize)) {
4015		    Py_DECREF(rep);
4016		    return enc_EXCEPTION;
4017		}
4018            outstart = PyBytes_AS_STRING(outobj);
4019	    outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4020	}
4021	else {
4022	    const char *repchars = PyString_AS_STRING(rep);
4023	    Py_ssize_t repsize = PyString_GET_SIZE(rep);
4024	    Py_ssize_t requiredsize = *outpos+repsize;
4025	    if (outsize<requiredsize)
4026		if (charmapencode_resize(outobj, outpos, requiredsize)) {
4027		    Py_DECREF(rep);
4028		    return enc_EXCEPTION;
4029		}
4030            outstart = PyBytes_AS_STRING(outobj);
4031	    memcpy(outstart + *outpos, repchars, repsize);
4032	    *outpos += repsize;
4033	}
4034    }
4035    Py_DECREF(rep);
4036    return enc_SUCCESS;
4037}
4038
4039/* handle an error in PyUnicode_EncodeCharmap
4040   Return 0 on success, -1 on error */
4041static
4042int charmap_encoding_error(
4043    const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
4044    PyObject **exceptionObject,
4045    int *known_errorHandler, PyObject **errorHandler, const char *errors,
4046    PyObject *res, Py_ssize_t *respos)
4047{
4048    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4049    Py_ssize_t repsize;
4050    Py_ssize_t newpos;
4051    Py_UNICODE *uni2;
4052    /* startpos for collecting unencodable chars */
4053    Py_ssize_t collstartpos = *inpos;
4054    Py_ssize_t collendpos = *inpos+1;
4055    Py_ssize_t collpos;
4056    char *encoding = "charmap";
4057    char *reason = "character maps to <undefined>";
4058    charmapencode_result x;
4059
4060    /* find all unencodable characters */
4061    while (collendpos < size) {
4062        PyObject *rep;
4063        if (mapping->ob_type == &EncodingMapType) {
4064	    int res = encoding_map_lookup(p[collendpos], mapping);
4065	    if (res != -1)
4066		break;
4067	    ++collendpos;
4068	    continue;
4069	}
4070
4071	rep = charmapencode_lookup(p[collendpos], mapping);
4072	if (rep==NULL)
4073	    return -1;
4074	else if (rep!=Py_None) {
4075	    Py_DECREF(rep);
4076	    break;
4077	}
4078	Py_DECREF(rep);
4079	++collendpos;
4080    }
4081    /* cache callback name lookup
4082     * (if not done yet, i.e. it's the first error) */
4083    if (*known_errorHandler==-1) {
4084	if ((errors==NULL) || (!strcmp(errors, "strict")))
4085	    *known_errorHandler = 1;
4086	else if (!strcmp(errors, "replace"))
4087	    *known_errorHandler = 2;
4088	else if (!strcmp(errors, "ignore"))
4089	    *known_errorHandler = 3;
4090	else if (!strcmp(errors, "xmlcharrefreplace"))
4091	    *known_errorHandler = 4;
4092	else
4093	    *known_errorHandler = 0;
4094    }
4095    switch (*known_errorHandler) {
4096	case 1: /* strict */
4097	    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4098	    return -1;
4099	case 2: /* replace */
4100	    for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4101		x = charmapencode_output('?', mapping, res, respos);
4102		if (x==enc_EXCEPTION) {
4103		    return -1;
4104		}
4105		else if (x==enc_FAILED) {
4106		    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4107		    return -1;
4108		}
4109	    }
4110	    /* fall through */
4111	case 3: /* ignore */
4112	    *inpos = collendpos;
4113	    break;
4114	case 4: /* xmlcharrefreplace */
4115	    /* generate replacement (temporarily (mis)uses p) */
4116	    for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4117		char buffer[2+29+1+1];
4118		char *cp;
4119		sprintf(buffer, "&#%d;", (int)p[collpos]);
4120		for (cp = buffer; *cp; ++cp) {
4121		    x = charmapencode_output(*cp, mapping, res, respos);
4122		    if (x==enc_EXCEPTION)
4123			return -1;
4124		    else if (x==enc_FAILED) {
4125			raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4126			return -1;
4127		    }
4128		}
4129	    }
4130	    *inpos = collendpos;
4131	    break;
4132	default:
4133	    repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
4134		encoding, reason, p, size, exceptionObject,
4135		collstartpos, collendpos, &newpos);
4136	    if (repunicode == NULL)
4137		return -1;
4138	    /* generate replacement  */
4139	    repsize = PyUnicode_GET_SIZE(repunicode);
4140	    for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4141		x = charmapencode_output(*uni2, mapping, res, respos);
4142		if (x==enc_EXCEPTION) {
4143		    return -1;
4144		}
4145		else if (x==enc_FAILED) {
4146		    Py_DECREF(repunicode);
4147		    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4148		    return -1;
4149		}
4150	    }
4151	    *inpos = newpos;
4152	    Py_DECREF(repunicode);
4153    }
4154    return 0;
4155}
4156
4157PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
4158				  Py_ssize_t size,
4159				  PyObject *mapping,
4160				  const char *errors)
4161{
4162    /* output object */
4163    PyObject *res = NULL;
4164    /* current input position */
4165    Py_ssize_t inpos = 0;
4166    /* current output position */
4167    Py_ssize_t respos = 0;
4168    PyObject *errorHandler = NULL;
4169    PyObject *exc = NULL;
4170    /* the following variable is used for caching string comparisons
4171     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4172     * 3=ignore, 4=xmlcharrefreplace */
4173    int known_errorHandler = -1;
4174
4175    /* Default to Latin-1 */
4176    if (mapping == NULL)
4177	return PyUnicode_EncodeLatin1(p, size, errors);
4178
4179    /* allocate enough for a simple encoding without
4180       replacements, if we need more, we'll resize */
4181    res = PyBytes_FromStringAndSize(NULL, size);
4182    if (res == NULL)
4183        goto onError;
4184    if (size == 0)
4185	return res;
4186
4187    while (inpos<size) {
4188	/* try to encode it */
4189	charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos);
4190	if (x==enc_EXCEPTION) /* error */
4191	    goto onError;
4192	if (x==enc_FAILED) { /* unencodable character */
4193	    if (charmap_encoding_error(p, size, &inpos, mapping,
4194		&exc,
4195		&known_errorHandler, &errorHandler, errors,
4196		res, &respos)) {
4197		goto onError;
4198	    }
4199	}
4200	else
4201	    /* done with this character => adjust input position */
4202	    ++inpos;
4203    }
4204
4205    /* Resize if we allocated to much */
4206    if (respos<PyBytes_GET_SIZE(res)) {
4207	if (PyBytes_Resize(res, respos))
4208	    goto onError;
4209    }
4210    Py_XDECREF(exc);
4211    Py_XDECREF(errorHandler);
4212    return res;
4213
4214    onError:
4215    Py_XDECREF(res);
4216    Py_XDECREF(exc);
4217    Py_XDECREF(errorHandler);
4218    return NULL;
4219}
4220
4221PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4222				    PyObject *mapping)
4223{
4224    if (!PyUnicode_Check(unicode) || mapping == NULL) {
4225	PyErr_BadArgument();
4226	return NULL;
4227    }
4228    return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4229				   PyUnicode_GET_SIZE(unicode),
4230				   mapping,
4231				   NULL);
4232}
4233
4234/* create or adjust a UnicodeTranslateError */
4235static void make_translate_exception(PyObject **exceptionObject,
4236    const Py_UNICODE *unicode, Py_ssize_t size,
4237    Py_ssize_t startpos, Py_ssize_t endpos,
4238    const char *reason)
4239{
4240    if (*exceptionObject == NULL) {
4241    	*exceptionObject = PyUnicodeTranslateError_Create(
4242	    unicode, size, startpos, endpos, reason);
4243    }
4244    else {
4245	if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4246	    goto onError;
4247	if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4248	    goto onError;
4249	if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4250	    goto onError;
4251	return;
4252	onError:
4253	Py_DECREF(*exceptionObject);
4254	*exceptionObject = NULL;
4255    }
4256}
4257
4258/* raises a UnicodeTranslateError */
4259static void raise_translate_exception(PyObject **exceptionObject,
4260    const Py_UNICODE *unicode, Py_ssize_t size,
4261    Py_ssize_t startpos, Py_ssize_t endpos,
4262    const char *reason)
4263{
4264    make_translate_exception(exceptionObject,
4265	unicode, size, startpos, endpos, reason);
4266    if (*exceptionObject != NULL)
4267	PyCodec_StrictErrors(*exceptionObject);
4268}
4269
4270/* error handling callback helper:
4271   build arguments, call the callback and check the arguments,
4272   put the result into newpos and return the replacement string, which
4273   has to be freed by the caller */
4274static PyObject *unicode_translate_call_errorhandler(const char *errors,
4275    PyObject **errorHandler,
4276    const char *reason,
4277    const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4278    Py_ssize_t startpos, Py_ssize_t endpos,
4279    Py_ssize_t *newpos)
4280{
4281    static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
4282
4283    Py_ssize_t i_newpos;
4284    PyObject *restuple;
4285    PyObject *resunicode;
4286
4287    if (*errorHandler == NULL) {
4288	*errorHandler = PyCodec_LookupError(errors);
4289        if (*errorHandler == NULL)
4290	    return NULL;
4291    }
4292
4293    make_translate_exception(exceptionObject,
4294	unicode, size, startpos, endpos, reason);
4295    if (*exceptionObject == NULL)
4296	return NULL;
4297
4298    restuple = PyObject_CallFunctionObjArgs(
4299	*errorHandler, *exceptionObject, NULL);
4300    if (restuple == NULL)
4301	return NULL;
4302    if (!PyTuple_Check(restuple)) {
4303	PyErr_Format(PyExc_TypeError, &argparse[4]);
4304	Py_DECREF(restuple);
4305	return NULL;
4306    }
4307    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
4308	&resunicode, &i_newpos)) {
4309	Py_DECREF(restuple);
4310	return NULL;
4311    }
4312    if (i_newpos<0)
4313	*newpos = size+i_newpos;
4314    else
4315        *newpos = i_newpos;
4316    if (*newpos<0 || *newpos>size) {
4317	PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4318	Py_DECREF(restuple);
4319	return NULL;
4320    }
4321    Py_INCREF(resunicode);
4322    Py_DECREF(restuple);
4323    return resunicode;
4324}
4325
4326/* Lookup the character ch in the mapping and put the result in result,
4327   which must be decrefed by the caller.
4328   Return 0 on success, -1 on error */
4329static
4330int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4331{
4332    PyObject *w = PyInt_FromLong((long)c);
4333    PyObject *x;
4334
4335    if (w == NULL)
4336	 return -1;
4337    x = PyObject_GetItem(mapping, w);
4338    Py_DECREF(w);
4339    if (x == NULL) {
4340	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4341	    /* No mapping found means: use 1:1 mapping. */
4342	    PyErr_Clear();
4343	    *result = NULL;
4344	    return 0;
4345	} else
4346	    return -1;
4347    }
4348    else if (x == Py_None) {
4349	*result = x;
4350	return 0;
4351    }
4352    else if (PyInt_Check(x)) {
4353	long value = PyInt_AS_LONG(x);
4354	long max = PyUnicode_GetMax();
4355	if (value < 0 || value > max) {
4356	    PyErr_Format(PyExc_TypeError,
4357			     "character mapping must be in range(0x%lx)", max+1);
4358	    Py_DECREF(x);
4359	    return -1;
4360	}
4361	*result = x;
4362	return 0;
4363    }
4364    else if (PyUnicode_Check(x)) {
4365	*result = x;
4366	return 0;
4367    }
4368    else {
4369	/* wrong return value */
4370	PyErr_SetString(PyExc_TypeError,
4371	      "character mapping must return integer, None or unicode");
4372	Py_DECREF(x);
4373	return -1;
4374    }
4375}
4376/* ensure that *outobj is at least requiredsize characters long,
4377if not reallocate and adjust various state variables.
4378Return 0 on success, -1 on error */
4379static
4380int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
4381    Py_ssize_t requiredsize)
4382{
4383    Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
4384    if (requiredsize > oldsize) {
4385	/* remember old output position */
4386	Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4387	/* exponentially overallocate to minimize reallocations */
4388	if (requiredsize < 2 * oldsize)
4389	    requiredsize = 2 * oldsize;
4390	if (_PyUnicode_Resize(outobj, requiredsize) < 0)
4391	    return -1;
4392	*outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
4393    }
4394    return 0;
4395}
4396/* lookup the character, put the result in the output string and adjust
4397   various state variables. Return a new reference to the object that
4398   was put in the output buffer in *result, or Py_None, if the mapping was
4399   undefined (in which case no character was written).
4400   The called must decref result.
4401   Return 0 on success, -1 on error. */
4402static
4403int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
4404    Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4405    PyObject **res)
4406{
4407    if (charmaptranslate_lookup(*curinp, mapping, res))
4408	return -1;
4409    if (*res==NULL) {
4410	/* not found => default to 1:1 mapping */
4411	*(*outp)++ = *curinp;
4412    }
4413    else if (*res==Py_None)
4414	;
4415    else if (PyInt_Check(*res)) {
4416	/* no overflow check, because we know that the space is enough */
4417	*(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4418    }
4419    else if (PyUnicode_Check(*res)) {
4420	Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4421	if (repsize==1) {
4422	    /* no overflow check, because we know that the space is enough */
4423	    *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4424	}
4425	else if (repsize!=0) {
4426	    /* more than one character */
4427	    Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4428		(insize - (curinp-startinp)) +
4429		repsize - 1;
4430	    if (charmaptranslate_makespace(outobj, outp, requiredsize))
4431		return -1;
4432	    memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4433	    *outp += repsize;
4434	}
4435    }
4436    else
4437	return -1;
4438    return 0;
4439}
4440
4441PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
4442				     Py_ssize_t size,
4443				     PyObject *mapping,
4444				     const char *errors)
4445{
4446    /* output object */
4447    PyObject *res = NULL;
4448    /* pointers to the beginning and end+1 of input */
4449    const Py_UNICODE *startp = p;
4450    const Py_UNICODE *endp = p + size;
4451    /* pointer into the output */
4452    Py_UNICODE *str;
4453    /* current output position */
4454    Py_ssize_t respos = 0;
4455    char *reason = "character maps to <undefined>";
4456    PyObject *errorHandler = NULL;
4457    PyObject *exc = NULL;
4458    /* the following variable is used for caching string comparisons
4459     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4460     * 3=ignore, 4=xmlcharrefreplace */
4461    int known_errorHandler = -1;
4462
4463    if (mapping == NULL) {
4464	PyErr_BadArgument();
4465	return NULL;
4466    }
4467
4468    /* allocate enough for a simple 1:1 translation without
4469       replacements, if we need more, we'll resize */
4470    res = PyUnicode_FromUnicode(NULL, size);
4471    if (res == NULL)
4472	goto onError;
4473    if (size == 0)
4474	return res;
4475    str = PyUnicode_AS_UNICODE(res);
4476
4477    while (p<endp) {
4478	/* try to encode it */
4479	PyObject *x = NULL;
4480	if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
4481	    Py_XDECREF(x);
4482	    goto onError;
4483	}
4484	Py_XDECREF(x);
4485	if (x!=Py_None) /* it worked => adjust input pointer */
4486	    ++p;
4487	else { /* untranslatable character */
4488	    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4489	    Py_ssize_t repsize;
4490	    Py_ssize_t newpos;
4491	    Py_UNICODE *uni2;
4492	    /* startpos for collecting untranslatable chars */
4493	    const Py_UNICODE *collstart = p;
4494	    const Py_UNICODE *collend = p+1;
4495	    const Py_UNICODE *coll;
4496
4497	    /* find all untranslatable characters */
4498	    while (collend < endp) {
4499		if (charmaptranslate_lookup(*collend, mapping, &x))
4500		    goto onError;
4501		Py_XDECREF(x);
4502		if (x!=Py_None)
4503		    break;
4504		++collend;
4505	    }
4506	    /* cache callback name lookup
4507	     * (if not done yet, i.e. it's the first error) */
4508	    if (known_errorHandler==-1) {
4509		if ((errors==NULL) || (!strcmp(errors, "strict")))
4510		    known_errorHandler = 1;
4511		else if (!strcmp(errors, "replace"))
4512		    known_errorHandler = 2;
4513		else if (!strcmp(errors, "ignore"))
4514		    known_errorHandler = 3;
4515		else if (!strcmp(errors, "xmlcharrefreplace"))
4516		    known_errorHandler = 4;
4517		else
4518		    known_errorHandler = 0;
4519	    }
4520	    switch (known_errorHandler) {
4521		case 1: /* strict */
4522		    raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4523		    goto onError;
4524		case 2: /* replace */
4525		    /* No need to check for space, this is a 1:1 replacement */
4526		    for (coll = collstart; coll<collend; ++coll)
4527			*str++ = '?';
4528		    /* fall through */
4529		case 3: /* ignore */
4530		    p = collend;
4531		    break;
4532		case 4: /* xmlcharrefreplace */
4533		    /* generate replacement (temporarily (mis)uses p) */
4534		    for (p = collstart; p < collend; ++p) {
4535			char buffer[2+29+1+1];
4536			char *cp;
4537			sprintf(buffer, "&#%d;", (int)*p);
4538			if (charmaptranslate_makespace(&res, &str,
4539			    (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4540			    goto onError;
4541			for (cp = buffer; *cp; ++cp)
4542			    *str++ = *cp;
4543		    }
4544		    p = collend;
4545		    break;
4546		default:
4547		    repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4548			reason, startp, size, &exc,
4549			collstart-startp, collend-startp, &newpos);
4550		    if (repunicode == NULL)
4551			goto onError;
4552		    /* generate replacement  */
4553		    repsize = PyUnicode_GET_SIZE(repunicode);
4554		    if (charmaptranslate_makespace(&res, &str,
4555			(str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4556			Py_DECREF(repunicode);
4557			goto onError;
4558		    }
4559		    for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4560			*str++ = *uni2;
4561		    p = startp + newpos;
4562		    Py_DECREF(repunicode);
4563	    }
4564	}
4565    }
4566    /* Resize if we allocated to much */
4567    respos = str-PyUnicode_AS_UNICODE(res);
4568    if (respos<PyUnicode_GET_SIZE(res)) {
4569	if (_PyUnicode_Resize(&res, respos) < 0)
4570	    goto onError;
4571    }
4572    Py_XDECREF(exc);
4573    Py_XDECREF(errorHandler);
4574    return res;
4575
4576    onError:
4577    Py_XDECREF(res);
4578    Py_XDECREF(exc);
4579    Py_XDECREF(errorHandler);
4580    return NULL;
4581}
4582
4583PyObject *PyUnicode_Translate(PyObject *str,
4584			      PyObject *mapping,
4585			      const char *errors)
4586{
4587    PyObject *result;
4588
4589    str = PyUnicode_FromObject(str);
4590    if (str == NULL)
4591	goto onError;
4592    result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4593					PyUnicode_GET_SIZE(str),
4594					mapping,
4595					errors);
4596    Py_DECREF(str);
4597    return result;
4598
4599 onError:
4600    Py_XDECREF(str);
4601    return NULL;
4602}
4603
4604/* --- Decimal Encoder ---------------------------------------------------- */
4605
4606int PyUnicode_EncodeDecimal(Py_UNICODE *s,
4607			    Py_ssize_t length,
4608			    char *output,
4609			    const char *errors)
4610{
4611    Py_UNICODE *p, *end;
4612    PyObject *errorHandler = NULL;
4613    PyObject *exc = NULL;
4614    const char *encoding = "decimal";
4615    const char *reason = "invalid decimal Unicode string";
4616    /* the following variable is used for caching string comparisons
4617     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4618    int known_errorHandler = -1;
4619
4620    if (output == NULL) {
4621	PyErr_BadArgument();
4622	return -1;
4623    }
4624
4625    p = s;
4626    end = s + length;
4627    while (p < end) {
4628	register Py_UNICODE ch = *p;
4629	int decimal;
4630	PyObject *repunicode;
4631	Py_ssize_t repsize;
4632	Py_ssize_t newpos;
4633	Py_UNICODE *uni2;
4634	Py_UNICODE *collstart;
4635	Py_UNICODE *collend;
4636
4637	if (Py_UNICODE_ISSPACE(ch)) {
4638	    *output++ = ' ';
4639	    ++p;
4640	    continue;
4641	}
4642	decimal = Py_UNICODE_TODECIMAL(ch);
4643	if (decimal >= 0) {
4644	    *output++ = '0' + decimal;
4645	    ++p;
4646	    continue;
4647	}
4648	if (0 < ch && ch < 256) {
4649	    *output++ = (char)ch;
4650	    ++p;
4651	    continue;
4652	}
4653	/* All other characters are considered unencodable */
4654	collstart = p;
4655	collend = p+1;
4656	while (collend < end) {
4657	    if ((0 < *collend && *collend < 256) ||
4658	        !Py_UNICODE_ISSPACE(*collend) ||
4659	        Py_UNICODE_TODECIMAL(*collend))
4660		break;
4661	}
4662	/* cache callback name lookup
4663	 * (if not done yet, i.e. it's the first error) */
4664	if (known_errorHandler==-1) {
4665	    if ((errors==NULL) || (!strcmp(errors, "strict")))
4666		known_errorHandler = 1;
4667	    else if (!strcmp(errors, "replace"))
4668		known_errorHandler = 2;
4669	    else if (!strcmp(errors, "ignore"))
4670		known_errorHandler = 3;
4671	    else if (!strcmp(errors, "xmlcharrefreplace"))
4672		known_errorHandler = 4;
4673	    else
4674		known_errorHandler = 0;
4675	}
4676	switch (known_errorHandler) {
4677	    case 1: /* strict */
4678		raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4679		goto onError;
4680	    case 2: /* replace */
4681		for (p = collstart; p < collend; ++p)
4682		    *output++ = '?';
4683		/* fall through */
4684	    case 3: /* ignore */
4685		p = collend;
4686		break;
4687	    case 4: /* xmlcharrefreplace */
4688		/* generate replacement (temporarily (mis)uses p) */
4689		for (p = collstart; p < collend; ++p)
4690		    output += sprintf(output, "&#%d;", (int)*p);
4691		p = collend;
4692		break;
4693	    default:
4694		repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4695		    encoding, reason, s, length, &exc,
4696		    collstart-s, collend-s, &newpos);
4697		if (repunicode == NULL)
4698		    goto onError;
4699		/* generate replacement  */
4700		repsize = PyUnicode_GET_SIZE(repunicode);
4701		for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4702		    Py_UNICODE ch = *uni2;
4703		    if (Py_UNICODE_ISSPACE(ch))
4704			*output++ = ' ';
4705		    else {
4706			decimal = Py_UNICODE_TODECIMAL(ch);
4707			if (decimal >= 0)
4708			    *output++ = '0' + decimal;
4709			else if (0 < ch && ch < 256)
4710			    *output++ = (char)ch;
4711			else {
4712			    Py_DECREF(repunicode);
4713			    raise_encode_exception(&exc, encoding,
4714				s, length, collstart-s, collend-s, reason);
4715			    goto onError;
4716			}
4717		    }
4718		}
4719		p = s + newpos;
4720		Py_DECREF(repunicode);
4721	}
4722    }
4723    /* 0-terminate the output string */
4724    *output++ = '\0';
4725    Py_XDECREF(exc);
4726    Py_XDECREF(errorHandler);
4727    return 0;
4728
4729 onError:
4730    Py_XDECREF(exc);
4731    Py_XDECREF(errorHandler);
4732    return -1;
4733}
4734
4735/* --- Helpers ------------------------------------------------------------ */
4736
/* Bind the generic STRINGLIB_* names to their Unicode counterparts:
   the character type, and the length / constructor / data accessors.
   Presumably these are consumed by the stringlib headers included
   below - verify against those headers. */
#define STRINGLIB_CHAR Py_UNICODE

#define STRINGLIB_LEN PyUnicode_GET_SIZE
#define STRINGLIB_NEW PyUnicode_FromUnicode
#define STRINGLIB_STR PyUnicode_AS_UNICODE
4742
4743Py_LOCAL_INLINE(int)
4744STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
4745{
4746    if (str[0] != other[0])
4747        return 1;
4748    return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
4749}
4750
/* Generic name for unicode_empty (defined outside this section;
   presumably the shared empty-string object - confirm). */
#define STRINGLIB_EMPTY unicode_empty
4752
4753#include "stringlib/fastsearch.h"
4754
4755#include "stringlib/count.h"
4756#include "stringlib/find.h"
4757#include "stringlib/partition.h"
4758
/* helper macro to fixup start/end slice values

   Operates directly on the variables named start and end of the
   expanding scope, using (obj)->length as the sequence length:
   a negative start is shifted by the length and then floored at 0;
   an end beyond the length is clipped to it, a negative end is shifted
   by the length and then floored at 0.

   NOTE(review): this expands to a bare sequence of if statements (it is
   not wrapped in do { } while (0)), so it must not be used as the
   unbraced body of an if/else or a loop. */
#define FIX_START_END(obj)                      \
    if (start < 0)                              \
        start += (obj)->length;                 \
    if (start < 0)                              \
        start = 0;                              \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0)                                \
        end += (obj)->length;                   \
    if (end < 0)                                \
        end = 0;
4771
4772Py_ssize_t PyUnicode_Count(PyObject *str,
4773                           PyObject *substr,
4774                           Py_ssize_t start,
4775                           Py_ssize_t end)
4776{
4777    Py_ssize_t result;
4778    PyUnicodeObject* str_obj;
4779    PyUnicodeObject* sub_obj;
4780
4781    str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4782    if (!str_obj)
4783	return -1;
4784    sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4785    if (!sub_obj) {
4786	Py_DECREF(str_obj);
4787	return -1;
4788    }
4789
4790    FIX_START_END(str_obj);
4791
4792    result = stringlib_count(
4793        str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4794        );
4795
4796    Py_DECREF(sub_obj);
4797    Py_DECREF(str_obj);
4798
4799    return result;
4800}
4801
4802Py_ssize_t PyUnicode_Find(PyObject *str,
4803                          PyObject *sub,
4804                          Py_ssize_t start,
4805                          Py_ssize_t end,
4806                          int direction)
4807{
4808    Py_ssize_t result;
4809
4810    str = PyUnicode_FromObject(str);
4811    if (!str)
4812	return -2;
4813    sub = PyUnicode_FromObject(sub);
4814    if (!sub) {
4815	Py_DECREF(str);
4816	return -2;
4817    }
4818
4819    if (direction > 0)
4820        result = stringlib_find_slice(
4821            PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4822            PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4823            start, end
4824            );
4825    else
4826        result = stringlib_rfind_slice(
4827            PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4828            PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4829            start, end
4830            );
4831
4832    Py_DECREF(str);
4833    Py_DECREF(sub);
4834
4835    return result;
4836}
4837
4838static
4839int tailmatch(PyUnicodeObject *self,
4840	      PyUnicodeObject *substring,
4841	      Py_ssize_t start,
4842	      Py_ssize_t end,
4843	      int direction)
4844{
4845    if (substring->length == 0)
4846        return 1;
4847
4848    FIX_START_END(self);
4849
4850    end -= substring->length;
4851    if (end < start)
4852	return 0;
4853
4854    if (direction > 0) {
4855	if (Py_UNICODE_MATCH(self, end, substring))
4856	    return 1;
4857    } else {
4858        if (Py_UNICODE_MATCH(self, start, substring))
4859	    return 1;
4860    }
4861
4862    return 0;
4863}
4864
4865Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
4866			PyObject *substr,
4867			Py_ssize_t start,
4868			Py_ssize_t end,
4869			int direction)
4870{
4871    Py_ssize_t result;
4872
4873    str = PyUnicode_FromObject(str);
4874    if (str == NULL)
4875	return -1;
4876    substr = PyUnicode_FromObject(substr);
4877    if (substr == NULL) {
4878	Py_DECREF(str);
4879	return -1;
4880    }
4881
4882    result = tailmatch((PyUnicodeObject *)str,
4883		       (PyUnicodeObject *)substr,
4884		       start, end, direction);
4885    Py_DECREF(str);
4886    Py_DECREF(substr);
4887    return result;
4888}
4889
4890/* Apply fixfct filter to the Unicode object self and return a
4891   reference to the modified object */
4892
4893static
4894PyObject *fixup(PyUnicodeObject *self,
4895		int (*fixfct)(PyUnicodeObject *s))
4896{
4897
4898    PyUnicodeObject *u;
4899
4900    u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
4901    if (u == NULL)
4902	return NULL;
4903
4904    Py_UNICODE_COPY(u->str, self->str, self->length);
4905
4906    if (!fixfct(u) && PyUnicode_CheckExact(self)) {
4907	/* fixfct should return TRUE if it modified the buffer. If
4908	   FALSE, return a reference to the original buffer instead
4909	   (to save space, not time) */
4910	Py_INCREF(self);
4911	Py_DECREF(u);
4912	return (PyObject*) self;
4913    }
4914    return (PyObject*) u;
4915}
4916
4917static
4918int fixupper(PyUnicodeObject *self)
4919{
4920    Py_ssize_t len = self->length;
4921    Py_UNICODE *s = self->str;
4922    int status = 0;
4923
4924    while (len-- > 0) {
4925	register Py_UNICODE ch;
4926
4927	ch = Py_UNICODE_TOUPPER(*s);
4928	if (ch != *s) {
4929            status = 1;
4930	    *s = ch;
4931	}
4932        s++;
4933    }
4934
4935    return status;
4936}
4937
4938static
4939int fixlower(PyUnicodeObject *self)
4940{
4941    Py_ssize_t len = self->length;
4942    Py_UNICODE *s = self->str;
4943    int status = 0;
4944
4945    while (len-- > 0) {
4946	register Py_UNICODE ch;
4947
4948	ch = Py_UNICODE_TOLOWER(*s);
4949	if (ch != *s) {
4950            status = 1;
4951	    *s = ch;
4952	}
4953        s++;
4954    }
4955
4956    return status;
4957}
4958
4959static
4960int fixswapcase(PyUnicodeObject *self)
4961{
4962    Py_ssize_t len = self->length;
4963    Py_UNICODE *s = self->str;
4964    int status = 0;
4965
4966    while (len-- > 0) {
4967        if (Py_UNICODE_ISUPPER(*s)) {
4968            *s = Py_UNICODE_TOLOWER(*s);
4969            status = 1;
4970        } else if (Py_UNICODE_ISLOWER(*s)) {
4971            *s = Py_UNICODE_TOUPPER(*s);
4972            status = 1;
4973        }
4974        s++;
4975    }
4976
4977    return status;
4978}
4979
4980static
4981int fixcapitalize(PyUnicodeObject *self)
4982{
4983    Py_ssize_t len = self->length;
4984    Py_UNICODE *s = self->str;
4985    int status = 0;
4986
4987    if (len == 0)
4988	return 0;
4989    if (Py_UNICODE_ISLOWER(*s)) {
4990	*s = Py_UNICODE_TOUPPER(*s);
4991	status = 1;
4992    }
4993    s++;
4994    while (--len > 0) {
4995        if (Py_UNICODE_ISUPPER(*s)) {
4996            *s = Py_UNICODE_TOLOWER(*s);
4997            status = 1;
4998        }
4999        s++;
5000    }
5001    return status;
5002}
5003
5004static
5005int fixtitle(PyUnicodeObject *self)
5006{
5007    register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5008    register Py_UNICODE *e;
5009    int previous_is_cased;
5010
5011    /* Shortcut for single character strings */
5012    if (PyUnicode_GET_SIZE(self) == 1) {
5013	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5014	if (*p != ch) {
5015	    *p = ch;
5016	    return 1;
5017	}
5018	else
5019	    return 0;
5020    }
5021
5022    e = p + PyUnicode_GET_SIZE(self);
5023    previous_is_cased = 0;
5024    for (; p < e; p++) {
5025	register const Py_UNICODE ch = *p;
5026
5027	if (previous_is_cased)
5028	    *p = Py_UNICODE_TOLOWER(ch);
5029	else
5030	    *p = Py_UNICODE_TOTITLE(ch);
5031
5032	if (Py_UNICODE_ISLOWER(ch) ||
5033	    Py_UNICODE_ISUPPER(ch) ||
5034	    Py_UNICODE_ISTITLE(ch))
5035	    previous_is_cased = 1;
5036	else
5037	    previous_is_cased = 0;
5038    }
5039    return 1;
5040}
5041
5042PyObject *
5043PyUnicode_Join(PyObject *separator, PyObject *seq)
5044{
5045    PyObject *internal_separator = NULL;
5046    const Py_UNICODE blank = ' ';
5047    const Py_UNICODE *sep = &blank;
5048    Py_ssize_t seplen = 1;
5049    PyUnicodeObject *res = NULL; /* the result */
5050    Py_ssize_t res_alloc = 100;  /* # allocated bytes for string in res */
5051    Py_ssize_t res_used;         /* # used bytes */
5052    Py_UNICODE *res_p;       /* pointer to free byte in res's string area */
5053    PyObject *fseq;          /* PySequence_Fast(seq) */
5054    Py_ssize_t seqlen;              /* len(fseq) -- number of items in sequence */
5055    PyObject *item;
5056    Py_ssize_t i;
5057
5058    fseq = PySequence_Fast(seq, "");
5059    if (fseq == NULL) {
5060    	return NULL;
5061    }
5062
5063    /* Grrrr.  A codec may be invoked to convert str objects to
5064     * Unicode, and so it's possible to call back into Python code
5065     * during PyUnicode_FromObject(), and so it's possible for a sick
5066     * codec to change the size of fseq (if seq is a list).  Therefore
5067     * we have to keep refetching the size -- can't assume seqlen
5068     * is invariant.
5069     */
5070    seqlen = PySequence_Fast_GET_SIZE(fseq);
5071    /* If empty sequence, return u"". */
5072    if (seqlen == 0) {
5073    	res = _PyUnicode_New(0);  /* empty sequence; return u"" */
5074    	goto Done;
5075    }
5076    /* If singleton sequence with an exact Unicode, return that. */
5077    if (seqlen == 1) {
5078	item = PySequence_Fast_GET_ITEM(fseq, 0);
5079	if (PyUnicode_CheckExact(item)) {
5080	    Py_INCREF(item);
5081	    res = (PyUnicodeObject *)item;
5082	    goto Done;
5083	}
5084    }
5085
5086    /* At least two items to join, or one that isn't exact Unicode. */
5087    if (seqlen > 1) {
5088        /* Set up sep and seplen -- they're needed. */
5089    	if (separator == NULL) {
5090	    sep = &blank;
5091	    seplen = 1;
5092        }
5093    	else {
5094	    internal_separator = PyUnicode_FromObject(separator);
5095	    if (internal_separator == NULL)
5096	        goto onError;
5097	    sep = PyUnicode_AS_UNICODE(internal_separator);
5098	    seplen = PyUnicode_GET_SIZE(internal_separator);
5099	    /* In case PyUnicode_FromObject() mutated seq. */
5100	    seqlen = PySequence_Fast_GET_SIZE(fseq);
5101        }
5102    }
5103
5104    /* Get space. */
5105    res = _PyUnicode_New(res_alloc);
5106    if (res == NULL)
5107        goto onError;
5108    res_p = PyUnicode_AS_UNICODE(res);
5109    res_used = 0;
5110
5111    for (i = 0; i < seqlen; ++i) {
5112	Py_ssize_t itemlen;
5113	Py_ssize_t new_res_used;
5114
5115	item = PySequence_Fast_GET_ITEM(fseq, i);
5116	/* Convert item to Unicode. */
5117	if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5118	    PyErr_Format(PyExc_TypeError,
5119			 "sequence item %zd: expected string or Unicode,"
5120			 " %.80s found",
5121			 i, item->ob_type->tp_name);
5122	    goto onError;
5123	}
5124	item = PyUnicode_FromObject(item);
5125	if (item == NULL)
5126	    goto onError;
5127	/* We own a reference to item from here on. */
5128
5129	/* In case PyUnicode_FromObject() mutated seq. */
5130	seqlen = PySequence_Fast_GET_SIZE(fseq);
5131
5132        /* Make sure we have enough space for the separator and the item. */
5133	itemlen = PyUnicode_GET_SIZE(item);
5134	new_res_used = res_used + itemlen;
5135	if (new_res_used < 0)
5136	    goto Overflow;
5137	if (i < seqlen - 1) {
5138	    new_res_used += seplen;
5139	    if (new_res_used < 0)
5140		goto Overflow;
5141	}
5142	if (new_res_used > res_alloc) {
5143	    /* double allocated size until it's big enough */
5144	    do {
5145	        res_alloc += res_alloc;
5146	        if (res_alloc <= 0)
5147	            goto Overflow;
5148	    } while (new_res_used > res_alloc);
5149	    if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5150		Py_DECREF(item);
5151		goto onError;
5152	    }
5153            res_p = PyUnicode_AS_UNICODE(res) + res_used;
5154	}
5155
5156	/* Copy item, and maybe the separator. */
5157	Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5158	res_p += itemlen;
5159	if (i < seqlen - 1) {
5160	    Py_UNICODE_COPY(res_p, sep, seplen);
5161	    res_p += seplen;
5162	}
5163	Py_DECREF(item);
5164	res_used = new_res_used;
5165    }
5166
5167    /* Shrink res to match the used area; this probably can't fail,
5168     * but it's cheap to check.
5169     */
5170    if (_PyUnicode_Resize(&res, res_used) < 0)
5171	goto onError;
5172
5173 Done:
5174    Py_XDECREF(internal_separator);
5175    Py_DECREF(fseq);
5176    return (PyObject *)res;
5177
5178 Overflow:
5179    PyErr_SetString(PyExc_OverflowError,
5180                    "join() result is too long for a Python string");
5181    Py_DECREF(item);
5182    /* fall through */
5183
5184 onError:
5185    Py_XDECREF(internal_separator);
5186    Py_DECREF(fseq);
5187    Py_XDECREF(res);
5188    return NULL;
5189}
5190
5191static
5192PyUnicodeObject *pad(PyUnicodeObject *self,
5193		     Py_ssize_t left,
5194		     Py_ssize_t right,
5195		     Py_UNICODE fill)
5196{
5197    PyUnicodeObject *u;
5198
5199    if (left < 0)
5200        left = 0;
5201    if (right < 0)
5202        right = 0;
5203
5204    if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
5205        Py_INCREF(self);
5206        return self;
5207    }
5208
5209    u = _PyUnicode_New(left + self->length + right);
5210    if (u) {
5211        if (left)
5212            Py_UNICODE_FILL(u->str, fill, left);
5213        Py_UNICODE_COPY(u->str + left, self->str, self->length);
5214        if (right)
5215            Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5216    }
5217
5218    return u;
5219}
5220
/* Append the slice data[left:right] to `list` as a new Unicode object.

   Relies on two names being in scope at the point of use: `str`
   (scratch PyObject pointer) and `list` (the list being filled), plus an
   `onError` label that is jumped to when creating the slice or appending
   it fails.  The temporary reference held in `str` is always released.

   Wrapped in do { } while (0) so the expansion is a single statement and
   cannot capture a following `else` or escape an unbraced `if`. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5231
5232static
5233PyObject *split_whitespace(PyUnicodeObject *self,
5234			   PyObject *list,
5235			   Py_ssize_t maxcount)
5236{
5237    register Py_ssize_t i;
5238    register Py_ssize_t j;
5239    Py_ssize_t len = self->length;
5240    PyObject *str;
5241
5242    for (i = j = 0; i < len; ) {
5243	/* find a token */
5244	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5245	    i++;
5246	j = i;
5247	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5248	    i++;
5249	if (j < i) {
5250	    if (maxcount-- <= 0)
5251		break;
5252	    SPLIT_APPEND(self->str, j, i);
5253	    while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5254		i++;
5255	    j = i;
5256	}
5257    }
5258    if (j < len) {
5259	SPLIT_APPEND(self->str, j, len);
5260    }
5261    return list;
5262
5263 onError:
5264    Py_DECREF(list);
5265    return NULL;
5266}
5267
5268PyObject *PyUnicode_Splitlines(PyObject *string,
5269			       int keepends)
5270{
5271    register Py_ssize_t i;
5272    register Py_ssize_t j;
5273    Py_ssize_t len;
5274    PyObject *list;
5275    PyObject *str;
5276    Py_UNICODE *data;
5277
5278    string = PyUnicode_FromObject(string);
5279    if (string == NULL)
5280	return NULL;
5281    data = PyUnicode_AS_UNICODE(string);
5282    len = PyUnicode_GET_SIZE(string);
5283
5284    list = PyList_New(0);
5285    if (!list)
5286        goto onError;
5287
5288    for (i = j = 0; i < len; ) {
5289	Py_ssize_t eol;
5290
5291	/* Find a line and append it */
5292	while (i < len && !BLOOM_LINEBREAK(data[i]))
5293	    i++;
5294
5295	/* Skip the line break reading CRLF as one line break */
5296	eol = i;
5297	if (i < len) {
5298	    if (data[i] == '\r' && i + 1 < len &&
5299		data[i+1] == '\n')
5300		i += 2;
5301	    else
5302		i++;
5303	    if (keepends)
5304		eol = i;
5305	}
5306	SPLIT_APPEND(data, j, eol);
5307	j = i;
5308    }
5309    if (j < len) {
5310	SPLIT_APPEND(data, j, len);
5311    }
5312
5313    Py_DECREF(string);
5314    return list;
5315
5316 onError:
5317    Py_XDECREF(list);
5318    Py_DECREF(string);
5319    return NULL;
5320}
5321
5322static
5323PyObject *split_char(PyUnicodeObject *self,
5324		     PyObject *list,
5325		     Py_UNICODE ch,
5326		     Py_ssize_t maxcount)
5327{
5328    register Py_ssize_t i;
5329    register Py_ssize_t j;
5330    Py_ssize_t len = self->length;
5331    PyObject *str;
5332
5333    for (i = j = 0; i < len; ) {
5334	if (self->str[i] == ch) {
5335	    if (maxcount-- <= 0)
5336		break;
5337	    SPLIT_APPEND(self->str, j, i);
5338	    i = j = i + 1;
5339	} else
5340	    i++;
5341    }
5342    if (j <= len) {
5343	SPLIT_APPEND(self->str, j, len);
5344    }
5345    return list;
5346
5347 onError:
5348    Py_DECREF(list);
5349    return NULL;
5350}
5351
5352static
5353PyObject *split_substring(PyUnicodeObject *self,
5354			  PyObject *list,
5355			  PyUnicodeObject *substring,
5356			  Py_ssize_t maxcount)
5357{
5358    register Py_ssize_t i;
5359    register Py_ssize_t j;
5360    Py_ssize_t len = self->length;
5361    Py_ssize_t sublen = substring->length;
5362    PyObject *str;
5363
5364    for (i = j = 0; i <= len - sublen; ) {
5365	if (Py_UNICODE_MATCH(self, i, substring)) {
5366	    if (maxcount-- <= 0)
5367		break;
5368	    SPLIT_APPEND(self->str, j, i);
5369	    i = j = i + sublen;
5370	} else
5371	    i++;
5372    }
5373    if (j <= len) {
5374	SPLIT_APPEND(self->str, j, len);
5375    }
5376    return list;
5377
5378 onError:
5379    Py_DECREF(list);
5380    return NULL;
5381}
5382
5383static
5384PyObject *rsplit_whitespace(PyUnicodeObject *self,
5385			    PyObject *list,
5386			    Py_ssize_t maxcount)
5387{
5388    register Py_ssize_t i;
5389    register Py_ssize_t j;
5390    Py_ssize_t len = self->length;
5391    PyObject *str;
5392
5393    for (i = j = len - 1; i >= 0; ) {
5394	/* find a token */
5395	while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5396	    i--;
5397	j = i;
5398	while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5399	    i--;
5400	if (j > i) {
5401	    if (maxcount-- <= 0)
5402		break;
5403	    SPLIT_APPEND(self->str, i + 1, j + 1);
5404	    while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5405		i--;
5406	    j = i;
5407	}
5408    }
5409    if (j >= 0) {
5410	SPLIT_APPEND(self->str, 0, j + 1);
5411    }
5412    if (PyList_Reverse(list) < 0)
5413        goto onError;
5414    return list;
5415
5416 onError:
5417    Py_DECREF(list);
5418    return NULL;
5419}
5420
5421static
5422PyObject *rsplit_char(PyUnicodeObject *self,
5423		      PyObject *list,
5424		      Py_UNICODE ch,
5425		      Py_ssize_t maxcount)
5426{
5427    register Py_ssize_t i;
5428    register Py_ssize_t j;
5429    Py_ssize_t len = self->length;
5430    PyObject *str;
5431
5432    for (i = j = len - 1; i >= 0; ) {
5433	if (self->str[i] == ch) {
5434	    if (maxcount-- <= 0)
5435		break;
5436	    SPLIT_APPEND(self->str, i + 1, j + 1);
5437	    j = i = i - 1;
5438	} else
5439	    i--;
5440    }
5441    if (j >= -1) {
5442	SPLIT_APPEND(self->str, 0, j + 1);
5443    }
5444    if (PyList_Reverse(list) < 0)
5445        goto onError;
5446    return list;
5447
5448 onError:
5449    Py_DECREF(list);
5450    return NULL;
5451}
5452
5453static
5454PyObject *rsplit_substring(PyUnicodeObject *self,
5455			   PyObject *list,
5456			   PyUnicodeObject *substring,
5457			   Py_ssize_t maxcount)
5458{
5459    register Py_ssize_t i;
5460    register Py_ssize_t j;
5461    Py_ssize_t len = self->length;
5462    Py_ssize_t sublen = substring->length;
5463    PyObject *str;
5464
5465    for (i = len - sublen, j = len; i >= 0; ) {
5466	if (Py_UNICODE_MATCH(self, i, substring)) {
5467	    if (maxcount-- <= 0)
5468		break;
5469	    SPLIT_APPEND(self->str, i + sublen, j);
5470	    j = i;
5471	    i -= sublen;
5472	} else
5473	    i--;
5474    }
5475    if (j >= 0) {
5476	SPLIT_APPEND(self->str, 0, j);
5477    }
5478    if (PyList_Reverse(list) < 0)
5479        goto onError;
5480    return list;
5481
5482 onError:
5483    Py_DECREF(list);
5484    return NULL;
5485}
5486
5487#undef SPLIT_APPEND
5488
5489static
5490PyObject *split(PyUnicodeObject *self,
5491		PyUnicodeObject *substring,
5492		Py_ssize_t maxcount)
5493{
5494    PyObject *list;
5495
5496    if (maxcount < 0)
5497        maxcount = PY_SSIZE_T_MAX;
5498
5499    list = PyList_New(0);
5500    if (!list)
5501        return NULL;
5502
5503    if (substring == NULL)
5504	return split_whitespace(self,list,maxcount);
5505
5506    else if (substring->length == 1)
5507	return split_char(self,list,substring->str[0],maxcount);
5508
5509    else if (substring->length == 0) {
5510	Py_DECREF(list);
5511	PyErr_SetString(PyExc_ValueError, "empty separator");
5512	return NULL;
5513    }
5514    else
5515	return split_substring(self,list,substring,maxcount);
5516}
5517
5518static
5519PyObject *rsplit(PyUnicodeObject *self,
5520		 PyUnicodeObject *substring,
5521		 Py_ssize_t maxcount)
5522{
5523    PyObject *list;
5524
5525    if (maxcount < 0)
5526        maxcount = PY_SSIZE_T_MAX;
5527
5528    list = PyList_New(0);
5529    if (!list)
5530        return NULL;
5531
5532    if (substring == NULL)
5533	return rsplit_whitespace(self,list,maxcount);
5534
5535    else if (substring->length == 1)
5536	return rsplit_char(self,list,substring->str[0],maxcount);
5537
5538    else if (substring->length == 0) {
5539	Py_DECREF(list);
5540	PyErr_SetString(PyExc_ValueError, "empty separator");
5541	return NULL;
5542    }
5543    else
5544	return rsplit_substring(self,list,substring,maxcount);
5545}
5546
5547static
5548PyObject *replace(PyUnicodeObject *self,
5549		  PyUnicodeObject *str1,
5550		  PyUnicodeObject *str2,
5551		  Py_ssize_t maxcount)
5552{
5553    PyUnicodeObject *u;
5554
5555    if (maxcount < 0)
5556	maxcount = PY_SSIZE_T_MAX;
5557
5558    if (str1->length == str2->length) {
5559        /* same length */
5560        Py_ssize_t i;
5561        if (str1->length == 1) {
5562            /* replace characters */
5563            Py_UNICODE u1, u2;
5564            if (!findchar(self->str, self->length, str1->str[0]))
5565                goto nothing;
5566            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5567            if (!u)
5568                return NULL;
5569            Py_UNICODE_COPY(u->str, self->str, self->length);
5570            u1 = str1->str[0];
5571            u2 = str2->str[0];
5572            for (i = 0; i < u->length; i++)
5573                if (u->str[i] == u1) {
5574                    if (--maxcount < 0)
5575                        break;
5576                    u->str[i] = u2;
5577                }
5578        } else {
5579            i = fastsearch(
5580                self->str, self->length, str1->str, str1->length, FAST_SEARCH
5581                );
5582            if (i < 0)
5583                goto nothing;
5584            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5585            if (!u)
5586                return NULL;
5587            Py_UNICODE_COPY(u->str, self->str, self->length);
5588            while (i <= self->length - str1->length)
5589                if (Py_UNICODE_MATCH(self, i, str1)) {
5590                    if (--maxcount < 0)
5591                        break;
5592                    Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5593                    i += str1->length;
5594                } else
5595                    i++;
5596        }
5597    } else {
5598
5599        Py_ssize_t n, i, j, e;
5600        Py_ssize_t product, new_size, delta;
5601        Py_UNICODE *p;
5602
5603        /* replace strings */
5604        n = stringlib_count(self->str, self->length, str1->str, str1->length);
5605        if (n > maxcount)
5606            n = maxcount;
5607        if (n == 0)
5608            goto nothing;
5609        /* new_size = self->length + n * (str2->length - str1->length)); */
5610        delta = (str2->length - str1->length);
5611        if (delta == 0) {
5612            new_size = self->length;
5613        } else {
5614            product = n * (str2->length - str1->length);
5615            if ((product / (str2->length - str1->length)) != n) {
5616                PyErr_SetString(PyExc_OverflowError,
5617                                "replace string is too long");
5618                return NULL;
5619            }
5620            new_size = self->length + product;
5621            if (new_size < 0) {
5622                PyErr_SetString(PyExc_OverflowError,
5623                                "replace string is too long");
5624                return NULL;
5625            }
5626        }
5627        u = _PyUnicode_New(new_size);
5628        if (!u)
5629            return NULL;
5630        i = 0;
5631        p = u->str;
5632        e = self->length - str1->length;
5633        if (str1->length > 0) {
5634            while (n-- > 0) {
5635                /* look for next match */
5636                j = i;
5637                while (j <= e) {
5638                    if (Py_UNICODE_MATCH(self, j, str1))
5639                        break;
5640                    j++;
5641                }
5642		if (j > i) {
5643                    if (j > e)
5644                        break;
5645                    /* copy unchanged part [i:j] */
5646                    Py_UNICODE_COPY(p, self->str+i, j-i);
5647                    p += j - i;
5648                }
5649                /* copy substitution string */
5650                if (str2->length > 0) {
5651                    Py_UNICODE_COPY(p, str2->str, str2->length);
5652                    p += str2->length;
5653                }
5654                i = j + str1->length;
5655            }
5656            if (i < self->length)
5657                /* copy tail [i:] */
5658                Py_UNICODE_COPY(p, self->str+i, self->length-i);
5659        } else {
5660            /* interleave */
5661            while (n > 0) {
5662                Py_UNICODE_COPY(p, str2->str, str2->length);
5663                p += str2->length;
5664                if (--n <= 0)
5665                    break;
5666                *p++ = self->str[i++];
5667            }
5668            Py_UNICODE_COPY(p, self->str+i, self->length-i);
5669        }
5670    }
5671    return (PyObject *) u;
5672
5673nothing:
5674    /* nothing to replace; return original string (when possible) */
5675    if (PyUnicode_CheckExact(self)) {
5676        Py_INCREF(self);
5677        return (PyObject *) self;
5678    }
5679    return PyUnicode_FromUnicode(self->str, self->length);
5680}
5681
5682/* --- Unicode Object Methods --------------------------------------------- */
5683
5684PyDoc_STRVAR(title__doc__,
5685"S.title() -> unicode\n\
5686\n\
5687Return a titlecased version of S, i.e. words start with title case\n\
5688characters, all remaining cased characters have lower case.");
5689
5690static PyObject*
5691unicode_title(PyUnicodeObject *self)
5692{
5693    return fixup(self, fixtitle);
5694}
5695
5696PyDoc_STRVAR(capitalize__doc__,
5697"S.capitalize() -> unicode\n\
5698\n\
5699Return a capitalized version of S, i.e. make the first character\n\
5700have upper case.");
5701
5702static PyObject*
5703unicode_capitalize(PyUnicodeObject *self)
5704{
5705    return fixup(self, fixcapitalize);
5706}
5707
#if 0
/* Compiled out: capwords() is not built while this guard is 0. */
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split `self` on whitespace, capitalize every word in place in the
   temporary list, then join the words again with PyUnicode_Join() using a
   NULL separator.  Returns NULL on failure. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        result = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                       fixcapitalize);
        if (result == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, result);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    /* Shared exit: `result` is NULL here when a step above failed. */
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
5745
5746/* Argument converter.  Coerces to a single unicode character */
5747
5748static int
5749convert_uc(PyObject *obj, void *addr)
5750{
5751	Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5752	PyObject *uniobj;
5753	Py_UNICODE *unistr;
5754
5755	uniobj = PyUnicode_FromObject(obj);
5756	if (uniobj == NULL) {
5757		PyErr_SetString(PyExc_TypeError,
5758			"The fill character cannot be converted to Unicode");
5759		return 0;
5760	}
5761	if (PyUnicode_GET_SIZE(uniobj) != 1) {
5762		PyErr_SetString(PyExc_TypeError,
5763			"The fill character must be exactly one character long");
5764		Py_DECREF(uniobj);
5765		return 0;
5766	}
5767	unistr = PyUnicode_AS_UNICODE(uniobj);
5768	*fillcharloc = unistr[0];
5769	Py_DECREF(uniobj);
5770	return 1;
5771}
5772
5773PyDoc_STRVAR(center__doc__,
5774"S.center(width[, fillchar]) -> unicode\n\
5775\n\
5776Return S centered in a Unicode string of length width. Padding is\n\
5777done using the specified fill character (default is a space)");
5778
5779static PyObject *
5780unicode_center(PyUnicodeObject *self, PyObject *args)
5781{
5782    Py_ssize_t marg, left;
5783    Py_ssize_t width;
5784    Py_UNICODE fillchar = ' ';
5785
5786    if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
5787        return NULL;
5788
5789    if (self->length >= width && PyUnicode_CheckExact(self)) {
5790        Py_INCREF(self);
5791        return (PyObject*) self;
5792    }
5793
5794    marg = width - self->length;
5795    left = marg / 2 + (marg & width & 1);
5796
5797    return (PyObject*) pad(self, left, marg - left, fillchar);
5798}
5799
5800#if 0
5801
5802/* This code should go into some future Unicode collation support
5803   module. The basic comparison should compare ordinals on a naive
5804   basis (this is what Java does and thus JPython too). */
5805
5806/* speedy UTF-16 code point order comparison */
5807/* gleaned from: */
5808/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5809
5810static short utf16Fixup[32] =
5811{
5812    0, 0, 0, 0, 0, 0, 0, 0,
5813    0, 0, 0, 0, 0, 0, 0, 0,
5814    0, 0, 0, 0, 0, 0, 0, 0,
5815    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
5816};
5817
5818static int
5819unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5820{
5821    Py_ssize_t len1, len2;
5822
5823    Py_UNICODE *s1 = str1->str;
5824    Py_UNICODE *s2 = str2->str;
5825
5826    len1 = str1->length;
5827    len2 = str2->length;
5828
5829    while (len1 > 0 && len2 > 0) {
5830        Py_UNICODE c1, c2;
5831
5832        c1 = *s1++;
5833        c2 = *s2++;
5834
5835	if (c1 > (1<<11) * 26)
5836	    c1 += utf16Fixup[c1>>11];
5837	if (c2 > (1<<11) * 26)
5838            c2 += utf16Fixup[c2>>11];
5839        /* now c1 and c2 are in UTF-32-compatible order */
5840
5841        if (c1 != c2)
5842            return (c1 < c2) ? -1 : 1;
5843
5844        len1--; len2--;
5845    }
5846
5847    return (len1 < len2) ? -1 : (len1 != len2);
5848}
5849
5850#else
5851
5852static int
5853unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5854{
5855    register Py_ssize_t len1, len2;
5856
5857    Py_UNICODE *s1 = str1->str;
5858    Py_UNICODE *s2 = str2->str;
5859
5860    len1 = str1->length;
5861    len2 = str2->length;
5862
5863    while (len1 > 0 && len2 > 0) {
5864        Py_UNICODE c1, c2;
5865
5866        c1 = *s1++;
5867        c2 = *s2++;
5868
5869        if (c1 != c2)
5870            return (c1 < c2) ? -1 : 1;
5871
5872        len1--; len2--;
5873    }
5874
5875    return (len1 < len2) ? -1 : (len1 != len2);
5876}
5877
5878#endif
5879
5880int PyUnicode_Compare(PyObject *left,
5881		      PyObject *right)
5882{
5883    if (PyUnicode_Check(left) && PyUnicode_Check(right))
5884        return unicode_compare((PyUnicodeObject *)left,
5885                               (PyUnicodeObject *)right);
5886    if ((PyString_Check(left) && PyUnicode_Check(right)) ||
5887        (PyUnicode_Check(left) && PyString_Check(right))) {
5888        if (PyUnicode_Check(left))
5889            left = _PyUnicode_AsDefaultEncodedString(left, NULL);
5890        if (PyUnicode_Check(right))
5891            right = _PyUnicode_AsDefaultEncodedString(right, NULL);
5892        assert(PyString_Check(left));
5893        assert(PyString_Check(right));
5894        return PyObject_Compare(left, right);
5895    }
5896    PyErr_Format(PyExc_TypeError,
5897                 "Can't compare %.100s and %.100s",
5898                 left->ob_type->tp_name,
5899                 right->ob_type->tp_name);
5900    return -1;
5901}
5902
5903int
5904PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
5905{
5906    int i;
5907    Py_UNICODE *id;
5908    assert(PyUnicode_Check(uni));
5909    id = PyUnicode_AS_UNICODE(uni);
5910    /* Compare Unicode string and source character set string */
5911    for (i = 0; id[i] && str[i]; i++)
5912	if (id[i] != str[i])
5913	    return ((int)id[i] < (int)str[i]) ? -1 : 1;
5914    if (id[i])
5915	return 1; /* uni is longer */
5916    if (str[i])
5917	return -1; /* str is longer */
5918    return 0;
5919}
5920
5921PyObject *PyUnicode_RichCompare(PyObject *left,
5922                                PyObject *right,
5923                                int op)
5924{
5925    int result;
5926
5927    result = PyUnicode_Compare(left, right);
5928    if (result == -1 && PyErr_Occurred())
5929        goto onError;
5930
5931    /* Convert the return value to a Boolean */
5932    switch (op) {
5933    case Py_EQ:
5934        result = (result == 0);
5935        break;
5936    case Py_NE:
5937        result = (result != 0);
5938        break;
5939    case Py_LE:
5940        result = (result <= 0);
5941        break;
5942    case Py_GE:
5943        result = (result >= 0);
5944        break;
5945    case Py_LT:
5946        result = (result == -1);
5947        break;
5948    case Py_GT:
5949        result = (result == 1);
5950        break;
5951    }
5952    return PyBool_FromLong(result);
5953
5954 onError:
5955
5956    /* Standard case
5957
5958       Type errors mean that PyUnicode_FromObject() could not convert
5959       one of the arguments (usually the right hand side) to Unicode,
5960       ie. we can't handle the comparison request. However, it is
5961       possible that the other object knows a comparison method, which
5962       is why we return Py_NotImplemented to give the other object a
5963       chance.
5964
5965    */
5966    if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5967        PyErr_Clear();
5968        Py_INCREF(Py_NotImplemented);
5969        return Py_NotImplemented;
5970    }
5971    if (op != Py_EQ && op != Py_NE)
5972        return NULL;
5973
5974    /* Equality comparison.
5975
5976       This is a special case: we silence any PyExc_UnicodeDecodeError
5977       and instead turn it into a PyErr_UnicodeWarning.
5978
5979    */
5980    if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5981        return NULL;
5982    PyErr_Clear();
5983    if (PyErr_Warn(PyExc_UnicodeWarning,
5984                   (op == Py_EQ) ?
5985                   "Unicode equal comparison "
5986                   "failed to convert both arguments to Unicode - "
5987                   "interpreting them as being unequal" :
5988                   "Unicode unequal comparison "
5989                   "failed to convert both arguments to Unicode - "
5990                   "interpreting them as being unequal"
5991                   ) < 0)
5992        return NULL;
5993    result = (op == Py_NE);
5994    return PyBool_FromLong(result);
5995}
5996
5997int PyUnicode_Contains(PyObject *container,
5998		       PyObject *element)
5999{
6000    PyObject *str, *sub;
6001    int result;
6002
6003    /* Coerce the two arguments */
6004    sub = PyUnicode_FromObject(element);
6005    if (!sub) {
6006	PyErr_Format(PyExc_TypeError,
6007	    "'in <string>' requires string as left operand, not %s",
6008	    element->ob_type->tp_name);
6009        return -1;
6010    }
6011
6012    str = PyUnicode_FromObject(container);
6013    if (!str) {
6014        Py_DECREF(sub);
6015        return -1;
6016    }
6017
6018    result = stringlib_contains_obj(str, sub);
6019
6020    Py_DECREF(str);
6021    Py_DECREF(sub);
6022
6023    return result;
6024}
6025
6026/* Concat to string or Unicode object giving a new Unicode object. */
6027
6028PyObject *PyUnicode_Concat(PyObject *left,
6029			   PyObject *right)
6030{
6031    PyUnicodeObject *u = NULL, *v = NULL, *w;
6032
6033    if (PyBytes_Check(left) || PyBytes_Check(right))
6034        return PyBytes_Concat(left, right);
6035
6036    /* Coerce the two arguments */
6037    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6038    if (u == NULL)
6039	goto onError;
6040    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6041    if (v == NULL)
6042	goto onError;
6043
6044    /* Shortcuts */
6045    if (v == unicode_empty) {
6046	Py_DECREF(v);
6047	return (PyObject *)u;
6048    }
6049    if (u == unicode_empty) {
6050	Py_DECREF(u);
6051	return (PyObject *)v;
6052    }
6053
6054    /* Concat the two Unicode strings */
6055    w = _PyUnicode_New(u->length + v->length);
6056    if (w == NULL)
6057	goto onError;
6058    Py_UNICODE_COPY(w->str, u->str, u->length);
6059    Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6060
6061    Py_DECREF(u);
6062    Py_DECREF(v);
6063    return (PyObject *)w;
6064
6065onError:
6066    Py_XDECREF(u);
6067    Py_XDECREF(v);
6068    return NULL;
6069}
6070
6071void
6072PyUnicode_Append(PyObject **pleft, PyObject *right)
6073{
6074	PyObject *new;
6075	if (*pleft == NULL)
6076		return;
6077	if (right == NULL || !PyUnicode_Check(*pleft)) {
6078		Py_DECREF(*pleft);
6079		*pleft = NULL;
6080		return;
6081	}
6082	new = PyUnicode_Concat(*pleft, right);
6083	Py_DECREF(*pleft);
6084	*pleft = new;
6085}
6086
6087void
6088PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6089{
6090	PyUnicode_Append(pleft, right);
6091	Py_XDECREF(right);
6092}
6093
6094PyDoc_STRVAR(count__doc__,
6095"S.count(sub[, start[, end]]) -> int\n\
6096\n\
6097Return the number of non-overlapping occurrences of substring sub in\n\
6098Unicode string S[start:end].  Optional arguments start and end are\n\
6099interpreted as in slice notation.");
6100
6101static PyObject *
6102unicode_count(PyUnicodeObject *self, PyObject *args)
6103{
6104    PyUnicodeObject *substring;
6105    Py_ssize_t start = 0;
6106    Py_ssize_t end = PY_SSIZE_T_MAX;
6107    PyObject *result;
6108
6109    if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6110		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6111        return NULL;
6112
6113    substring = (PyUnicodeObject *)PyUnicode_FromObject(
6114        (PyObject *)substring);
6115    if (substring == NULL)
6116	return NULL;
6117
6118    FIX_START_END(self);
6119
6120    result = PyInt_FromSsize_t(
6121        stringlib_count(self->str + start, end - start,
6122                        substring->str, substring->length)
6123        );
6124
6125    Py_DECREF(substring);
6126
6127    return result;
6128}
6129
6130PyDoc_STRVAR(encode__doc__,
6131"S.encode([encoding[,errors]]) -> string or unicode\n\
6132\n\
6133Encodes S using the codec registered for encoding. encoding defaults\n\
6134to the default encoding. errors may be given to set a different error\n\
6135handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6136a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6137'xmlcharrefreplace' as well as any other name registered with\n\
6138codecs.register_error that can handle UnicodeEncodeErrors.");
6139
6140static PyObject *
6141unicode_encode(PyUnicodeObject *self, PyObject *args)
6142{
6143    char *encoding = NULL;
6144    char *errors = NULL;
6145    PyObject *v;
6146
6147    if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6148        return NULL;
6149    v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
6150    if (v == NULL)
6151        goto onError;
6152    if (!PyBytes_Check(v)) {
6153        if (PyString_Check(v)) {
6154            /* Old codec, turn it into bytes */
6155            PyObject *b = PyBytes_FromObject(v);
6156            Py_DECREF(v);
6157            return b;
6158        }
6159        PyErr_Format(PyExc_TypeError,
6160                     "encoder did not return a bytes object "
6161                     "(type=%.400s)",
6162                     v->ob_type->tp_name);
6163        Py_DECREF(v);
6164        return NULL;
6165    }
6166    return v;
6167
6168 onError:
6169    return NULL;
6170}
6171
6172PyDoc_STRVAR(decode__doc__,
6173"S.decode([encoding[,errors]]) -> string or unicode\n\
6174\n\
6175Decodes S using the codec registered for encoding. encoding defaults\n\
6176to the default encoding. errors may be given to set a different error\n\
6177handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6178a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6179as well as any other name registerd with codecs.register_error that is\n\
6180able to handle UnicodeDecodeErrors.");
6181
6182static PyObject *
6183unicode_decode(PyUnicodeObject *self, PyObject *args)
6184{
6185    char *encoding = NULL;
6186    char *errors = NULL;
6187    PyObject *v;
6188
6189    if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6190        return NULL;
6191    v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
6192    if (v == NULL)
6193        goto onError;
6194    if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6195        PyErr_Format(PyExc_TypeError,
6196                     "decoder did not return a string/unicode object "
6197                     "(type=%.400s)",
6198                     v->ob_type->tp_name);
6199        Py_DECREF(v);
6200        return NULL;
6201    }
6202    return v;
6203
6204 onError:
6205    return NULL;
6206}
6207
6208PyDoc_STRVAR(expandtabs__doc__,
6209"S.expandtabs([tabsize]) -> unicode\n\
6210\n\
6211Return a copy of S where all tab characters are expanded using spaces.\n\
6212If tabsize is not given, a tab size of 8 characters is assumed.");
6213
6214static PyObject*
6215unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6216{
6217    Py_UNICODE *e;
6218    Py_UNICODE *p;
6219    Py_UNICODE *q;
6220    Py_ssize_t i, j, old_j;
6221    PyUnicodeObject *u;
6222    int tabsize = 8;
6223
6224    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6225	return NULL;
6226
6227    /* First pass: determine size of output string */
6228    i = j = old_j = 0;
6229    e = self->str + self->length;
6230    for (p = self->str; p < e; p++)
6231        if (*p == '\t') {
6232	    if (tabsize > 0) {
6233		j += tabsize - (j % tabsize);
6234		if (old_j > j) {
6235		    PyErr_SetString(PyExc_OverflowError,
6236				    "new string is too long");
6237		    return NULL;
6238		}
6239		old_j = j;
6240	    }
6241	}
6242        else {
6243            j++;
6244            if (*p == '\n' || *p == '\r') {
6245                i += j;
6246                old_j = j = 0;
6247                if (i < 0) {
6248                    PyErr_SetString(PyExc_OverflowError,
6249                                    "new string is too long");
6250                    return NULL;
6251                }
6252            }
6253        }
6254
6255    if ((i + j) < 0) {
6256        PyErr_SetString(PyExc_OverflowError, "new string is too long");
6257        return NULL;
6258    }
6259
6260    /* Second pass: create output string and fill it */
6261    u = _PyUnicode_New(i + j);
6262    if (!u)
6263        return NULL;
6264
6265    j = 0;
6266    q = u->str;
6267
6268    for (p = self->str; p < e; p++)
6269        if (*p == '\t') {
6270	    if (tabsize > 0) {
6271		i = tabsize - (j % tabsize);
6272		j += i;
6273		while (i--)
6274		    *q++ = ' ';
6275	    }
6276	}
6277	else {
6278            j++;
6279	    *q++ = *p;
6280            if (*p == '\n' || *p == '\r')
6281                j = 0;
6282        }
6283
6284    return (PyObject*) u;
6285}
6286
6287PyDoc_STRVAR(find__doc__,
6288"S.find(sub [,start [,end]]) -> int\n\
6289\n\
6290Return the lowest index in S where substring sub is found,\n\
6291such that sub is contained within s[start,end].  Optional\n\
6292arguments start and end are interpreted as in slice notation.\n\
6293\n\
6294Return -1 on failure.");
6295
6296static PyObject *
6297unicode_find(PyUnicodeObject *self, PyObject *args)
6298{
6299    PyObject *substring;
6300    Py_ssize_t start = 0;
6301    Py_ssize_t end = PY_SSIZE_T_MAX;
6302    Py_ssize_t result;
6303
6304    if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
6305		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6306        return NULL;
6307    substring = PyUnicode_FromObject(substring);
6308    if (!substring)
6309	return NULL;
6310
6311    result = stringlib_find_slice(
6312        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6313        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6314        start, end
6315        );
6316
6317    Py_DECREF(substring);
6318
6319    return PyInt_FromSsize_t(result);
6320}
6321
6322static PyObject *
6323unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
6324{
6325    if (index < 0 || index >= self->length) {
6326        PyErr_SetString(PyExc_IndexError, "string index out of range");
6327        return NULL;
6328    }
6329
6330    return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6331}
6332
6333static long
6334unicode_hash(PyObject *self)
6335{
6336    /* Since Unicode objects compare equal to their UTF-8 string
6337       counterparts, we hash the UTF-8 string. */
6338    PyObject *v = _PyUnicode_AsDefaultEncodedString(self, NULL);
6339    return PyObject_Hash(v);
6340}
6341
6342PyDoc_STRVAR(index__doc__,
6343"S.index(sub [,start [,end]]) -> int\n\
6344\n\
6345Like S.find() but raise ValueError when the substring is not found.");
6346
6347static PyObject *
6348unicode_index(PyUnicodeObject *self, PyObject *args)
6349{
6350    Py_ssize_t result;
6351    PyObject *substring;
6352    Py_ssize_t start = 0;
6353    Py_ssize_t end = PY_SSIZE_T_MAX;
6354
6355    if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
6356		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6357        return NULL;
6358    substring = PyUnicode_FromObject(substring);
6359    if (!substring)
6360	return NULL;
6361
6362    result = stringlib_find_slice(
6363        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6364        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6365        start, end
6366        );
6367
6368    Py_DECREF(substring);
6369
6370    if (result < 0) {
6371        PyErr_SetString(PyExc_ValueError, "substring not found");
6372        return NULL;
6373    }
6374
6375    return PyInt_FromSsize_t(result);
6376}
6377
6378PyDoc_STRVAR(islower__doc__,
6379"S.islower() -> bool\n\
6380\n\
6381Return True if all cased characters in S are lowercase and there is\n\
6382at least one cased character in S, False otherwise.");
6383
6384static PyObject*
6385unicode_islower(PyUnicodeObject *self)
6386{
6387    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6388    register const Py_UNICODE *e;
6389    int cased;
6390
6391    /* Shortcut for single character strings */
6392    if (PyUnicode_GET_SIZE(self) == 1)
6393	return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
6394
6395    /* Special case for empty strings */
6396    if (PyUnicode_GET_SIZE(self) == 0)
6397	return PyBool_FromLong(0);
6398
6399    e = p + PyUnicode_GET_SIZE(self);
6400    cased = 0;
6401    for (; p < e; p++) {
6402	register const Py_UNICODE ch = *p;
6403
6404	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6405	    return PyBool_FromLong(0);
6406	else if (!cased && Py_UNICODE_ISLOWER(ch))
6407	    cased = 1;
6408    }
6409    return PyBool_FromLong(cased);
6410}
6411
6412PyDoc_STRVAR(isupper__doc__,
6413"S.isupper() -> bool\n\
6414\n\
6415Return True if all cased characters in S are uppercase and there is\n\
6416at least one cased character in S, False otherwise.");
6417
6418static PyObject*
6419unicode_isupper(PyUnicodeObject *self)
6420{
6421    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6422    register const Py_UNICODE *e;
6423    int cased;
6424
6425    /* Shortcut for single character strings */
6426    if (PyUnicode_GET_SIZE(self) == 1)
6427	return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
6428
6429    /* Special case for empty strings */
6430    if (PyUnicode_GET_SIZE(self) == 0)
6431	return PyBool_FromLong(0);
6432
6433    e = p + PyUnicode_GET_SIZE(self);
6434    cased = 0;
6435    for (; p < e; p++) {
6436	register const Py_UNICODE ch = *p;
6437
6438	if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6439	    return PyBool_FromLong(0);
6440	else if (!cased && Py_UNICODE_ISUPPER(ch))
6441	    cased = 1;
6442    }
6443    return PyBool_FromLong(cased);
6444}
6445
6446PyDoc_STRVAR(istitle__doc__,
6447"S.istitle() -> bool\n\
6448\n\
6449Return True if S is a titlecased string and there is at least one\n\
6450character in S, i.e. upper- and titlecase characters may only\n\
6451follow uncased characters and lowercase characters only cased ones.\n\
6452Return False otherwise.");
6453
6454static PyObject*
6455unicode_istitle(PyUnicodeObject *self)
6456{
6457    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6458    register const Py_UNICODE *e;
6459    int cased, previous_is_cased;
6460
6461    /* Shortcut for single character strings */
6462    if (PyUnicode_GET_SIZE(self) == 1)
6463	return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6464			       (Py_UNICODE_ISUPPER(*p) != 0));
6465
6466    /* Special case for empty strings */
6467    if (PyUnicode_GET_SIZE(self) == 0)
6468	return PyBool_FromLong(0);
6469
6470    e = p + PyUnicode_GET_SIZE(self);
6471    cased = 0;
6472    previous_is_cased = 0;
6473    for (; p < e; p++) {
6474	register const Py_UNICODE ch = *p;
6475
6476	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6477	    if (previous_is_cased)
6478		return PyBool_FromLong(0);
6479	    previous_is_cased = 1;
6480	    cased = 1;
6481	}
6482	else if (Py_UNICODE_ISLOWER(ch)) {
6483	    if (!previous_is_cased)
6484		return PyBool_FromLong(0);
6485	    previous_is_cased = 1;
6486	    cased = 1;
6487	}
6488	else
6489	    previous_is_cased = 0;
6490    }
6491    return PyBool_FromLong(cased);
6492}
6493
6494PyDoc_STRVAR(isspace__doc__,
6495"S.isspace() -> bool\n\
6496\n\
6497Return True if all characters in S are whitespace\n\
6498and there is at least one character in S, False otherwise.");
6499
6500static PyObject*
6501unicode_isspace(PyUnicodeObject *self)
6502{
6503    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6504    register const Py_UNICODE *e;
6505
6506    /* Shortcut for single character strings */
6507    if (PyUnicode_GET_SIZE(self) == 1 &&
6508	Py_UNICODE_ISSPACE(*p))
6509	return PyBool_FromLong(1);
6510
6511    /* Special case for empty strings */
6512    if (PyUnicode_GET_SIZE(self) == 0)
6513	return PyBool_FromLong(0);
6514
6515    e = p + PyUnicode_GET_SIZE(self);
6516    for (; p < e; p++) {
6517	if (!Py_UNICODE_ISSPACE(*p))
6518	    return PyBool_FromLong(0);
6519    }
6520    return PyBool_FromLong(1);
6521}
6522
6523PyDoc_STRVAR(isalpha__doc__,
6524"S.isalpha() -> bool\n\
6525\n\
6526Return True if all characters in S are alphabetic\n\
6527and there is at least one character in S, False otherwise.");
6528
6529static PyObject*
6530unicode_isalpha(PyUnicodeObject *self)
6531{
6532    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6533    register const Py_UNICODE *e;
6534
6535    /* Shortcut for single character strings */
6536    if (PyUnicode_GET_SIZE(self) == 1 &&
6537	Py_UNICODE_ISALPHA(*p))
6538	return PyBool_FromLong(1);
6539
6540    /* Special case for empty strings */
6541    if (PyUnicode_GET_SIZE(self) == 0)
6542	return PyBool_FromLong(0);
6543
6544    e = p + PyUnicode_GET_SIZE(self);
6545    for (; p < e; p++) {
6546	if (!Py_UNICODE_ISALPHA(*p))
6547	    return PyBool_FromLong(0);
6548    }
6549    return PyBool_FromLong(1);
6550}
6551
6552PyDoc_STRVAR(isalnum__doc__,
6553"S.isalnum() -> bool\n\
6554\n\
6555Return True if all characters in S are alphanumeric\n\
6556and there is at least one character in S, False otherwise.");
6557
6558static PyObject*
6559unicode_isalnum(PyUnicodeObject *self)
6560{
6561    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6562    register const Py_UNICODE *e;
6563
6564    /* Shortcut for single character strings */
6565    if (PyUnicode_GET_SIZE(self) == 1 &&
6566	Py_UNICODE_ISALNUM(*p))
6567	return PyBool_FromLong(1);
6568
6569    /* Special case for empty strings */
6570    if (PyUnicode_GET_SIZE(self) == 0)
6571	return PyBool_FromLong(0);
6572
6573    e = p + PyUnicode_GET_SIZE(self);
6574    for (; p < e; p++) {
6575	if (!Py_UNICODE_ISALNUM(*p))
6576	    return PyBool_FromLong(0);
6577    }
6578    return PyBool_FromLong(1);
6579}
6580
6581PyDoc_STRVAR(isdecimal__doc__,
6582"S.isdecimal() -> bool\n\
6583\n\
6584Return True if there are only decimal characters in S,\n\
6585False otherwise.");
6586
6587static PyObject*
6588unicode_isdecimal(PyUnicodeObject *self)
6589{
6590    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6591    register const Py_UNICODE *e;
6592
6593    /* Shortcut for single character strings */
6594    if (PyUnicode_GET_SIZE(self) == 1 &&
6595	Py_UNICODE_ISDECIMAL(*p))
6596	return PyBool_FromLong(1);
6597
6598    /* Special case for empty strings */
6599    if (PyUnicode_GET_SIZE(self) == 0)
6600	return PyBool_FromLong(0);
6601
6602    e = p + PyUnicode_GET_SIZE(self);
6603    for (; p < e; p++) {
6604	if (!Py_UNICODE_ISDECIMAL(*p))
6605	    return PyBool_FromLong(0);
6606    }
6607    return PyBool_FromLong(1);
6608}
6609
6610PyDoc_STRVAR(isdigit__doc__,
6611"S.isdigit() -> bool\n\
6612\n\
6613Return True if all characters in S are digits\n\
6614and there is at least one character in S, False otherwise.");
6615
6616static PyObject*
6617unicode_isdigit(PyUnicodeObject *self)
6618{
6619    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6620    register const Py_UNICODE *e;
6621
6622    /* Shortcut for single character strings */
6623    if (PyUnicode_GET_SIZE(self) == 1 &&
6624	Py_UNICODE_ISDIGIT(*p))
6625	return PyBool_FromLong(1);
6626
6627    /* Special case for empty strings */
6628    if (PyUnicode_GET_SIZE(self) == 0)
6629	return PyBool_FromLong(0);
6630
6631    e = p + PyUnicode_GET_SIZE(self);
6632    for (; p < e; p++) {
6633	if (!Py_UNICODE_ISDIGIT(*p))
6634	    return PyBool_FromLong(0);
6635    }
6636    return PyBool_FromLong(1);
6637}
6638
6639PyDoc_STRVAR(isnumeric__doc__,
6640"S.isnumeric() -> bool\n\
6641\n\
6642Return True if there are only numeric characters in S,\n\
6643False otherwise.");
6644
6645static PyObject*
6646unicode_isnumeric(PyUnicodeObject *self)
6647{
6648    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6649    register const Py_UNICODE *e;
6650
6651    /* Shortcut for single character strings */
6652    if (PyUnicode_GET_SIZE(self) == 1 &&
6653	Py_UNICODE_ISNUMERIC(*p))
6654	return PyBool_FromLong(1);
6655
6656    /* Special case for empty strings */
6657    if (PyUnicode_GET_SIZE(self) == 0)
6658	return PyBool_FromLong(0);
6659
6660    e = p + PyUnicode_GET_SIZE(self);
6661    for (; p < e; p++) {
6662	if (!Py_UNICODE_ISNUMERIC(*p))
6663	    return PyBool_FromLong(0);
6664    }
6665    return PyBool_FromLong(1);
6666}
6667
6668PyDoc_STRVAR(join__doc__,
6669"S.join(sequence) -> unicode\n\
6670\n\
6671Return a string which is the concatenation of the strings in the\n\
6672sequence.  The separator between elements is S.");
6673
6674static PyObject*
6675unicode_join(PyObject *self, PyObject *data)
6676{
6677    return PyUnicode_Join(self, data);
6678}
6679
6680static Py_ssize_t
6681unicode_length(PyUnicodeObject *self)
6682{
6683    return self->length;
6684}
6685
6686PyDoc_STRVAR(ljust__doc__,
6687"S.ljust(width[, fillchar]) -> int\n\
6688\n\
6689Return S left justified in a Unicode string of length width. Padding is\n\
6690done using the specified fill character (default is a space).");
6691
6692static PyObject *
6693unicode_ljust(PyUnicodeObject *self, PyObject *args)
6694{
6695    Py_ssize_t width;
6696    Py_UNICODE fillchar = ' ';
6697
6698    if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
6699        return NULL;
6700
6701    if (self->length >= width && PyUnicode_CheckExact(self)) {
6702        Py_INCREF(self);
6703        return (PyObject*) self;
6704    }
6705
6706    return (PyObject*) pad(self, 0, width - self->length, fillchar);
6707}
6708
6709PyDoc_STRVAR(lower__doc__,
6710"S.lower() -> unicode\n\
6711\n\
6712Return a copy of the string S converted to lowercase.");
6713
6714static PyObject*
6715unicode_lower(PyUnicodeObject *self)
6716{
6717    return fixup(self, fixlower);
6718}
6719
/* Strip modes understood by the strip helpers below; the values double
   as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the strip mode above.
   Each consists of the three characters "|O:" followed by the method
   name. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: the format string with its leading
   "|O:" skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
6728
6729/* externally visible for str.strip(unicode) */
6730PyObject *
6731_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6732{
6733	Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6734	Py_ssize_t len = PyUnicode_GET_SIZE(self);
6735	Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
6736	Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6737	Py_ssize_t i, j;
6738
6739        BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6740
6741	i = 0;
6742	if (striptype != RIGHTSTRIP) {
6743            while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6744                i++;
6745            }
6746	}
6747
6748	j = len;
6749	if (striptype != LEFTSTRIP) {
6750            do {
6751                j--;
6752            } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6753            j++;
6754	}
6755
6756	if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6757            Py_INCREF(self);
6758            return (PyObject*)self;
6759	}
6760	else
6761            return PyUnicode_FromUnicode(s+i, j-i);
6762}
6763
6764
6765static PyObject *
6766do_strip(PyUnicodeObject *self, int striptype)
6767{
6768	Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6769	Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
6770
6771	i = 0;
6772	if (striptype != RIGHTSTRIP) {
6773		while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6774			i++;
6775		}
6776	}
6777
6778	j = len;
6779	if (striptype != LEFTSTRIP) {
6780		do {
6781			j--;
6782		} while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6783		j++;
6784	}
6785
6786	if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6787		Py_INCREF(self);
6788		return (PyObject*)self;
6789	}
6790	else
6791		return PyUnicode_FromUnicode(s+i, j-i);
6792}
6793
6794
6795static PyObject *
6796do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6797{
6798	PyObject *sep = NULL;
6799
6800	if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6801		return NULL;
6802
6803	if (sep != NULL && sep != Py_None) {
6804		if (PyUnicode_Check(sep))
6805			return _PyUnicode_XStrip(self, striptype, sep);
6806		else if (PyString_Check(sep)) {
6807			PyObject *res;
6808			sep = PyUnicode_FromObject(sep);
6809			if (sep==NULL)
6810				return NULL;
6811			res = _PyUnicode_XStrip(self, striptype, sep);
6812			Py_DECREF(sep);
6813			return res;
6814		}
6815		else {
6816			PyErr_Format(PyExc_TypeError,
6817				     "%s arg must be None, unicode or str",
6818				     STRIPNAME(striptype));
6819			return NULL;
6820		}
6821	}
6822
6823	return do_strip(self, striptype);
6824}
6825
6826
6827PyDoc_STRVAR(strip__doc__,
6828"S.strip([chars]) -> unicode\n\
6829\n\
6830Return a copy of the string S with leading and trailing\n\
6831whitespace removed.\n\
6832If chars is given and not None, remove characters in chars instead.\n\
6833If chars is a str, it will be converted to unicode before stripping");
6834
6835static PyObject *
6836unicode_strip(PyUnicodeObject *self, PyObject *args)
6837{
6838	if (PyTuple_GET_SIZE(args) == 0)
6839		return do_strip(self, BOTHSTRIP); /* Common case */
6840	else
6841		return do_argstrip(self, BOTHSTRIP, args);
6842}
6843
6844
6845PyDoc_STRVAR(lstrip__doc__,
6846"S.lstrip([chars]) -> unicode\n\
6847\n\
6848Return a copy of the string S with leading whitespace removed.\n\
6849If chars is given and not None, remove characters in chars instead.\n\
6850If chars is a str, it will be converted to unicode before stripping");
6851
6852static PyObject *
6853unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6854{
6855	if (PyTuple_GET_SIZE(args) == 0)
6856		return do_strip(self, LEFTSTRIP); /* Common case */
6857	else
6858		return do_argstrip(self, LEFTSTRIP, args);
6859}
6860
6861
6862PyDoc_STRVAR(rstrip__doc__,
6863"S.rstrip([chars]) -> unicode\n\
6864\n\
6865Return a copy of the string S with trailing whitespace removed.\n\
6866If chars is given and not None, remove characters in chars instead.\n\
6867If chars is a str, it will be converted to unicode before stripping");
6868
6869static PyObject *
6870unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6871{
6872	if (PyTuple_GET_SIZE(args) == 0)
6873		return do_strip(self, RIGHTSTRIP); /* Common case */
6874	else
6875		return do_argstrip(self, RIGHTSTRIP, args);
6876}
6877
6878
6879static PyObject*
6880unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
6881{
6882    PyUnicodeObject *u;
6883    Py_UNICODE *p;
6884    Py_ssize_t nchars;
6885    size_t nbytes;
6886
6887    if (len < 0)
6888        len = 0;
6889
6890    if (len == 1 && PyUnicode_CheckExact(str)) {
6891        /* no repeat, return original string */
6892        Py_INCREF(str);
6893        return (PyObject*) str;
6894    }
6895
6896    /* ensure # of chars needed doesn't overflow int and # of bytes
6897     * needed doesn't overflow size_t
6898     */
6899    nchars = len * str->length;
6900    if (len && nchars / len != str->length) {
6901        PyErr_SetString(PyExc_OverflowError,
6902                        "repeated string is too long");
6903        return NULL;
6904    }
6905    nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6906    if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6907        PyErr_SetString(PyExc_OverflowError,
6908                        "repeated string is too long");
6909        return NULL;
6910    }
6911    u = _PyUnicode_New(nchars);
6912    if (!u)
6913        return NULL;
6914
6915    p = u->str;
6916
6917    if (str->length == 1 && len > 0) {
6918        Py_UNICODE_FILL(p, str->str[0], len);
6919    } else {
6920	Py_ssize_t done = 0; /* number of characters copied this far */
6921	if (done < nchars) {
6922            Py_UNICODE_COPY(p, str->str, str->length);
6923            done = str->length;
6924	}
6925	while (done < nchars) {
6926            int n = (done <= nchars-done) ? done : nchars-done;
6927            Py_UNICODE_COPY(p+done, p, n);
6928            done += n;
6929	}
6930    }
6931
6932    return (PyObject*) u;
6933}
6934
6935PyObject *PyUnicode_Replace(PyObject *obj,
6936			    PyObject *subobj,
6937			    PyObject *replobj,
6938			    Py_ssize_t maxcount)
6939{
6940    PyObject *self;
6941    PyObject *str1;
6942    PyObject *str2;
6943    PyObject *result;
6944
6945    self = PyUnicode_FromObject(obj);
6946    if (self == NULL)
6947	return NULL;
6948    str1 = PyUnicode_FromObject(subobj);
6949    if (str1 == NULL) {
6950	Py_DECREF(self);
6951	return NULL;
6952    }
6953    str2 = PyUnicode_FromObject(replobj);
6954    if (str2 == NULL) {
6955	Py_DECREF(self);
6956	Py_DECREF(str1);
6957	return NULL;
6958    }
6959    result = replace((PyUnicodeObject *)self,
6960		     (PyUnicodeObject *)str1,
6961		     (PyUnicodeObject *)str2,
6962		     maxcount);
6963    Py_DECREF(self);
6964    Py_DECREF(str1);
6965    Py_DECREF(str2);
6966    return result;
6967}
6968
6969PyDoc_STRVAR(replace__doc__,
6970"S.replace (old, new[, maxsplit]) -> unicode\n\
6971\n\
6972Return a copy of S with all occurrences of substring\n\
6973old replaced by new.  If the optional argument maxsplit is\n\
6974given, only the first maxsplit occurrences are replaced.");
6975
6976static PyObject*
6977unicode_replace(PyUnicodeObject *self, PyObject *args)
6978{
6979    PyUnicodeObject *str1;
6980    PyUnicodeObject *str2;
6981    Py_ssize_t maxcount = -1;
6982    PyObject *result;
6983
6984    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
6985        return NULL;
6986    str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6987    if (str1 == NULL)
6988	return NULL;
6989    str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
6990    if (str2 == NULL) {
6991	Py_DECREF(str1);
6992	return NULL;
6993    }
6994
6995    result = replace(self, str1, str2, maxcount);
6996
6997    Py_DECREF(str1);
6998    Py_DECREF(str2);
6999    return result;
7000}
7001
7002static
7003PyObject *unicode_repr(PyObject *unicode)
7004{
7005    PyObject *repr;
7006    Py_UNICODE *p;
7007    Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7008    Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7009
7010    /* XXX(nnorwitz): rather than over-allocating, it would be
7011       better to choose a different scheme.  Perhaps scan the
7012       first N-chars of the string and allocate based on that size.
7013    */
7014    /* Initial allocation is based on the longest-possible unichr
7015       escape.
7016
7017       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7018       unichr, so in this case it's the longest unichr escape. In
7019       narrow (UTF-16) builds this is five chars per source unichr
7020       since there are two unichrs in the surrogate pair, so in narrow
7021       (UTF-16) builds it's not the longest unichr escape.
7022
7023       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7024       so in the narrow (UTF-16) build case it's the longest unichr
7025       escape.
7026    */
7027
7028    repr = PyUnicode_FromUnicode(NULL,
7029        2 /* quotes */
7030#ifdef Py_UNICODE_WIDE
7031        + 10*size
7032#else
7033        + 6*size
7034#endif
7035        + 1);
7036    if (repr == NULL)
7037        return NULL;
7038
7039    p = PyUnicode_AS_UNICODE(repr);
7040
7041    /* Add quote */
7042    *p++ = (findchar(s, size, '\'') &&
7043            !findchar(s, size, '"')) ? '"' : '\'';
7044    while (size-- > 0) {
7045        Py_UNICODE ch = *s++;
7046
7047        /* Escape quotes and backslashes */
7048        if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
7049            *p++ = '\\';
7050            *p++ = ch;
7051            continue;
7052        }
7053
7054#ifdef Py_UNICODE_WIDE
7055        /* Map 21-bit characters to '\U00xxxxxx' */
7056        else if (ch >= 0x10000) {
7057            *p++ = '\\';
7058            *p++ = 'U';
7059            *p++ = hexdigits[(ch >> 28) & 0x0000000F];
7060            *p++ = hexdigits[(ch >> 24) & 0x0000000F];
7061            *p++ = hexdigits[(ch >> 20) & 0x0000000F];
7062            *p++ = hexdigits[(ch >> 16) & 0x0000000F];
7063            *p++ = hexdigits[(ch >> 12) & 0x0000000F];
7064            *p++ = hexdigits[(ch >> 8) & 0x0000000F];
7065            *p++ = hexdigits[(ch >> 4) & 0x0000000F];
7066            *p++ = hexdigits[ch & 0x0000000F];
7067	    continue;
7068        }
7069#else
7070	/* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
7071	else if (ch >= 0xD800 && ch < 0xDC00) {
7072	    Py_UNICODE ch2;
7073	    Py_UCS4 ucs;
7074
7075	    ch2 = *s++;
7076	    size--;
7077	    if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7078		ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7079		*p++ = '\\';
7080		*p++ = 'U';
7081		*p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7082		*p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7083		*p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7084		*p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7085		*p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7086		*p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7087		*p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7088		*p++ = hexdigits[ucs & 0x0000000F];
7089		continue;
7090	    }
7091	    /* Fall through: isolated surrogates are copied as-is */
7092	    s--;
7093	    size++;
7094	}
7095#endif
7096
7097        /* Map 16-bit characters to '\uxxxx' */
7098        if (ch >= 256) {
7099            *p++ = '\\';
7100            *p++ = 'u';
7101            *p++ = hexdigits[(ch >> 12) & 0x000F];
7102            *p++ = hexdigits[(ch >> 8) & 0x000F];
7103            *p++ = hexdigits[(ch >> 4) & 0x000F];
7104            *p++ = hexdigits[ch & 0x000F];
7105        }
7106
7107        /* Map special whitespace to '\t', \n', '\r' */
7108        else if (ch == '\t') {
7109            *p++ = '\\';
7110            *p++ = 't';
7111        }
7112        else if (ch == '\n') {
7113            *p++ = '\\';
7114            *p++ = 'n';
7115        }
7116        else if (ch == '\r') {
7117            *p++ = '\\';
7118            *p++ = 'r';
7119        }
7120
7121        /* Map non-printable US ASCII to '\xhh' */
7122        else if (ch < ' ' || ch >= 0x7F) {
7123            *p++ = '\\';
7124            *p++ = 'x';
7125            *p++ = hexdigits[(ch >> 4) & 0x000F];
7126            *p++ = hexdigits[ch & 0x000F];
7127        }
7128
7129        /* Copy everything else as-is */
7130        else
7131            *p++ = (char) ch;
7132    }
7133    /* Add quote */
7134    *p++ = PyUnicode_AS_UNICODE(repr)[0];
7135
7136    *p = '\0';
7137    _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
7138    return repr;
7139}
7140
7141PyDoc_STRVAR(rfind__doc__,
7142"S.rfind(sub [,start [,end]]) -> int\n\
7143\n\
7144Return the highest index in S where substring sub is found,\n\
7145such that sub is contained within s[start,end].  Optional\n\
7146arguments start and end are interpreted as in slice notation.\n\
7147\n\
7148Return -1 on failure.");
7149
7150static PyObject *
7151unicode_rfind(PyUnicodeObject *self, PyObject *args)
7152{
7153    PyObject *substring;
7154    Py_ssize_t start = 0;
7155    Py_ssize_t end = PY_SSIZE_T_MAX;
7156    Py_ssize_t result;
7157
7158    if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
7159		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7160        return NULL;
7161    substring = PyUnicode_FromObject(substring);
7162    if (!substring)
7163	return NULL;
7164
7165    result = stringlib_rfind_slice(
7166        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7167        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7168        start, end
7169        );
7170
7171    Py_DECREF(substring);
7172
7173    return PyInt_FromSsize_t(result);
7174}
7175
7176PyDoc_STRVAR(rindex__doc__,
7177"S.rindex(sub [,start [,end]]) -> int\n\
7178\n\
7179Like S.rfind() but raise ValueError when the substring is not found.");
7180
7181static PyObject *
7182unicode_rindex(PyUnicodeObject *self, PyObject *args)
7183{
7184    PyObject *substring;
7185    Py_ssize_t start = 0;
7186    Py_ssize_t end = PY_SSIZE_T_MAX;
7187    Py_ssize_t result;
7188
7189    if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
7190		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7191        return NULL;
7192    substring = PyUnicode_FromObject(substring);
7193    if (!substring)
7194	return NULL;
7195
7196    result = stringlib_rfind_slice(
7197        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7198        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7199        start, end
7200        );
7201
7202    Py_DECREF(substring);
7203
7204    if (result < 0) {
7205        PyErr_SetString(PyExc_ValueError, "substring not found");
7206        return NULL;
7207    }
7208    return PyInt_FromSsize_t(result);
7209}
7210
7211PyDoc_STRVAR(rjust__doc__,
7212"S.rjust(width[, fillchar]) -> unicode\n\
7213\n\
7214Return S right justified in a Unicode string of length width. Padding is\n\
7215done using the specified fill character (default is a space).");
7216
7217static PyObject *
7218unicode_rjust(PyUnicodeObject *self, PyObject *args)
7219{
7220    Py_ssize_t width;
7221    Py_UNICODE fillchar = ' ';
7222
7223    if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
7224        return NULL;
7225
7226    if (self->length >= width && PyUnicode_CheckExact(self)) {
7227        Py_INCREF(self);
7228        return (PyObject*) self;
7229    }
7230
7231    return (PyObject*) pad(self, width - self->length, 0, fillchar);
7232}
7233
7234static PyObject*
7235unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
7236{
7237    /* standard clamping */
7238    if (start < 0)
7239        start = 0;
7240    if (end < 0)
7241        end = 0;
7242    if (end > self->length)
7243        end = self->length;
7244    if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
7245        /* full slice, return original string */
7246        Py_INCREF(self);
7247        return (PyObject*) self;
7248    }
7249    if (start > end)
7250        start = end;
7251    /* copy slice */
7252    return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7253					     end - start);
7254}
7255
7256PyObject *PyUnicode_Split(PyObject *s,
7257			  PyObject *sep,
7258			  Py_ssize_t maxsplit)
7259{
7260    PyObject *result;
7261
7262    s = PyUnicode_FromObject(s);
7263    if (s == NULL)
7264	return NULL;
7265    if (sep != NULL) {
7266	sep = PyUnicode_FromObject(sep);
7267	if (sep == NULL) {
7268	    Py_DECREF(s);
7269	    return NULL;
7270	}
7271    }
7272
7273    result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7274
7275    Py_DECREF(s);
7276    Py_XDECREF(sep);
7277    return result;
7278}
7279
7280PyDoc_STRVAR(split__doc__,
7281"S.split([sep [,maxsplit]]) -> list of strings\n\
7282\n\
7283Return a list of the words in S, using sep as the\n\
7284delimiter string.  If maxsplit is given, at most maxsplit\n\
7285splits are done. If sep is not specified or is None,\n\
7286any whitespace string is a separator.");
7287
7288static PyObject*
7289unicode_split(PyUnicodeObject *self, PyObject *args)
7290{
7291    PyObject *substring = Py_None;
7292    Py_ssize_t maxcount = -1;
7293
7294    if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
7295        return NULL;
7296
7297    if (substring == Py_None)
7298	return split(self, NULL, maxcount);
7299    else if (PyUnicode_Check(substring))
7300	return split(self, (PyUnicodeObject *)substring, maxcount);
7301    else
7302	return PyUnicode_Split((PyObject *)self, substring, maxcount);
7303}
7304
7305PyObject *
7306PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7307{
7308    PyObject* str_obj;
7309    PyObject* sep_obj;
7310    PyObject* out;
7311
7312    str_obj = PyUnicode_FromObject(str_in);
7313    if (!str_obj)
7314	return NULL;
7315    sep_obj = PyUnicode_FromObject(sep_in);
7316    if (!sep_obj) {
7317        Py_DECREF(str_obj);
7318        return NULL;
7319    }
7320
7321    out = stringlib_partition(
7322        str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7323        sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7324        );
7325
7326    Py_DECREF(sep_obj);
7327    Py_DECREF(str_obj);
7328
7329    return out;
7330}
7331
7332
7333PyObject *
7334PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7335{
7336    PyObject* str_obj;
7337    PyObject* sep_obj;
7338    PyObject* out;
7339
7340    str_obj = PyUnicode_FromObject(str_in);
7341    if (!str_obj)
7342	return NULL;
7343    sep_obj = PyUnicode_FromObject(sep_in);
7344    if (!sep_obj) {
7345        Py_DECREF(str_obj);
7346        return NULL;
7347    }
7348
7349    out = stringlib_rpartition(
7350        str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7351        sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7352        );
7353
7354    Py_DECREF(sep_obj);
7355    Py_DECREF(str_obj);
7356
7357    return out;
7358}
7359
7360PyDoc_STRVAR(partition__doc__,
7361"S.partition(sep) -> (head, sep, tail)\n\
7362\n\
7363Searches for the separator sep in S, and returns the part before it,\n\
7364the separator itself, and the part after it.  If the separator is not\n\
7365found, returns S and two empty strings.");
7366
7367static PyObject*
7368unicode_partition(PyUnicodeObject *self, PyObject *separator)
7369{
7370    return PyUnicode_Partition((PyObject *)self, separator);
7371}
7372
7373PyDoc_STRVAR(rpartition__doc__,
7374"S.rpartition(sep) -> (tail, sep, head)\n\
7375\n\
7376Searches for the separator sep in S, starting at the end of S, and returns\n\
7377the part before it, the separator itself, and the part after it.  If the\n\
7378separator is not found, returns two empty strings and S.");
7379
7380static PyObject*
7381unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7382{
7383    return PyUnicode_RPartition((PyObject *)self, separator);
7384}
7385
7386PyObject *PyUnicode_RSplit(PyObject *s,
7387			   PyObject *sep,
7388			   Py_ssize_t maxsplit)
7389{
7390    PyObject *result;
7391
7392    s = PyUnicode_FromObject(s);
7393    if (s == NULL)
7394	return NULL;
7395    if (sep != NULL) {
7396	sep = PyUnicode_FromObject(sep);
7397	if (sep == NULL) {
7398	    Py_DECREF(s);
7399	    return NULL;
7400	}
7401    }
7402
7403    result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7404
7405    Py_DECREF(s);
7406    Py_XDECREF(sep);
7407    return result;
7408}
7409
7410PyDoc_STRVAR(rsplit__doc__,
7411"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7412\n\
7413Return a list of the words in S, using sep as the\n\
7414delimiter string, starting at the end of the string and\n\
7415working to the front.  If maxsplit is given, at most maxsplit\n\
7416splits are done. If sep is not specified, any whitespace string\n\
7417is a separator.");
7418
7419static PyObject*
7420unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7421{
7422    PyObject *substring = Py_None;
7423    Py_ssize_t maxcount = -1;
7424
7425    if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
7426        return NULL;
7427
7428    if (substring == Py_None)
7429	return rsplit(self, NULL, maxcount);
7430    else if (PyUnicode_Check(substring))
7431	return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7432    else
7433	return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7434}
7435
7436PyDoc_STRVAR(splitlines__doc__,
7437"S.splitlines([keepends]]) -> list of strings\n\
7438\n\
7439Return a list of the lines in S, breaking at line boundaries.\n\
7440Line breaks are not included in the resulting list unless keepends\n\
7441is given and true.");
7442
7443static PyObject*
7444unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7445{
7446    int keepends = 0;
7447
7448    if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
7449        return NULL;
7450
7451    return PyUnicode_Splitlines((PyObject *)self, keepends);
7452}
7453
7454static
7455PyObject *unicode_str(PyObject *self)
7456{
7457    if (PyUnicode_CheckExact(self)) {
7458        Py_INCREF(self);
7459        return self;
7460    } else
7461        /* Subtype -- return genuine unicode string with the same value. */
7462        return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7463                                     PyUnicode_GET_SIZE(self));
7464}
7465
7466PyDoc_STRVAR(swapcase__doc__,
7467"S.swapcase() -> unicode\n\
7468\n\
7469Return a copy of S with uppercase characters converted to lowercase\n\
7470and vice versa.");
7471
7472static PyObject*
7473unicode_swapcase(PyUnicodeObject *self)
7474{
7475    return fixup(self, fixswapcase);
7476}
7477
7478PyDoc_STRVAR(translate__doc__,
7479"S.translate(table) -> unicode\n\
7480\n\
7481Return a copy of the string S, where all characters have been mapped\n\
7482through the given translation table, which must be a mapping of\n\
7483Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7484Unmapped characters are left untouched. Characters mapped to None\n\
7485are deleted.");
7486
7487static PyObject*
7488unicode_translate(PyUnicodeObject *self, PyObject *table)
7489{
7490    return PyUnicode_TranslateCharmap(self->str,
7491				      self->length,
7492				      table,
7493				      "ignore");
7494}
7495
7496PyDoc_STRVAR(upper__doc__,
7497"S.upper() -> unicode\n\
7498\n\
7499Return a copy of S converted to uppercase.");
7500
7501static PyObject*
7502unicode_upper(PyUnicodeObject *self)
7503{
7504    return fixup(self, fixupper);
7505}
7506
7507PyDoc_STRVAR(zfill__doc__,
7508"S.zfill(width) -> unicode\n\
7509\n\
7510Pad a numeric string x with zeros on the left, to fill a field\n\
7511of the specified width. The string x is never truncated.");
7512
7513static PyObject *
7514unicode_zfill(PyUnicodeObject *self, PyObject *args)
7515{
7516    Py_ssize_t fill;
7517    PyUnicodeObject *u;
7518
7519    Py_ssize_t width;
7520    if (!PyArg_ParseTuple(args, "n:zfill", &width))
7521        return NULL;
7522
7523    if (self->length >= width) {
7524        if (PyUnicode_CheckExact(self)) {
7525            Py_INCREF(self);
7526            return (PyObject*) self;
7527        }
7528        else
7529            return PyUnicode_FromUnicode(
7530                PyUnicode_AS_UNICODE(self),
7531                PyUnicode_GET_SIZE(self)
7532            );
7533    }
7534
7535    fill = width - self->length;
7536
7537    u = pad(self, fill, 0, '0');
7538
7539    if (u == NULL)
7540        return NULL;
7541
7542    if (u->str[fill] == '+' || u->str[fill] == '-') {
7543        /* move sign to beginning of string */
7544        u->str[0] = u->str[fill];
7545        u->str[fill] = '0';
7546    }
7547
7548    return (PyObject*) u;
7549}
7550
#if 0
/* Compiled out.  Debugging aid that would return the current value of
   unicode_freelist_size as a Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyInt_FromLong(unicode_freelist_size);
}
#endif
7558
7559PyDoc_STRVAR(startswith__doc__,
7560"S.startswith(prefix[, start[, end]]) -> bool\n\
7561\n\
7562Return True if S starts with the specified prefix, False otherwise.\n\
7563With optional start, test S beginning at that position.\n\
7564With optional end, stop comparing S at that position.\n\
7565prefix can also be a tuple of strings to try.");
7566
7567static PyObject *
7568unicode_startswith(PyUnicodeObject *self,
7569		   PyObject *args)
7570{
7571    PyObject *subobj;
7572    PyUnicodeObject *substring;
7573    Py_ssize_t start = 0;
7574    Py_ssize_t end = PY_SSIZE_T_MAX;
7575    int result;
7576
7577    if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
7578		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7579	return NULL;
7580    if (PyTuple_Check(subobj)) {
7581        Py_ssize_t i;
7582        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7583            substring = (PyUnicodeObject *)PyUnicode_FromObject(
7584                            PyTuple_GET_ITEM(subobj, i));
7585            if (substring == NULL)
7586                return NULL;
7587            result = tailmatch(self, substring, start, end, -1);
7588            Py_DECREF(substring);
7589            if (result) {
7590                Py_RETURN_TRUE;
7591            }
7592        }
7593        /* nothing matched */
7594        Py_RETURN_FALSE;
7595    }
7596    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7597    if (substring == NULL)
7598         return NULL;
7599    result = tailmatch(self, substring, start, end, -1);
7600    Py_DECREF(substring);
7601    return PyBool_FromLong(result);
7602}
7603
7604
7605PyDoc_STRVAR(endswith__doc__,
7606"S.endswith(suffix[, start[, end]]) -> bool\n\
7607\n\
7608Return True if S ends with the specified suffix, False otherwise.\n\
7609With optional start, test S beginning at that position.\n\
7610With optional end, stop comparing S at that position.\n\
7611suffix can also be a tuple of strings to try.");
7612
7613static PyObject *
7614unicode_endswith(PyUnicodeObject *self,
7615		 PyObject *args)
7616{
7617    PyObject *subobj;
7618    PyUnicodeObject *substring;
7619    Py_ssize_t start = 0;
7620    Py_ssize_t end = PY_SSIZE_T_MAX;
7621    int result;
7622
7623    if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7624        _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7625	return NULL;
7626    if (PyTuple_Check(subobj)) {
7627        Py_ssize_t i;
7628        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7629            substring = (PyUnicodeObject *)PyUnicode_FromObject(
7630                            PyTuple_GET_ITEM(subobj, i));
7631            if (substring == NULL)
7632            return NULL;
7633            result = tailmatch(self, substring, start, end, +1);
7634            Py_DECREF(substring);
7635            if (result) {
7636                Py_RETURN_TRUE;
7637            }
7638        }
7639        Py_RETURN_FALSE;
7640    }
7641    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7642    if (substring == NULL)
7643    return NULL;
7644
7645    result = tailmatch(self, substring, start, end, +1);
7646    Py_DECREF(substring);
7647    return PyBool_FromLong(result);
7648}
7649
7650
7651
7652static PyObject *
7653unicode_getnewargs(PyUnicodeObject *v)
7654{
7655	return Py_BuildValue("(u#)", v->str, v->length);
7656}
7657
7658
7659static PyMethodDef unicode_methods[] = {
7660
7661    /* Order is according to common usage: often used methods should
7662       appear first, since lookup is done sequentially. */
7663
7664    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7665    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7666    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
7667    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
7668    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7669    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7670    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7671    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7672    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7673    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7674    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
7675    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
7676    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7677    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7678    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
7679    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
7680    {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
7681/*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7682    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7683    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7684    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
7685    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
7686    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
7687    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
7688    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
7689    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7690    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7691    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7692    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7693    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7694    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7695    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7696    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7697    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7698    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7699    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7700    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7701    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7702    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
7703    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
7704#if 0
7705    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
7706#endif
7707
7708#if 0
7709    /* This one is just used for debugging the implementation. */
7710    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
7711#endif
7712
7713    {"__getnewargs__",	(PyCFunction)unicode_getnewargs, METH_NOARGS},
7714    {NULL, NULL}
7715};
7716
7717static PyObject *
7718unicode_mod(PyObject *v, PyObject *w)
7719{
7720       if (!PyUnicode_Check(v)) {
7721               Py_INCREF(Py_NotImplemented);
7722               return Py_NotImplemented;
7723       }
7724       return PyUnicode_Format(v, w);
7725}
7726
7727static PyNumberMethods unicode_as_number = {
7728	0,				/*nb_add*/
7729	0,				/*nb_subtract*/
7730	0,				/*nb_multiply*/
7731	unicode_mod,			/*nb_remainder*/
7732};
7733
7734static PySequenceMethods unicode_as_sequence = {
7735    (lenfunc) unicode_length, 		/* sq_length */
7736    PyUnicode_Concat,		 	/* sq_concat */
7737    (ssizeargfunc) unicode_repeat, 	/* sq_repeat */
7738    (ssizeargfunc) unicode_getitem, 	/* sq_item */
7739    (ssizessizeargfunc) unicode_slice, 	/* sq_slice */
7740    0, 					/* sq_ass_item */
7741    0, 					/* sq_ass_slice */
7742    PyUnicode_Contains, 		/* sq_contains */
7743};
7744
7745static PyObject*
7746unicode_subscript(PyUnicodeObject* self, PyObject* item)
7747{
7748    if (PyIndex_Check(item)) {
7749        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
7750        if (i == -1 && PyErr_Occurred())
7751            return NULL;
7752        if (i < 0)
7753            i += PyUnicode_GET_SIZE(self);
7754        return unicode_getitem(self, i);
7755    } else if (PySlice_Check(item)) {
7756        Py_ssize_t start, stop, step, slicelength, cur, i;
7757        Py_UNICODE* source_buf;
7758        Py_UNICODE* result_buf;
7759        PyObject* result;
7760
7761        if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
7762				 &start, &stop, &step, &slicelength) < 0) {
7763            return NULL;
7764        }
7765
7766        if (slicelength <= 0) {
7767            return PyUnicode_FromUnicode(NULL, 0);
7768        } else {
7769            source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
7770            result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7771                                                    sizeof(Py_UNICODE));
7772
7773	    if (result_buf == NULL)
7774		    return PyErr_NoMemory();
7775
7776            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7777                result_buf[i] = source_buf[cur];
7778            }
7779
7780            result = PyUnicode_FromUnicode(result_buf, slicelength);
7781            PyMem_FREE(result_buf);
7782            return result;
7783        }
7784    } else {
7785        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7786        return NULL;
7787    }
7788}
7789
7790static PyMappingMethods unicode_as_mapping = {
7791    (lenfunc)unicode_length,		/* mp_length */
7792    (binaryfunc)unicode_subscript,	/* mp_subscript */
7793    (objobjargproc)0,			/* mp_ass_subscript */
7794};
7795
7796static Py_ssize_t
7797unicode_buffer_getreadbuf(PyUnicodeObject *self,
7798			  Py_ssize_t index,
7799			  const void **ptr)
7800{
7801    if (index != 0) {
7802        PyErr_SetString(PyExc_SystemError,
7803			"accessing non-existent unicode segment");
7804        return -1;
7805    }
7806    *ptr = (void *) self->str;
7807    return PyUnicode_GET_DATA_SIZE(self);
7808}
7809
7810static Py_ssize_t
7811unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
7812			   const void **ptr)
7813{
7814    PyErr_SetString(PyExc_TypeError,
7815		    "cannot use unicode as modifiable buffer");
7816    return -1;
7817}
7818
7819static int
7820unicode_buffer_getsegcount(PyUnicodeObject *self,
7821			   Py_ssize_t *lenp)
7822{
7823    if (lenp)
7824        *lenp = PyUnicode_GET_DATA_SIZE(self);
7825    return 1;
7826}
7827
7828static Py_ssize_t
7829unicode_buffer_getcharbuf(PyUnicodeObject *self,
7830			  Py_ssize_t index,
7831			  const void **ptr)
7832{
7833    PyObject *str;
7834
7835    if (index != 0) {
7836        PyErr_SetString(PyExc_SystemError,
7837			"accessing non-existent unicode segment");
7838        return -1;
7839    }
7840    str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
7841    if (str == NULL)
7842	return -1;
7843    *ptr = (void *) PyString_AS_STRING(str);
7844    return PyString_GET_SIZE(str);
7845}
7846
7847/* Helpers for PyUnicode_Format() */
7848
7849static PyObject *
7850getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
7851{
7852    Py_ssize_t argidx = *p_argidx;
7853    if (argidx < arglen) {
7854	(*p_argidx)++;
7855	if (arglen < 0)
7856	    return args;
7857	else
7858	    return PyTuple_GetItem(args, argidx);
7859    }
7860    PyErr_SetString(PyExc_TypeError,
7861		    "not enough arguments for format string");
7862    return NULL;
7863}
7864
/* Bit flags collected while parsing a format specifier; each is set by
   the flag character noted alongside it. */
#define F_LJUST (1<<0)  /* '-' */
#define F_SIGN  (1<<1)  /* '+' */
#define F_BLANK (1<<2)  /* ' ' */
#define F_ALT   (1<<3)  /* '#' */
#define F_ZERO  (1<<4)  /* '0' */
7870
7871static Py_ssize_t
7872strtounicode(Py_UNICODE *buffer, const char *charbuffer)
7873{
7874    register Py_ssize_t i;
7875    Py_ssize_t len = strlen(charbuffer);
7876    for (i = len - 1; i >= 0; i--)
7877	buffer[i] = (Py_UNICODE) charbuffer[i];
7878
7879    return len;
7880}
7881
7882static int
7883doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7884{
7885    Py_ssize_t result;
7886
7887    PyOS_ascii_formatd((char *)buffer, len, format, x);
7888    result = strtounicode(buffer, (char *)buffer);
7889    return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
7890}
7891
7892static int
7893longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7894{
7895    Py_ssize_t result;
7896
7897    PyOS_snprintf((char *)buffer, len, format, x);
7898    result = strtounicode(buffer, (char *)buffer);
7899    return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
7900}
7901
7902/* XXX To save some code duplication, formatfloat/long/int could have been
7903   shared with stringobject.c, converting from 8-bit to Unicode after the
7904   formatting is done. */
7905
7906static int
7907formatfloat(Py_UNICODE *buf,
7908	    size_t buflen,
7909	    int flags,
7910	    int prec,
7911	    int type,
7912	    PyObject *v)
7913{
7914    /* fmt = '%#.' + `prec` + `type`
7915       worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
7916    char fmt[20];
7917    double x;
7918
7919    x = PyFloat_AsDouble(v);
7920    if (x == -1.0 && PyErr_Occurred())
7921	return -1;
7922    if (prec < 0)
7923	prec = 6;
7924    if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7925	type = 'g';
7926    /* Worst case length calc to ensure no buffer overrun:
7927
7928       'g' formats:
7929	 fmt = %#.<prec>g
7930	 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7931	    for any double rep.)
7932	 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7933
7934       'f' formats:
7935	 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7936	 len = 1 + 50 + 1 + prec = 52 + prec
7937
7938       If prec=0 the effective precision is 1 (the leading digit is
7939       always given), therefore increase the length by one.
7940
7941    */
7942    if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
7943	(type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
7944	PyErr_SetString(PyExc_OverflowError,
7945			"formatted float is too long (precision too large?)");
7946	return -1;
7947    }
7948    PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7949		  (flags&F_ALT) ? "#" : "",
7950		  prec, type);
7951    return doubletounicode(buf, buflen, fmt, x);
7952}
7953
7954static PyObject*
7955formatlong(PyObject *val, int flags, int prec, int type)
7956{
7957	char *buf;
7958	int len;
7959	PyObject *str; /* temporary string object. */
7960	PyObject *result;
7961
7962	str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7963	if (!str)
7964		return NULL;
7965	result = PyUnicode_FromStringAndSize(buf, len);
7966	Py_DECREF(str);
7967	return result;
7968}
7969
7970static int
7971formatint(Py_UNICODE *buf,
7972	  size_t buflen,
7973	  int flags,
7974	  int prec,
7975	  int type,
7976	  PyObject *v)
7977{
7978    /* fmt = '%#.' + `prec` + 'l' + `type`
7979     * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7980     *                     + 1 + 1
7981     *                   = 24
7982     */
7983    char fmt[64]; /* plenty big enough! */
7984    char *sign;
7985    long x;
7986
7987    x = PyInt_AsLong(v);
7988    if (x == -1 && PyErr_Occurred())
7989        return -1;
7990    if (x < 0 && type == 'u') {
7991        type = 'd';
7992    }
7993    if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7994        sign = "-";
7995    else
7996        sign = "";
7997    if (prec < 0)
7998        prec = 1;
7999
8000    /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8001     * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
8002     */
8003    if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
8004        PyErr_SetString(PyExc_OverflowError,
8005    	        "formatted integer is too long (precision too large?)");
8006        return -1;
8007    }
8008
8009    if ((flags & F_ALT) &&
8010        (type == 'x' || type == 'X' || type == 'o')) {
8011        /* When converting under %#o, %#x or %#X, there are a number
8012         * of issues that cause pain:
8013	 * - for %#o, we want a different base marker than C
8014         * - when 0 is being converted, the C standard leaves off
8015         *   the '0x' or '0X', which is inconsistent with other
8016         *   %#x/%#X conversions and inconsistent with Python's
8017         *   hex() function
8018         * - there are platforms that violate the standard and
8019         *   convert 0 with the '0x' or '0X'
8020         *   (Metrowerks, Compaq Tru64)
8021         * - there are platforms that give '0x' when converting
8022         *   under %#X, but convert 0 in accordance with the
8023         *   standard (OS/2 EMX)
8024         *
8025         * We can achieve the desired consistency by inserting our
8026         * own '0x' or '0X' prefix, and substituting %x/%X in place
8027         * of %#x/%#X.
8028         *
8029         * Note that this is the same approach as used in
8030         * formatint() in stringobject.c
8031         */
8032        PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8033                      sign, type, prec, type);
8034    }
8035    else {
8036        PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8037                      sign, (flags&F_ALT) ? "#" : "",
8038                      prec, type);
8039    }
8040    if (sign[0])
8041        return longtounicode(buf, buflen, fmt, -x);
8042    else
8043        return longtounicode(buf, buflen, fmt, x);
8044}
8045
8046static int
8047formatchar(Py_UNICODE *buf,
8048           size_t buflen,
8049           PyObject *v)
8050{
8051    /* presume that the buffer is at least 2 characters long */
8052    if (PyUnicode_Check(v)) {
8053	if (PyUnicode_GET_SIZE(v) != 1)
8054	    goto onError;
8055	buf[0] = PyUnicode_AS_UNICODE(v)[0];
8056    }
8057
8058    else if (PyString_Check(v)) {
8059	if (PyString_GET_SIZE(v) != 1)
8060	    goto onError;
8061	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
8062    }
8063
8064    else {
8065	/* Integer input truncated to a character */
8066        long x;
8067	x = PyInt_AsLong(v);
8068	if (x == -1 && PyErr_Occurred())
8069	    goto onError;
8070#ifdef Py_UNICODE_WIDE
8071	if (x < 0 || x > 0x10ffff) {
8072	    PyErr_SetString(PyExc_OverflowError,
8073			    "%c arg not in range(0x110000) "
8074			    "(wide Python build)");
8075	    return -1;
8076	}
8077#else
8078	if (x < 0 || x > 0xffff) {
8079	    PyErr_SetString(PyExc_OverflowError,
8080			    "%c arg not in range(0x10000) "
8081			    "(narrow Python build)");
8082	    return -1;
8083	}
8084#endif
8085	buf[0] = (Py_UNICODE) x;
8086    }
8087    buf[1] = '\0';
8088    return 1;
8089
8090 onError:
8091    PyErr_SetString(PyExc_TypeError,
8092		    "%c requires int or char");
8093    return -1;
8094}
8095
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The expansion is fully parenthesised so that the cast binds as one
   unit wherever the macro is used inside a larger expression.
*/
#define FORMATBUFLEN ((size_t)120)
8105
8106PyObject *PyUnicode_Format(PyObject *format,
8107			   PyObject *args)
8108{
8109    Py_UNICODE *fmt, *res;
8110    Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
8111    int args_owned = 0;
8112    PyUnicodeObject *result = NULL;
8113    PyObject *dict = NULL;
8114    PyObject *uformat;
8115
8116    if (format == NULL || args == NULL) {
8117	PyErr_BadInternalCall();
8118	return NULL;
8119    }
8120    uformat = PyUnicode_FromObject(format);
8121    if (uformat == NULL)
8122	return NULL;
8123    fmt = PyUnicode_AS_UNICODE(uformat);
8124    fmtcnt = PyUnicode_GET_SIZE(uformat);
8125
8126    reslen = rescnt = fmtcnt + 100;
8127    result = _PyUnicode_New(reslen);
8128    if (result == NULL)
8129	goto onError;
8130    res = PyUnicode_AS_UNICODE(result);
8131
8132    if (PyTuple_Check(args)) {
8133	arglen = PyTuple_Size(args);
8134	argidx = 0;
8135    }
8136    else {
8137	arglen = -1;
8138	argidx = -2;
8139    }
8140    if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
8141        !PyObject_TypeCheck(args, &PyBaseString_Type))
8142	dict = args;
8143
8144    while (--fmtcnt >= 0) {
8145	if (*fmt != '%') {
8146	    if (--rescnt < 0) {
8147		rescnt = fmtcnt + 100;
8148		reslen += rescnt;
8149		if (_PyUnicode_Resize(&result, reslen) < 0)
8150		    goto onError;
8151		res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8152		--rescnt;
8153	    }
8154	    *res++ = *fmt++;
8155	}
8156	else {
8157	    /* Got a format specifier */
8158	    int flags = 0;
8159	    Py_ssize_t width = -1;
8160	    int prec = -1;
8161	    Py_UNICODE c = '\0';
8162	    Py_UNICODE fill;
8163	    PyObject *v = NULL;
8164	    PyObject *temp = NULL;
8165	    Py_UNICODE *pbuf;
8166	    Py_UNICODE sign;
8167	    Py_ssize_t len;
8168	    Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
8169
8170	    fmt++;
8171	    if (*fmt == '(') {
8172		Py_UNICODE *keystart;
8173		Py_ssize_t keylen;
8174		PyObject *key;
8175		int pcount = 1;
8176
8177		if (dict == NULL) {
8178		    PyErr_SetString(PyExc_TypeError,
8179				    "format requires a mapping");
8180		    goto onError;
8181		}
8182		++fmt;
8183		--fmtcnt;
8184		keystart = fmt;
8185		/* Skip over balanced parentheses */
8186		while (pcount > 0 && --fmtcnt >= 0) {
8187		    if (*fmt == ')')
8188			--pcount;
8189		    else if (*fmt == '(')
8190			++pcount;
8191		    fmt++;
8192		}
8193		keylen = fmt - keystart - 1;
8194		if (fmtcnt < 0 || pcount > 0) {
8195		    PyErr_SetString(PyExc_ValueError,
8196				    "incomplete format key");
8197		    goto onError;
8198		}
8199#if 0
8200		/* keys are converted to strings using UTF-8 and
8201		   then looked up since Python uses strings to hold
8202		   variables names etc. in its namespaces and we
8203		   wouldn't want to break common idioms. */
8204		key = PyUnicode_EncodeUTF8(keystart,
8205					   keylen,
8206					   NULL);
8207#else
8208		key = PyUnicode_FromUnicode(keystart, keylen);
8209#endif
8210		if (key == NULL)
8211		    goto onError;
8212		if (args_owned) {
8213		    Py_DECREF(args);
8214		    args_owned = 0;
8215		}
8216		args = PyObject_GetItem(dict, key);
8217		Py_DECREF(key);
8218		if (args == NULL) {
8219		    goto onError;
8220		}
8221		args_owned = 1;
8222		arglen = -1;
8223		argidx = -2;
8224	    }
8225	    while (--fmtcnt >= 0) {
8226		switch (c = *fmt++) {
8227		case '-': flags |= F_LJUST; continue;
8228		case '+': flags |= F_SIGN; continue;
8229		case ' ': flags |= F_BLANK; continue;
8230		case '#': flags |= F_ALT; continue;
8231		case '0': flags |= F_ZERO; continue;
8232		}
8233		break;
8234	    }
8235	    if (c == '*') {
8236		v = getnextarg(args, arglen, &argidx);
8237		if (v == NULL)
8238		    goto onError;
8239		if (!PyInt_Check(v)) {
8240		    PyErr_SetString(PyExc_TypeError,
8241				    "* wants int");
8242		    goto onError;
8243		}
8244		width = PyInt_AsLong(v);
8245		if (width == -1 && PyErr_Occurred())
8246			goto onError;
8247		if (width < 0) {
8248		    flags |= F_LJUST;
8249		    width = -width;
8250		}
8251		if (--fmtcnt >= 0)
8252		    c = *fmt++;
8253	    }
8254	    else if (c >= '0' && c <= '9') {
8255		width = c - '0';
8256		while (--fmtcnt >= 0) {
8257		    c = *fmt++;
8258		    if (c < '0' || c > '9')
8259			break;
8260		    if ((width*10) / 10 != width) {
8261			PyErr_SetString(PyExc_ValueError,
8262					"width too big");
8263			goto onError;
8264		    }
8265		    width = width*10 + (c - '0');
8266		}
8267	    }
8268	    if (c == '.') {
8269		prec = 0;
8270		if (--fmtcnt >= 0)
8271		    c = *fmt++;
8272		if (c == '*') {
8273		    v = getnextarg(args, arglen, &argidx);
8274		    if (v == NULL)
8275			goto onError;
8276		    if (!PyInt_Check(v)) {
8277			PyErr_SetString(PyExc_TypeError,
8278					"* wants int");
8279			goto onError;
8280		    }
8281		    prec = PyInt_AsLong(v);
8282		    if (prec == -1 && PyErr_Occurred())
8283			goto onError;
8284		    if (prec < 0)
8285			prec = 0;
8286		    if (--fmtcnt >= 0)
8287			c = *fmt++;
8288		}
8289		else if (c >= '0' && c <= '9') {
8290		    prec = c - '0';
8291		    while (--fmtcnt >= 0) {
8292			c = Py_CHARMASK(*fmt++);
8293			if (c < '0' || c > '9')
8294			    break;
8295			if ((prec*10) / 10 != prec) {
8296			    PyErr_SetString(PyExc_ValueError,
8297					    "prec too big");
8298			    goto onError;
8299			}
8300			prec = prec*10 + (c - '0');
8301		    }
8302		}
8303	    } /* prec */
8304	    if (fmtcnt >= 0) {
8305		if (c == 'h' || c == 'l' || c == 'L') {
8306		    if (--fmtcnt >= 0)
8307			c = *fmt++;
8308		}
8309	    }
8310	    if (fmtcnt < 0) {
8311		PyErr_SetString(PyExc_ValueError,
8312				"incomplete format");
8313		goto onError;
8314	    }
8315	    if (c != '%') {
8316		v = getnextarg(args, arglen, &argidx);
8317		if (v == NULL)
8318		    goto onError;
8319	    }
8320	    sign = 0;
8321	    fill = ' ';
8322	    switch (c) {
8323
8324	    case '%':
8325		pbuf = formatbuf;
8326		/* presume that buffer length is at least 1 */
8327		pbuf[0] = '%';
8328		len = 1;
8329		break;
8330
8331	    case 's':
8332	    case 'r':
8333		if (PyUnicode_Check(v) && c == 's') {
8334		    temp = v;
8335		    Py_INCREF(temp);
8336		}
8337		else {
8338		    PyObject *unicode;
8339		    if (c == 's')
8340			temp = PyObject_Unicode(v);
8341		    else
8342			temp = PyObject_Repr(v);
8343		    if (temp == NULL)
8344			goto onError;
8345                    if (PyUnicode_Check(temp))
8346                        /* nothing to do */;
8347                    else if (PyString_Check(temp)) {
8348                        /* convert to string to Unicode */
8349		        unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8350						   PyString_GET_SIZE(temp),
8351						   NULL,
8352						   "strict");
8353		        Py_DECREF(temp);
8354		        temp = unicode;
8355		        if (temp == NULL)
8356			    goto onError;
8357		    }
8358		    else {
8359			Py_DECREF(temp);
8360			PyErr_SetString(PyExc_TypeError,
8361					"%s argument has non-string str()");
8362			goto onError;
8363		    }
8364		}
8365		pbuf = PyUnicode_AS_UNICODE(temp);
8366		len = PyUnicode_GET_SIZE(temp);
8367		if (prec >= 0 && len > prec)
8368		    len = prec;
8369		break;
8370
8371	    case 'i':
8372	    case 'd':
8373	    case 'u':
8374	    case 'o':
8375	    case 'x':
8376	    case 'X':
8377		if (c == 'i')
8378		    c = 'd';
8379		if (PyLong_Check(v)) {
8380		    temp = formatlong(v, flags, prec, c);
8381		    if (!temp)
8382			goto onError;
8383		    pbuf = PyUnicode_AS_UNICODE(temp);
8384		    len = PyUnicode_GET_SIZE(temp);
8385		    sign = 1;
8386		}
8387		else {
8388		    pbuf = formatbuf;
8389		    len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8390				    flags, prec, c, v);
8391		    if (len < 0)
8392			goto onError;
8393		    sign = 1;
8394		}
8395		if (flags & F_ZERO)
8396		    fill = '0';
8397		break;
8398
8399	    case 'e':
8400	    case 'E':
8401	    case 'f':
8402	    case 'F':
8403	    case 'g':
8404	    case 'G':
8405		if (c == 'F')
8406			c = 'f';
8407		pbuf = formatbuf;
8408		len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8409			flags, prec, c, v);
8410		if (len < 0)
8411		    goto onError;
8412		sign = 1;
8413		if (flags & F_ZERO)
8414		    fill = '0';
8415		break;
8416
8417	    case 'c':
8418		pbuf = formatbuf;
8419		len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8420		if (len < 0)
8421		    goto onError;
8422		break;
8423
8424	    default:
8425		PyErr_Format(PyExc_ValueError,
8426			     "unsupported format character '%c' (0x%x) "
8427			     "at index %zd",
8428			     (31<=c && c<=126) ? (char)c : '?',
8429                             (int)c,
8430			     (Py_ssize_t)(fmt - 1 -
8431					  PyUnicode_AS_UNICODE(uformat)));
8432		goto onError;
8433	    }
8434	    if (sign) {
8435		if (*pbuf == '-' || *pbuf == '+') {
8436		    sign = *pbuf++;
8437		    len--;
8438		}
8439		else if (flags & F_SIGN)
8440		    sign = '+';
8441		else if (flags & F_BLANK)
8442		    sign = ' ';
8443		else
8444		    sign = 0;
8445	    }
8446	    if (width < len)
8447		width = len;
8448	    if (rescnt - (sign != 0) < width) {
8449		reslen -= rescnt;
8450		rescnt = width + fmtcnt + 100;
8451		reslen += rescnt;
8452		if (reslen < 0) {
8453		    Py_XDECREF(temp);
8454		    PyErr_NoMemory();
8455		    goto onError;
8456		}
8457		if (_PyUnicode_Resize(&result, reslen) < 0) {
8458		    Py_XDECREF(temp);
8459		    goto onError;
8460		}
8461		res = PyUnicode_AS_UNICODE(result)
8462		    + reslen - rescnt;
8463	    }
8464	    if (sign) {
8465		if (fill != ' ')
8466		    *res++ = sign;
8467		rescnt--;
8468		if (width > len)
8469		    width--;
8470	    }
8471	    if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
8472		assert(pbuf[0] == '0');
8473		assert(pbuf[1] == c);
8474		if (fill != ' ') {
8475		    *res++ = *pbuf++;
8476		    *res++ = *pbuf++;
8477		}
8478		rescnt -= 2;
8479		width -= 2;
8480		if (width < 0)
8481		    width = 0;
8482		len -= 2;
8483	    }
8484	    if (width > len && !(flags & F_LJUST)) {
8485		do {
8486		    --rescnt;
8487		    *res++ = fill;
8488		} while (--width > len);
8489	    }
8490	    if (fill == ' ') {
8491		if (sign)
8492		    *res++ = sign;
8493		if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
8494		    assert(pbuf[0] == '0');
8495		    assert(pbuf[1] == c);
8496		    *res++ = *pbuf++;
8497		    *res++ = *pbuf++;
8498		}
8499	    }
8500	    Py_UNICODE_COPY(res, pbuf, len);
8501	    res += len;
8502	    rescnt -= len;
8503	    while (--width >= len) {
8504		--rescnt;
8505		*res++ = ' ';
8506	    }
8507	    if (dict && (argidx < arglen) && c != '%') {
8508		PyErr_SetString(PyExc_TypeError,
8509				"not all arguments converted during string formatting");
8510                Py_XDECREF(temp);
8511		goto onError;
8512	    }
8513	    Py_XDECREF(temp);
8514	} /* '%' */
8515    } /* until end */
8516    if (argidx < arglen && !dict) {
8517	PyErr_SetString(PyExc_TypeError,
8518			"not all arguments converted during string formatting");
8519	goto onError;
8520    }
8521
8522    if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8523	goto onError;
8524    if (args_owned) {
8525	Py_DECREF(args);
8526    }
8527    Py_DECREF(uformat);
8528    return (PyObject *)result;
8529
8530 onError:
8531    Py_XDECREF(result);
8532    Py_DECREF(uformat);
8533    if (args_owned) {
8534	Py_DECREF(args);
8535    }
8536    return NULL;
8537}
8538
8539static PyBufferProcs unicode_as_buffer = {
8540    (readbufferproc) unicode_buffer_getreadbuf,
8541    (writebufferproc) unicode_buffer_getwritebuf,
8542    (segcountproc) unicode_buffer_getsegcount,
8543    (charbufferproc) unicode_buffer_getcharbuf,
8544};
8545
8546static PyObject *
8547unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8548
8549static PyObject *
8550unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8551{
8552        PyObject *x = NULL;
8553	static char *kwlist[] = {"string", "encoding", "errors", 0};
8554	char *encoding = NULL;
8555	char *errors = NULL;
8556
8557	if (type != &PyUnicode_Type)
8558		return unicode_subtype_new(type, args, kwds);
8559	if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8560					  kwlist, &x, &encoding, &errors))
8561	    return NULL;
8562	if (x == NULL)
8563		return (PyObject *)_PyUnicode_New(0);
8564	if (encoding == NULL && errors == NULL)
8565	    return PyObject_Unicode(x);
8566	else
8567	return PyUnicode_FromEncodedObject(x, encoding, errors);
8568}
8569
8570static PyObject *
8571unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8572{
8573	PyUnicodeObject *tmp, *pnew;
8574	Py_ssize_t n;
8575
8576	assert(PyType_IsSubtype(type, &PyUnicode_Type));
8577	tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8578	if (tmp == NULL)
8579		return NULL;
8580	assert(PyUnicode_Check(tmp));
8581	pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8582	if (pnew == NULL) {
8583		Py_DECREF(tmp);
8584		return NULL;
8585	}
8586	pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8587	if (pnew->str == NULL) {
8588		_Py_ForgetReference((PyObject *)pnew);
8589		PyObject_Del(pnew);
8590		Py_DECREF(tmp);
8591		return PyErr_NoMemory();
8592	}
8593	Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8594	pnew->length = n;
8595	pnew->hash = tmp->hash;
8596	Py_DECREF(tmp);
8597	return (PyObject *)pnew;
8598}
8599
8600PyDoc_STRVAR(unicode_doc,
8601"unicode(string [, encoding[, errors]]) -> object\n\
8602\n\
8603Create a new Unicode object from the given encoded string.\n\
8604encoding defaults to the current default string encoding.\n\
8605errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
8606
8607static PyObject *unicode_iter(PyObject *seq);
8608
8609PyTypeObject PyUnicode_Type = {
8610    PyObject_HEAD_INIT(&PyType_Type)
8611    0, 					/* ob_size */
8612    "str", 				/* tp_name */
8613    sizeof(PyUnicodeObject), 		/* tp_size */
8614    0, 					/* tp_itemsize */
8615    /* Slots */
8616    (destructor)unicode_dealloc, 	/* tp_dealloc */
8617    0, 					/* tp_print */
8618    0,				 	/* tp_getattr */
8619    0, 					/* tp_setattr */
8620    0, 					/* tp_compare */
8621    unicode_repr, 			/* tp_repr */
8622    &unicode_as_number, 		/* tp_as_number */
8623    &unicode_as_sequence, 		/* tp_as_sequence */
8624    &unicode_as_mapping, 		/* tp_as_mapping */
8625    (hashfunc) unicode_hash, 		/* tp_hash*/
8626    0, 					/* tp_call*/
8627    (reprfunc) unicode_str,	 	/* tp_str */
8628    PyObject_GenericGetAttr, 		/* tp_getattro */
8629    0,			 		/* tp_setattro */
8630    &unicode_as_buffer,			/* tp_as_buffer */
8631    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
8632        Py_TPFLAGS_UNICODE_SUBCLASS,	/* tp_flags */
8633    unicode_doc,			/* tp_doc */
8634    0,					/* tp_traverse */
8635    0,					/* tp_clear */
8636    PyUnicode_RichCompare,		/* tp_richcompare */
8637    0,					/* tp_weaklistoffset */
8638    unicode_iter,			/* tp_iter */
8639    0,					/* tp_iternext */
8640    unicode_methods,			/* tp_methods */
8641    0,					/* tp_members */
8642    0,					/* tp_getset */
8643    &PyBaseString_Type,			/* tp_base */
8644    0,					/* tp_dict */
8645    0,					/* tp_descr_get */
8646    0,					/* tp_descr_set */
8647    0,					/* tp_dictoffset */
8648    0,					/* tp_init */
8649    0,					/* tp_alloc */
8650    unicode_new,			/* tp_new */
8651    PyObject_Del,      		/* tp_free */
8652};
8653
8654/* Initialize the Unicode implementation */
8655
8656void _PyUnicode_Init(void)
8657{
8658    int i;
8659
8660    /* XXX - move this array to unicodectype.c ? */
8661    Py_UNICODE linebreak[] = {
8662        0x000A, /* LINE FEED */
8663        0x000D, /* CARRIAGE RETURN */
8664        0x001C, /* FILE SEPARATOR */
8665        0x001D, /* GROUP SEPARATOR */
8666        0x001E, /* RECORD SEPARATOR */
8667        0x0085, /* NEXT LINE */
8668        0x2028, /* LINE SEPARATOR */
8669        0x2029, /* PARAGRAPH SEPARATOR */
8670    };
8671
8672    /* Init the implementation */
8673    unicode_freelist = NULL;
8674    unicode_freelist_size = 0;
8675    unicode_empty = _PyUnicode_New(0);
8676    if (!unicode_empty)
8677	return;
8678
8679    for (i = 0; i < 256; i++)
8680	unicode_latin1[i] = NULL;
8681    if (PyType_Ready(&PyUnicode_Type) < 0)
8682	Py_FatalError("Can't initialize 'unicode'");
8683
8684    /* initialize the linebreak bloom filter */
8685    bloom_linebreak = make_bloom_mask(
8686        linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8687        );
8688
8689    PyType_Ready(&EncodingMapType);
8690}
8691
8692/* Finalize the Unicode implementation */
8693
8694void
8695_PyUnicode_Fini(void)
8696{
8697    PyUnicodeObject *u;
8698    int i;
8699
8700    Py_XDECREF(unicode_empty);
8701    unicode_empty = NULL;
8702
8703    for (i = 0; i < 256; i++) {
8704	if (unicode_latin1[i]) {
8705	    Py_DECREF(unicode_latin1[i]);
8706	    unicode_latin1[i] = NULL;
8707	}
8708    }
8709
8710    for (u = unicode_freelist; u != NULL;) {
8711	PyUnicodeObject *v = u;
8712	u = *(PyUnicodeObject **)u;
8713	if (v->str)
8714	    PyMem_DEL(v->str);
8715	Py_XDECREF(v->defenc);
8716	PyObject_Del(v);
8717    }
8718    unicode_freelist = NULL;
8719    unicode_freelist_size = 0;
8720}
8721
8722void
8723PyUnicode_InternInPlace(PyObject **p)
8724{
8725	register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
8726	PyObject *t;
8727	if (s == NULL || !PyUnicode_Check(s))
8728		Py_FatalError(
8729		    "PyUnicode_InternInPlace: unicode strings only please!");
8730	/* If it's a subclass, we don't really know what putting
8731	   it in the interned dict might do. */
8732	if (!PyUnicode_CheckExact(s))
8733		return;
8734	if (PyUnicode_CHECK_INTERNED(s))
8735		return;
8736	if (interned == NULL) {
8737		interned = PyDict_New();
8738		if (interned == NULL) {
8739			PyErr_Clear(); /* Don't leave an exception */
8740			return;
8741		}
8742	}
8743	/* It might be that the GetItem call fails even
8744	   though the key is present in the dictionary,
8745	   namely when this happens during a stack overflow. */
8746	Py_ALLOW_RECURSION
8747	t = PyDict_GetItem(interned, (PyObject *)s);
8748	Py_END_ALLOW_RECURSION
8749
8750	if (t) {
8751		Py_INCREF(t);
8752		Py_DECREF(*p);
8753		*p = t;
8754		return;
8755	}
8756
8757	PyThreadState_GET()->recursion_critical = 1;
8758	if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
8759		PyErr_Clear();
8760		PyThreadState_GET()->recursion_critical = 0;
8761		return;
8762	}
8763	PyThreadState_GET()->recursion_critical = 0;
8764	/* The two references in interned are not counted by refcnt.
8765	   The deallocator will take care of this */
8766	s->ob_refcnt -= 2;
8767	PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
8768}
8769
8770void
8771PyUnicode_InternImmortal(PyObject **p)
8772{
8773	PyUnicode_InternInPlace(p);
8774	if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
8775		PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
8776		Py_INCREF(*p);
8777	}
8778}
8779
8780PyObject *
8781PyUnicode_InternFromString(const char *cp)
8782{
8783	PyObject *s = PyUnicode_FromString(cp);
8784	if (s == NULL)
8785		return NULL;
8786	PyUnicode_InternInPlace(&s);
8787	return s;
8788}
8789
8790void _Py_ReleaseInternedUnicodeStrings(void)
8791{
8792	PyObject *keys;
8793	PyUnicodeObject *s;
8794	Py_ssize_t i, n;
8795	Py_ssize_t immortal_size = 0, mortal_size = 0;
8796
8797	if (interned == NULL || !PyDict_Check(interned))
8798		return;
8799	keys = PyDict_Keys(interned);
8800	if (keys == NULL || !PyList_Check(keys)) {
8801		PyErr_Clear();
8802		return;
8803	}
8804
8805	/* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
8806	   detector, interned unicode strings are not forcibly deallocated;
8807	   rather, we give them their stolen references back, and then clear
8808	   and DECREF the interned dict. */
8809
8810	n = PyList_GET_SIZE(keys);
8811	fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
8812		n);
8813	for (i = 0; i < n; i++) {
8814		s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
8815		switch (s->state) {
8816		case SSTATE_NOT_INTERNED:
8817			/* XXX Shouldn't happen */
8818			break;
8819		case SSTATE_INTERNED_IMMORTAL:
8820			s->ob_refcnt += 1;
8821			immortal_size += s->length;
8822			break;
8823		case SSTATE_INTERNED_MORTAL:
8824			s->ob_refcnt += 2;
8825			mortal_size += s->length;
8826			break;
8827		default:
8828			Py_FatalError("Inconsistent interned string state.");
8829		}
8830		s->state = SSTATE_NOT_INTERNED;
8831	}
8832	fprintf(stderr, "total size of all interned strings: "
8833			"%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
8834			"mortal/immortal\n", mortal_size, immortal_size);
8835	Py_DECREF(keys);
8836	PyDict_Clear(interned);
8837	Py_DECREF(interned);
8838	interned = NULL;
8839}
8840
8841
8842/********************* Unicode Iterator **************************/
8843
8844typedef struct {
8845	PyObject_HEAD
8846	Py_ssize_t it_index;
8847	PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
8848} unicodeiterobject;
8849
8850static void
8851unicodeiter_dealloc(unicodeiterobject *it)
8852{
8853	_PyObject_GC_UNTRACK(it);
8854	Py_XDECREF(it->it_seq);
8855	PyObject_GC_Del(it);
8856}
8857
8858static int
8859unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
8860{
8861	Py_VISIT(it->it_seq);
8862	return 0;
8863}
8864
8865static PyObject *
8866unicodeiter_next(unicodeiterobject *it)
8867{
8868	PyUnicodeObject *seq;
8869	PyObject *item;
8870
8871	assert(it != NULL);
8872	seq = it->it_seq;
8873	if (seq == NULL)
8874		return NULL;
8875	assert(PyUnicode_Check(seq));
8876
8877	if (it->it_index < PyUnicode_GET_SIZE(seq)) {
8878		item = PyUnicode_FromUnicode(
8879                    PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
8880		if (item != NULL)
8881			++it->it_index;
8882		return item;
8883	}
8884
8885	Py_DECREF(seq);
8886	it->it_seq = NULL;
8887	return NULL;
8888}
8889
8890static PyObject *
8891unicodeiter_len(unicodeiterobject *it)
8892{
8893	Py_ssize_t len = 0;
8894	if (it->it_seq)
8895		len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
8896	return PyInt_FromSsize_t(len);
8897}
8898
8899PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
8900
8901static PyMethodDef unicodeiter_methods[] = {
8902	{"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
8903         length_hint_doc},
8904 	{NULL,		NULL}		/* sentinel */
8905};
8906
8907PyTypeObject PyUnicodeIter_Type = {
8908	PyObject_HEAD_INIT(&PyType_Type)
8909	0,					/* ob_size */
8910	"unicodeiterator",			/* tp_name */
8911	sizeof(unicodeiterobject),		/* tp_basicsize */
8912	0,					/* tp_itemsize */
8913	/* methods */
8914	(destructor)unicodeiter_dealloc,	/* tp_dealloc */
8915	0,					/* tp_print */
8916	0,					/* tp_getattr */
8917	0,					/* tp_setattr */
8918	0,					/* tp_compare */
8919	0,					/* tp_repr */
8920	0,					/* tp_as_number */
8921	0,					/* tp_as_sequence */
8922	0,					/* tp_as_mapping */
8923	0,					/* tp_hash */
8924	0,					/* tp_call */
8925	0,					/* tp_str */
8926	PyObject_GenericGetAttr,		/* tp_getattro */
8927	0,					/* tp_setattro */
8928	0,					/* tp_as_buffer */
8929	Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
8930	0,					/* tp_doc */
8931	(traverseproc)unicodeiter_traverse,	/* tp_traverse */
8932	0,					/* tp_clear */
8933	0,					/* tp_richcompare */
8934	0,					/* tp_weaklistoffset */
8935	PyObject_SelfIter,			/* tp_iter */
8936	(iternextfunc)unicodeiter_next,		/* tp_iternext */
8937	unicodeiter_methods,			/* tp_methods */
8938	0,
8939};
8940
8941static PyObject *
8942unicode_iter(PyObject *seq)
8943{
8944	unicodeiterobject *it;
8945
8946	if (!PyUnicode_Check(seq)) {
8947		PyErr_BadInternalCall();
8948		return NULL;
8949	}
8950	it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
8951	if (it == NULL)
8952		return NULL;
8953	it->it_index = 0;
8954	Py_INCREF(seq);
8955	it->it_seq = (PyUnicodeObject *)seq;
8956	_PyObject_GC_TRACK(it);
8957	return (PyObject *)it;
8958}
8959
8960size_t
8961Py_UNICODE_strlen(const Py_UNICODE *u)
8962{
8963    int res = 0;
8964    while(*u++)
8965        res++;
8966    return res;
8967}
8968
8969Py_UNICODE*
8970Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
8971{
8972    Py_UNICODE *u = s1;
8973    while ((*u++ = *s2++));
8974    return s1;
8975}
8976
8977Py_UNICODE*
8978Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
8979{
8980    Py_UNICODE *u = s1;
8981    while ((*u++ = *s2++))
8982        if (n-- == 0)
8983            break;
8984    return s1;
8985}
8986
8987int
8988Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
8989{
8990    while (*s1 && *s2 && *s1 == *s2)
8991        s1++, s2++;
8992    if (*s1 && *s2)
8993        return (*s1 < *s2) ? -1 : +1;
8994    if (*s1)
8995        return 1;
8996    if (*s2)
8997        return -1;
8998    return 0;
8999}
9000
9001Py_UNICODE*
9002Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9003{
9004    const Py_UNICODE *p;
9005    for (p = s; *p; p++)
9006        if (*p == c)
9007            return (Py_UNICODE*)p;
9008    return NULL;
9009}
9010
9011
9012#ifdef __cplusplus
9013}
9014#endif
9015
9016
9017/*
9018Local variables:
9019c-basic-offset: 4
9020indent-tabs-mode: nil
9021End:
9022*/
9023