unicodeobject.c revision 6a27efa2d321c2b262c0cab3c2d4af3e2e8a9ead
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5Unicode Integration Proposal (see file Misc/unicode.txt).
6
7Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
10Copyright (c) Corporation for National Research Initiatives.
11
12--------------------------------------------------------------------
13The original string type implementation is:
14
15    Copyright (c) 1999 by Secret Labs AB
16    Copyright (c) 1999 by Fredrik Lundh
17
18By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
41
42#define PY_SSIZE_T_CLEAN
43#include "Python.h"
44#include "bytes_methods.h"
45
46#include "unicodeobject.h"
47#include "ucnhash.h"
48
49#ifdef MS_WINDOWS
50#include <windows.h>
51#endif
52
53/* Limit for the Unicode object free list */
54
55#define PyUnicode_MAXFREELIST       1024
56
57/* Limit for the Unicode object free list stay alive optimization.
58
59   The implementation will keep allocated Unicode memory intact for
60   all objects on the free list having a size less than this
61   limit. This reduces malloc() overhead for small Unicode objects.
62
63   At worst this will result in PyUnicode_MAXFREELIST *
64   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
65   malloc()-overhead) bytes of unused garbage.
66
67   Setting the limit to 0 effectively turns the feature off.
68
69   Note: This is an experimental feature ! If you get core dumps when
70   using Unicode objects, turn this feature off.
71
72*/
73
74#define KEEPALIVE_SIZE_LIMIT       9
75
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
84/* --- Globals ------------------------------------------------------------
85
86   The globals are initialized by the _PyUnicode_Init() API and should
87   not be used before calling that API.
88
89*/
90
91
92#ifdef __cplusplus
93extern "C" {
94#endif
95
96/* This dictionary holds all interned unicode strings.  Note that references
97   to strings in this dictionary are *not* counted in the string's ob_refcnt.
98   When the interned string reaches a refcnt of 0 the string deallocation
99   function will delete the reference from this dictionary.
100
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
103*/
104static PyObject *interned;
105
106/* Free list for Unicode objects */
107static PyUnicodeObject *free_list;
108static int numfree;
109
110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114   shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
117/* Default encoding to use and assume when NULL is passed as encoding
118   parameter; it is fixed to "utf-8".  Always use the
119   PyUnicode_GetDefaultEncoding() API to access this global.
120
121   Don't forget to alter Py_FileSystemDefaultEncoding if you change the
122   hard coded default!
123*/
124static const char unicode_default_encoding[] = "utf-8";
125
/* Lookup table for the most frequent whitespace characters.

   Indexed by an ASCII code point (0x00-0x7F); a non-zero entry means the
   character is treated as whitespace. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: HORIZONTAL TABULATION (0x09), LINE FEED (0x0A),
       VERTICAL TABULATION (0x0B), FORM FEED (0x0C),
       CARRIAGE RETURN (0x0D) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: FILE SEPARATOR (0x1C), GROUP SEPARATOR (0x1D),
       RECORD SEPARATOR (0x1E), UNIT SEPARATOR (0x1F) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: SPACE (0x20) */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x7F: no whitespace characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
156
/* Lookup table for ASCII line break characters.

   Indexed by an ASCII code point (0x00-0x7F); a non-zero entry means the
   character terminates a line. */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: LINE FEED (0x0A), CARRIAGE RETURN (0x0D) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10-0x1F: FILE SEPARATOR (0x1C), GROUP SEPARATOR (0x1D),
       RECORD SEPARATOR (0x1E) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x7F: no line break characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
182
183
184Py_UNICODE
185PyUnicode_GetMax(void)
186{
187#ifdef Py_UNICODE_WIDE
188	return 0x10FFFF;
189#else
190	/* This is actually an illegal character, so it should
191	   not be passed to unichr. */
192	return 0xFFFF;
193#endif
194}
195
196/* --- Bloom Filters ----------------------------------------------------- */
197
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask and take the 5
   least-significant bits of each Unicode character as the bit index. */
201
202/* the linebreak mask is set up by Unicode_Init below */
203
/* Integer type holding a bloom bitmask; only bits 0..31 are used. */
#define BLOOM_MASK unsigned long

/* Bloom mask covering the line break characters. */
static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the low 5 bits of ch is set in mask.
   The shifted constant has type BLOOM_MASK: shifting a plain int 1 by 31
   overflows a signed int and sign-extends into the upper mask bits. */
#define BLOOM(mask, ch) ((mask) & ((BLOOM_MASK)1 << ((ch) & 0x1F)))

/* Non-zero if ch is a line break.  ASCII characters are answered from the
   ascii_linebreak table; other characters are first screened with the
   bloom mask and then confirmed with Py_UNICODE_ISLINEBREAK(). */
#define BLOOM_LINEBREAK(ch) \
    ((ch) < 128U ? ascii_linebreak[(ch)] : \
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
213
214Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
215{
216    /* calculate simple bloom-style bitmask for a given unicode string */
217
218    long mask;
219    Py_ssize_t i;
220
221    mask = 0;
222    for (i = 0; i < len; i++)
223        mask |= (1 << (ptr[i] & 0x1F));
224
225    return mask;
226}
227
228Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
229{
230    Py_ssize_t i;
231
232    for (i = 0; i < setlen; i++)
233        if (set[i] == chr)
234            return 1;
235
236    return 0;
237}
238
/* True if chr is a member of set: a cheap bloom-mask test first, then an
   exact search.  The expansion is fully parenthesized so the macro can be
   negated or combined with other operators safely. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
241
242/* --- Unicode Object ----------------------------------------------------- */
243
244static
245int unicode_resize(register PyUnicodeObject *unicode,
246                      Py_ssize_t length)
247{
248    void *oldstr;
249
250    /* Shortcut if there's nothing much to do. */
251    if (unicode->length == length)
252	goto reset;
253
254    /* Resizing shared object (unicode_empty or single character
255       objects) in-place is not allowed. Use PyUnicode_Resize()
256       instead ! */
257
258    if (unicode == unicode_empty ||
259	(unicode->length == 1 &&
260	 unicode->str[0] < 256U &&
261	 unicode_latin1[unicode->str[0]] == unicode)) {
262        PyErr_SetString(PyExc_SystemError,
263                        "can't resize shared str objects");
264        return -1;
265    }
266
267    /* We allocate one more byte to make sure the string is Ux0000 terminated.
268       The overallocation is also used by fastsearch, which assumes that it's
269       safe to look at str[length] (without making any assumptions about what
270       it contains). */
271
272    oldstr = unicode->str;
273    unicode->str = PyObject_REALLOC(unicode->str,
274				    sizeof(Py_UNICODE) * (length + 1));
275    if (!unicode->str) {
276	unicode->str = (Py_UNICODE *)oldstr;
277        PyErr_NoMemory();
278        return -1;
279    }
280    unicode->str[length] = 0;
281    unicode->length = length;
282
283 reset:
284    /* Reset the object caches */
285    if (unicode->defenc) {
286        Py_DECREF(unicode->defenc);
287        unicode->defenc = NULL;
288    }
289    unicode->hash = -1;
290
291    return 0;
292}
293
/* We allocate one more Py_UNICODE element to make sure the string is
   Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.
297
298   XXX This allocator could further be enhanced by assuring that the
299       free list never reduces its size below 1.
300
301*/
302
303static
304PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
305{
306    register PyUnicodeObject *unicode;
307
308    /* Optimization for empty strings */
309    if (length == 0 && unicode_empty != NULL) {
310        Py_INCREF(unicode_empty);
311        return unicode_empty;
312    }
313
314    /* Ensure we won't overflow the size. */
315    if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
316        return (PyUnicodeObject *)PyErr_NoMemory();
317    }
318
319    /* Unicode freelist & memory allocation */
320    if (free_list) {
321        unicode = free_list;
322        free_list = *(PyUnicodeObject **)unicode;
323        numfree--;
324	if (unicode->str) {
325	    /* Keep-Alive optimization: we only upsize the buffer,
326	       never downsize it. */
327	    if ((unicode->length < length) &&
328                unicode_resize(unicode, length) < 0) {
329		PyObject_DEL(unicode->str);
330		unicode->str = NULL;
331	    }
332	}
333        else {
334	    size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
335	    unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
336        }
337        PyObject_INIT(unicode, &PyUnicode_Type);
338    }
339    else {
340	size_t new_size;
341        unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
342        if (unicode == NULL)
343            return NULL;
344	new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
345	unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
346    }
347
348    if (!unicode->str) {
349	PyErr_NoMemory();
350	goto onError;
351    }
352    /* Initialize the first element to guard against cases where
353     * the caller fails before initializing str -- unicode_resize()
354     * reads str[0], and the Keep-Alive optimization can keep memory
355     * allocated for str alive across a call to unicode_dealloc(unicode).
356     * We don't want unicode_resize to read uninitialized memory in
357     * that case.
358     */
359    unicode->str[0] = 0;
360    unicode->str[length] = 0;
361    unicode->length = length;
362    unicode->hash = -1;
363    unicode->state = 0;
364    unicode->defenc = NULL;
365    return unicode;
366
367 onError:
368    /* XXX UNREF/NEWREF interface should be more symmetrical */
369    _Py_DEC_REFTOTAL;
370    _Py_ForgetReference((PyObject *)unicode);
371    PyObject_Del(unicode);
372    return NULL;
373}
374
375static
376void unicode_dealloc(register PyUnicodeObject *unicode)
377{
378    switch (PyUnicode_CHECK_INTERNED(unicode)) {
379        case SSTATE_NOT_INTERNED:
380            break;
381
382        case SSTATE_INTERNED_MORTAL:
383            /* revive dead object temporarily for DelItem */
384            Py_REFCNT(unicode) = 3;
385            if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
386                Py_FatalError(
387                    "deletion of interned string failed");
388            break;
389
390        case SSTATE_INTERNED_IMMORTAL:
391            Py_FatalError("Immortal interned string died.");
392
393        default:
394            Py_FatalError("Inconsistent interned string state.");
395    }
396
397    if (PyUnicode_CheckExact(unicode) &&
398	numfree < PyUnicode_MAXFREELIST) {
399        /* Keep-Alive optimization */
400	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
401	    PyObject_DEL(unicode->str);
402	    unicode->str = NULL;
403	    unicode->length = 0;
404	}
405	if (unicode->defenc) {
406	    Py_DECREF(unicode->defenc);
407	    unicode->defenc = NULL;
408	}
409	/* Add to free list */
410        *(PyUnicodeObject **)unicode = free_list;
411        free_list = unicode;
412        numfree++;
413    }
414    else {
415	PyObject_DEL(unicode->str);
416	Py_XDECREF(unicode->defenc);
417	Py_TYPE(unicode)->tp_free((PyObject *)unicode);
418    }
419}
420
421int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
422{
423    register PyUnicodeObject *v;
424
425    /* Argument checks */
426    if (unicode == NULL) {
427	PyErr_BadInternalCall();
428	return -1;
429    }
430    v = (PyUnicodeObject *)*unicode;
431    if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
432	PyErr_BadInternalCall();
433	return -1;
434    }
435
436    /* Resizing unicode_empty and single character objects is not
437       possible since these are being shared. We simply return a fresh
438       copy with the same Unicode content. */
439    if (v->length != length &&
440	(v == unicode_empty || v->length == 1)) {
441	PyUnicodeObject *w = _PyUnicode_New(length);
442	if (w == NULL)
443	    return -1;
444	Py_UNICODE_COPY(w->str, v->str,
445			length < v->length ? length : v->length);
446	Py_DECREF(*unicode);
447	*unicode = (PyObject *)w;
448	return 0;
449    }
450
451    /* Note that we don't have to modify *unicode for unshared Unicode
452       objects, since we can modify them in-place. */
453    return unicode_resize(v, length);
454}
455
456/* Internal API for use in unicodeobject.c only ! */
457#define _PyUnicode_Resize(unicodevar, length) \
458        PyUnicode_Resize(((PyObject **)(unicodevar)), length)
459
460PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
461				Py_ssize_t size)
462{
463    PyUnicodeObject *unicode;
464
465    /* If the Unicode data is known at construction time, we can apply
466       some optimizations which share commonly used objects. */
467    if (u != NULL) {
468
469	/* Optimization for empty strings */
470	if (size == 0 && unicode_empty != NULL) {
471	    Py_INCREF(unicode_empty);
472	    return (PyObject *)unicode_empty;
473	}
474
475	/* Single character Unicode objects in the Latin-1 range are
476	   shared when using this constructor */
477	if (size == 1 && *u < 256) {
478	    unicode = unicode_latin1[*u];
479	    if (!unicode) {
480		unicode = _PyUnicode_New(1);
481		if (!unicode)
482		    return NULL;
483		unicode->str[0] = *u;
484		unicode_latin1[*u] = unicode;
485	    }
486	    Py_INCREF(unicode);
487	    return (PyObject *)unicode;
488	}
489    }
490
491    unicode = _PyUnicode_New(size);
492    if (!unicode)
493        return NULL;
494
495    /* Copy the Unicode data into the new object */
496    if (u != NULL)
497	Py_UNICODE_COPY(unicode->str, u, size);
498
499    return (PyObject *)unicode;
500}
501
502PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
503{
504    PyUnicodeObject *unicode;
505
506	if (size < 0) {
507		PyErr_SetString(PyExc_SystemError,
508		    "Negative size passed to PyUnicode_FromStringAndSize");
509		return NULL;
510	}
511
512    /* If the Unicode data is known at construction time, we can apply
513       some optimizations which share commonly used objects.
514       Also, this means the input must be UTF-8, so fall back to the
515       UTF-8 decoder at the end. */
516    if (u != NULL) {
517
518	/* Optimization for empty strings */
519	if (size == 0 && unicode_empty != NULL) {
520	    Py_INCREF(unicode_empty);
521	    return (PyObject *)unicode_empty;
522	}
523
524	/* Single characters are shared when using this constructor.
525           Restrict to ASCII, since the input must be UTF-8. */
526	if (size == 1 && Py_CHARMASK(*u) < 128) {
527	    unicode = unicode_latin1[Py_CHARMASK(*u)];
528	    if (!unicode) {
529		unicode = _PyUnicode_New(1);
530		if (!unicode)
531		    return NULL;
532		unicode->str[0] = Py_CHARMASK(*u);
533		unicode_latin1[Py_CHARMASK(*u)] = unicode;
534	    }
535	    Py_INCREF(unicode);
536	    return (PyObject *)unicode;
537	}
538
539        return PyUnicode_DecodeUTF8(u, size, NULL);
540    }
541
542    unicode = _PyUnicode_New(size);
543    if (!unicode)
544        return NULL;
545
546    return (PyObject *)unicode;
547}
548
549PyObject *PyUnicode_FromString(const char *u)
550{
551    size_t size = strlen(u);
552    if (size > PY_SSIZE_T_MAX) {
553        PyErr_SetString(PyExc_OverflowError, "input too long");
554        return NULL;
555    }
556
557    return PyUnicode_FromStringAndSize(u, size);
558}
559
560#ifdef HAVE_WCHAR_H
561
562PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
563				 Py_ssize_t size)
564{
565    PyUnicodeObject *unicode;
566
567    if (w == NULL) {
568        if (size == 0)
569            return PyUnicode_FromStringAndSize(NULL, 0);
570	PyErr_BadInternalCall();
571	return NULL;
572    }
573
574    if (size == -1) {
575        size = wcslen(w);
576    }
577
578    unicode = _PyUnicode_New(size);
579    if (!unicode)
580        return NULL;
581
582    /* Copy the wchar_t data into the new object */
583#ifdef HAVE_USABLE_WCHAR_T
584    memcpy(unicode->str, w, size * sizeof(wchar_t));
585#else
586    {
587	register Py_UNICODE *u;
588	register Py_ssize_t i;
589	u = PyUnicode_AS_UNICODE(unicode);
590	for (i = size; i > 0; i--)
591	    *u++ = *w++;
592    }
593#endif
594
595    return (PyObject *)unicode;
596}
597
598static void
599makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
600{
601	*fmt++ = '%';
602	if (width) {
603		if (zeropad)
604			*fmt++ = '0';
605		fmt += sprintf(fmt, "%d", width);
606	}
607	if (precision)
608		fmt += sprintf(fmt, ".%d", precision);
609	if (longflag)
610		*fmt++ = 'l';
611	else if (size_tflag) {
612		char *f = PY_FORMAT_SIZE_T;
613		while (*f)
614			*fmt++ = *f++;
615	}
616	*fmt++ = c;
617	*fmt = '\0';
618}
619
620#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
621
622PyObject *
623PyUnicode_FromFormatV(const char *format, va_list vargs)
624{
625	va_list count;
626	Py_ssize_t callcount = 0;
627	PyObject **callresults = NULL;
628	PyObject **callresult = NULL;
629	Py_ssize_t n = 0;
630	int width = 0;
631	int precision = 0;
632	int zeropad;
633	const char* f;
634	Py_UNICODE *s;
635	PyObject *string;
636	/* used by sprintf */
637	char buffer[21];
638	/* use abuffer instead of buffer, if we need more space
639	 * (which can happen if there's a format specifier with width). */
640	char *abuffer = NULL;
641	char *realbuffer;
642	Py_ssize_t abuffersize = 0;
643	char fmt[60]; /* should be enough for %0width.precisionld */
644	const char *copy;
645
646#ifdef VA_LIST_IS_ARRAY
647	Py_MEMCPY(count, vargs, sizeof(va_list));
648#else
649#ifdef  __va_copy
650	__va_copy(count, vargs);
651#else
652	count = vargs;
653#endif
654#endif
655	/* step 1: count the number of %S/%R/%A format specifications
656	 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII() for
657	 * these objects once during step 3 and put the result in
658	   an array) */
659	for (f = format; *f; f++) {
660		if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A'))
661			++callcount;
662	}
663	/* step 2: allocate memory for the results of
664	 * PyObject_Str()/PyObject_Repr() calls */
665	if (callcount) {
666		callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
667		if (!callresults) {
668			PyErr_NoMemory();
669			return NULL;
670		}
671		callresult = callresults;
672	}
673	/* step 3: figure out how large a buffer we need */
674	for (f = format; *f; f++) {
675		if (*f == '%') {
676			const char* p = f;
677			width = 0;
678			while (ISDIGIT((unsigned)*f))
679				width = (width*10) + *f++ - '0';
680			while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
681				;
682
683			/* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
684			 * they don't affect the amount of space we reserve.
685			 */
686			if ((*f == 'l' || *f == 'z') &&
687					(f[1] == 'd' || f[1] == 'u'))
688                                ++f;
689
690			switch (*f) {
691			case 'c':
692				(void)va_arg(count, int);
693				/* fall through... */
694			case '%':
695				n++;
696				break;
697			case 'd': case 'u': case 'i': case 'x':
698				(void) va_arg(count, int);
699				/* 20 bytes is enough to hold a 64-bit
700				   integer.  Decimal takes the most space.
701				   This isn't enough for octal.
702				   If a width is specified we need more
703				   (which we allocate later). */
704				if (width < 20)
705					width = 20;
706				n += width;
707				if (abuffersize < width)
708					abuffersize = width;
709				break;
710			case 's':
711			{
712				/* UTF-8 */
713				unsigned char*s;
714				s = va_arg(count, unsigned char*);
715				while (*s) {
716					if (*s < 128) {
717						n++; s++;
718					} else if (*s < 0xc0) {
719						/* invalid UTF-8 */
720						n++; s++;
721					} else if (*s < 0xc0) {
722						n++;
723						s++; if(!*s)break;
724						s++;
725					} else if (*s < 0xe0) {
726						n++;
727						s++; if(!*s)break;
728						s++; if(!*s)break;
729						s++;
730					} else {
731						#ifdef Py_UNICODE_WIDE
732						n++;
733						#else
734						n+=2;
735						#endif
736						s++; if(!*s)break;
737						s++; if(!*s)break;
738						s++; if(!*s)break;
739						s++;
740					}
741				}
742				break;
743			}
744			case 'U':
745			{
746				PyObject *obj = va_arg(count, PyObject *);
747				assert(obj && PyUnicode_Check(obj));
748				n += PyUnicode_GET_SIZE(obj);
749				break;
750			}
751			case 'V':
752			{
753				PyObject *obj = va_arg(count, PyObject *);
754				const char *str = va_arg(count, const char *);
755				assert(obj || str);
756				assert(!obj || PyUnicode_Check(obj));
757				if (obj)
758					n += PyUnicode_GET_SIZE(obj);
759				else
760					n += strlen(str);
761				break;
762			}
763			case 'S':
764			{
765				PyObject *obj = va_arg(count, PyObject *);
766				PyObject *str;
767				assert(obj);
768				str = PyObject_Str(obj);
769				if (!str)
770					goto fail;
771				n += PyUnicode_GET_SIZE(str);
772				/* Remember the str and switch to the next slot */
773				*callresult++ = str;
774				break;
775			}
776			case 'R':
777			{
778				PyObject *obj = va_arg(count, PyObject *);
779				PyObject *repr;
780				assert(obj);
781				repr = PyObject_Repr(obj);
782				if (!repr)
783					goto fail;
784				n += PyUnicode_GET_SIZE(repr);
785				/* Remember the repr and switch to the next slot */
786				*callresult++ = repr;
787				break;
788			}
789			case 'A':
790			{
791				PyObject *obj = va_arg(count, PyObject *);
792				PyObject *ascii;
793				assert(obj);
794				ascii = PyObject_ASCII(obj);
795				if (!ascii)
796					goto fail;
797				n += PyUnicode_GET_SIZE(ascii);
798				/* Remember the repr and switch to the next slot */
799				*callresult++ = ascii;
800				break;
801			}
802			case 'p':
803				(void) va_arg(count, int);
804				/* maximum 64-bit pointer representation:
805				 * 0xffffffffffffffff
806				 * so 19 characters is enough.
807				 * XXX I count 18 -- what's the extra for?
808				 */
809				n += 19;
810				break;
811			default:
812				/* if we stumble upon an unknown
813				   formatting code, copy the rest of
814				   the format string to the output
815				   string. (we cannot just skip the
816				   code, since there's no way to know
817				   what's in the argument list) */
818				n += strlen(p);
819				goto expand;
820			}
821		} else
822			n++;
823	}
824 expand:
825	if (abuffersize > 20) {
826		abuffer = PyObject_Malloc(abuffersize);
827		if (!abuffer) {
828			PyErr_NoMemory();
829			goto fail;
830		}
831		realbuffer = abuffer;
832	}
833	else
834		realbuffer = buffer;
835	/* step 4: fill the buffer */
836	/* Since we've analyzed how much space we need for the worst case,
837	   we don't have to resize the string.
838	   There can be no errors beyond this point. */
839	string = PyUnicode_FromUnicode(NULL, n);
840	if (!string)
841		goto fail;
842
843	s = PyUnicode_AS_UNICODE(string);
844	callresult = callresults;
845
846	for (f = format; *f; f++) {
847		if (*f == '%') {
848			const char* p = f++;
849			int longflag = 0;
850			int size_tflag = 0;
851			zeropad = (*f == '0');
852			/* parse the width.precision part */
853			width = 0;
854			while (ISDIGIT((unsigned)*f))
855				width = (width*10) + *f++ - '0';
856			precision = 0;
857			if (*f == '.') {
858				f++;
859				while (ISDIGIT((unsigned)*f))
860					precision = (precision*10) + *f++ - '0';
861			}
862			/* handle the long flag, but only for %ld and %lu.
863			   others can be added when necessary. */
864			if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
865				longflag = 1;
866				++f;
867			}
868			/* handle the size_t flag. */
869			if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
870				size_tflag = 1;
871				++f;
872			}
873
874			switch (*f) {
875			case 'c':
876				*s++ = va_arg(vargs, int);
877				break;
878			case 'd':
879				makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
880				if (longflag)
881					sprintf(realbuffer, fmt, va_arg(vargs, long));
882				else if (size_tflag)
883					sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
884				else
885					sprintf(realbuffer, fmt, va_arg(vargs, int));
886				appendstring(realbuffer);
887				break;
888			case 'u':
889				makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
890				if (longflag)
891					sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
892				else if (size_tflag)
893					sprintf(realbuffer, fmt, va_arg(vargs, size_t));
894				else
895					sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
896				appendstring(realbuffer);
897				break;
898			case 'i':
899				makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
900				sprintf(realbuffer, fmt, va_arg(vargs, int));
901				appendstring(realbuffer);
902				break;
903			case 'x':
904				makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
905				sprintf(realbuffer, fmt, va_arg(vargs, int));
906				appendstring(realbuffer);
907				break;
908			case 's':
909			{
910				/* Parameter must be UTF-8 encoded.
911				   In case of encoding errors, use
912				   the replacement character. */
913				PyObject *u;
914				p = va_arg(vargs, char*);
915				u = PyUnicode_DecodeUTF8(p, strlen(p),
916							 "replace");
917				if (!u)
918					goto fail;
919				Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
920						PyUnicode_GET_SIZE(u));
921				s += PyUnicode_GET_SIZE(u);
922				Py_DECREF(u);
923				break;
924			}
925			case 'U':
926			{
927				PyObject *obj = va_arg(vargs, PyObject *);
928				Py_ssize_t size = PyUnicode_GET_SIZE(obj);
929				Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
930				s += size;
931				break;
932			}
933			case 'V':
934			{
935				PyObject *obj = va_arg(vargs, PyObject *);
936				const char *str = va_arg(vargs, const char *);
937				if (obj) {
938					Py_ssize_t size = PyUnicode_GET_SIZE(obj);
939					Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
940					s += size;
941				} else {
942					appendstring(str);
943				}
944				break;
945			}
946			case 'S':
947			case 'R':
948			{
949				Py_UNICODE *ucopy;
950				Py_ssize_t usize;
951				Py_ssize_t upos;
952				/* unused, since we already have the result */
953				(void) va_arg(vargs, PyObject *);
954				ucopy = PyUnicode_AS_UNICODE(*callresult);
955				usize = PyUnicode_GET_SIZE(*callresult);
956				for (upos = 0; upos<usize;)
957					*s++ = ucopy[upos++];
958				/* We're done with the unicode()/repr() => forget it */
959				Py_DECREF(*callresult);
960				/* switch to next unicode()/repr() result */
961				++callresult;
962				break;
963			}
964			case 'p':
965				sprintf(buffer, "%p", va_arg(vargs, void*));
966				/* %p is ill-defined:  ensure leading 0x. */
967				if (buffer[1] == 'X')
968					buffer[1] = 'x';
969				else if (buffer[1] != 'x') {
970					memmove(buffer+2, buffer, strlen(buffer)+1);
971					buffer[0] = '0';
972					buffer[1] = 'x';
973				}
974				appendstring(buffer);
975				break;
976			case '%':
977				*s++ = '%';
978				break;
979			default:
980				appendstring(p);
981				goto end;
982			}
983		} else
984			*s++ = *f;
985	}
986
987 end:
988	if (callresults)
989		PyObject_Free(callresults);
990	if (abuffer)
991		PyObject_Free(abuffer);
992	_PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
993	return string;
994 fail:
995	if (callresults) {
996		PyObject **callresult2 = callresults;
997		while (callresult2 < callresult) {
998			Py_DECREF(*callresult2);
999			++callresult2;
1000		}
1001		PyObject_Free(callresults);
1002	}
1003	if (abuffer)
1004		PyObject_Free(abuffer);
1005	return NULL;
1006}
1007
1008#undef appendstring
1009
1010PyObject *
1011PyUnicode_FromFormat(const char *format, ...)
1012{
1013	PyObject* ret;
1014	va_list vargs;
1015
1016#ifdef HAVE_STDARG_PROTOTYPES
1017	va_start(vargs, format);
1018#else
1019	va_start(vargs);
1020#endif
1021	ret = PyUnicode_FromFormatV(format, vargs);
1022	va_end(vargs);
1023	return ret;
1024}
1025
1026Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1027				wchar_t *w,
1028				Py_ssize_t size)
1029{
1030    if (unicode == NULL) {
1031	PyErr_BadInternalCall();
1032	return -1;
1033    }
1034
1035    /* If possible, try to copy the 0-termination as well */
1036    if (size > PyUnicode_GET_SIZE(unicode))
1037	size = PyUnicode_GET_SIZE(unicode) + 1;
1038
1039#ifdef HAVE_USABLE_WCHAR_T
1040    memcpy(w, unicode->str, size * sizeof(wchar_t));
1041#else
1042    {
1043	register Py_UNICODE *u;
1044	register Py_ssize_t i;
1045	u = PyUnicode_AS_UNICODE(unicode);
1046	for (i = size; i > 0; i--)
1047	    *w++ = *u++;
1048    }
1049#endif
1050
1051    if (size > PyUnicode_GET_SIZE(unicode))
1052        return PyUnicode_GET_SIZE(unicode);
1053    else
1054    return size;
1055}
1056
1057#endif
1058
1059PyObject *PyUnicode_FromOrdinal(int ordinal)
1060{
1061    Py_UNICODE s[2];
1062
1063    if (ordinal < 0 || ordinal > 0x10ffff) {
1064	PyErr_SetString(PyExc_ValueError,
1065			"chr() arg not in range(0x110000)");
1066	return NULL;
1067    }
1068
1069#ifndef Py_UNICODE_WIDE
1070    if (ordinal > 0xffff) {
1071        ordinal -= 0x10000;
1072        s[0] = 0xD800 | (ordinal >> 10);
1073        s[1] = 0xDC00 | (ordinal & 0x3FF);
1074        return PyUnicode_FromUnicode(s, 2);
1075    }
1076#endif
1077
1078    s[0] = (Py_UNICODE)ordinal;
1079    return PyUnicode_FromUnicode(s, 1);
1080}
1081
1082PyObject *PyUnicode_FromObject(register PyObject *obj)
1083{
1084    /* XXX Perhaps we should make this API an alias of
1085           PyObject_Str() instead ?! */
1086    if (PyUnicode_CheckExact(obj)) {
1087	Py_INCREF(obj);
1088	return obj;
1089    }
1090    if (PyUnicode_Check(obj)) {
1091	/* For a Unicode subtype that's not a Unicode object,
1092	   return a true Unicode object with the same data. */
1093	return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1094				     PyUnicode_GET_SIZE(obj));
1095    }
1096    PyErr_Format(PyExc_TypeError,
1097                 "Can't convert '%.100s' object to str implicitly",
1098                 Py_TYPE(obj)->tp_name);
1099    return NULL;
1100}
1101
1102PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1103				      const char *encoding,
1104				      const char *errors)
1105{
1106    const char *s = NULL;
1107    Py_ssize_t len;
1108    PyObject *v;
1109
1110    if (obj == NULL) {
1111	PyErr_BadInternalCall();
1112	return NULL;
1113    }
1114
1115    if (PyUnicode_Check(obj)) {
1116	PyErr_SetString(PyExc_TypeError,
1117			"decoding str is not supported");
1118	return NULL;
1119	}
1120
1121    /* Coerce object */
1122    if (PyBytes_Check(obj)) {
1123        s = PyBytes_AS_STRING(obj);
1124        len = PyBytes_GET_SIZE(obj);
1125    }
1126    else if (PyByteArray_Check(obj)) {
1127        s = PyByteArray_AS_STRING(obj);
1128        len = PyByteArray_GET_SIZE(obj);
1129    }
1130    else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1131	/* Overwrite the error message with something more useful in
1132	   case of a TypeError. */
1133	if (PyErr_ExceptionMatches(PyExc_TypeError))
1134            PyErr_Format(PyExc_TypeError,
1135			 "coercing to str: need string or buffer, "
1136			 "%.80s found",
1137		     Py_TYPE(obj)->tp_name);
1138	goto onError;
1139    }
1140
1141    /* Convert to Unicode */
1142    if (len == 0) {
1143	Py_INCREF(unicode_empty);
1144	v = (PyObject *)unicode_empty;
1145    }
1146    else
1147	v = PyUnicode_Decode(s, len, encoding, errors);
1148
1149    return v;
1150
1151 onError:
1152    return NULL;
1153}
1154
1155PyObject *PyUnicode_Decode(const char *s,
1156			   Py_ssize_t size,
1157			   const char *encoding,
1158			   const char *errors)
1159{
1160    PyObject *buffer = NULL, *unicode;
1161    Py_buffer info;
1162    char lower[20];  /* Enough for any encoding name we recognize */
1163    char *l;
1164    const char *e;
1165
1166    if (encoding == NULL)
1167        encoding = PyUnicode_GetDefaultEncoding();
1168
1169    /* Convert encoding to lower case and replace '_' with '-' in order to
1170       catch e.g. UTF_8 */
1171    e = encoding;
1172    l = lower;
1173    while (*e && l < &lower[(sizeof lower) - 2]) {
1174        if (ISUPPER(*e)) {
1175            *l++ = TOLOWER(*e++);
1176        }
1177        else if (*e == '_') {
1178            *l++ = '-';
1179            e++;
1180        }
1181        else {
1182            *l++ = *e++;
1183        }
1184    }
1185    *l = '\0';
1186
1187    /* Shortcuts for common default encodings */
1188    if (strcmp(lower, "utf-8") == 0)
1189        return PyUnicode_DecodeUTF8(s, size, errors);
1190    else if ((strcmp(lower, "latin-1") == 0) ||
1191             (strcmp(lower, "iso-8859-1") == 0))
1192        return PyUnicode_DecodeLatin1(s, size, errors);
1193#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1194    else if (strcmp(lower, "mbcs") == 0)
1195        return PyUnicode_DecodeMBCS(s, size, errors);
1196#endif
1197    else if (strcmp(lower, "ascii") == 0)
1198        return PyUnicode_DecodeASCII(s, size, errors);
1199    else if (strcmp(lower, "utf-16") == 0)
1200        return PyUnicode_DecodeUTF16(s, size, errors, 0);
1201    else if (strcmp(lower, "utf-32") == 0)
1202        return PyUnicode_DecodeUTF32(s, size, errors, 0);
1203
1204    /* Decode via the codec registry */
1205    buffer = NULL;
1206    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_SIMPLE) < 0)
1207        goto onError;
1208    buffer = PyMemoryView_FromBuffer(&info);
1209    if (buffer == NULL)
1210        goto onError;
1211    unicode = PyCodec_Decode(buffer, encoding, errors);
1212    if (unicode == NULL)
1213        goto onError;
1214    if (!PyUnicode_Check(unicode)) {
1215        PyErr_Format(PyExc_TypeError,
1216                     "decoder did not return a str object (type=%.400s)",
1217                     Py_TYPE(unicode)->tp_name);
1218        Py_DECREF(unicode);
1219        goto onError;
1220    }
1221    Py_DECREF(buffer);
1222    return unicode;
1223
1224 onError:
1225    Py_XDECREF(buffer);
1226    return NULL;
1227}
1228
1229PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1230                                    const char *encoding,
1231                                    const char *errors)
1232{
1233    PyObject *v;
1234
1235    if (!PyUnicode_Check(unicode)) {
1236        PyErr_BadArgument();
1237        goto onError;
1238    }
1239
1240    if (encoding == NULL)
1241	encoding = PyUnicode_GetDefaultEncoding();
1242
1243    /* Decode via the codec registry */
1244    v = PyCodec_Decode(unicode, encoding, errors);
1245    if (v == NULL)
1246        goto onError;
1247    return v;
1248
1249 onError:
1250    return NULL;
1251}
1252
1253PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1254                                     const char *encoding,
1255                                     const char *errors)
1256{
1257    PyObject *v;
1258
1259    if (!PyUnicode_Check(unicode)) {
1260        PyErr_BadArgument();
1261        goto onError;
1262    }
1263
1264    if (encoding == NULL)
1265	encoding = PyUnicode_GetDefaultEncoding();
1266
1267    /* Decode via the codec registry */
1268    v = PyCodec_Decode(unicode, encoding, errors);
1269    if (v == NULL)
1270        goto onError;
1271    if (!PyUnicode_Check(v)) {
1272        PyErr_Format(PyExc_TypeError,
1273                     "decoder did not return a str object (type=%.400s)",
1274                     Py_TYPE(v)->tp_name);
1275        Py_DECREF(v);
1276        goto onError;
1277    }
1278    return v;
1279
1280 onError:
1281    return NULL;
1282}
1283
1284PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1285			   Py_ssize_t size,
1286			   const char *encoding,
1287			   const char *errors)
1288{
1289    PyObject *v, *unicode;
1290
1291    unicode = PyUnicode_FromUnicode(s, size);
1292    if (unicode == NULL)
1293	return NULL;
1294    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1295    Py_DECREF(unicode);
1296    return v;
1297}
1298
1299PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1300                                    const char *encoding,
1301                                    const char *errors)
1302{
1303    PyObject *v;
1304
1305    if (!PyUnicode_Check(unicode)) {
1306        PyErr_BadArgument();
1307        goto onError;
1308    }
1309
1310    if (encoding == NULL)
1311	encoding = PyUnicode_GetDefaultEncoding();
1312
1313    /* Encode via the codec registry */
1314    v = PyCodec_Encode(unicode, encoding, errors);
1315    if (v == NULL)
1316        goto onError;
1317    return v;
1318
1319 onError:
1320    return NULL;
1321}
1322
1323PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1324                                    const char *encoding,
1325                                    const char *errors)
1326{
1327    PyObject *v;
1328
1329    if (!PyUnicode_Check(unicode)) {
1330        PyErr_BadArgument();
1331        return NULL;
1332    }
1333
1334    if (encoding == NULL)
1335	encoding = PyUnicode_GetDefaultEncoding();
1336
1337    /* Shortcuts for common default encodings */
1338    if (errors == NULL) {
1339	if (strcmp(encoding, "utf-8") == 0)
1340	    return PyUnicode_AsUTF8String(unicode);
1341	else if (strcmp(encoding, "latin-1") == 0)
1342	    return PyUnicode_AsLatin1String(unicode);
1343#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1344	else if (strcmp(encoding, "mbcs") == 0)
1345	    return PyUnicode_AsMBCSString(unicode);
1346#endif
1347	else if (strcmp(encoding, "ascii") == 0)
1348	    return PyUnicode_AsASCIIString(unicode);
1349        /* During bootstrap, we may need to find the encodings
1350           package, to load the file system encoding, and require the
1351           file system encoding in order to load the encodings
1352           package.
1353
1354           Break out of this dependency by assuming that the path to
1355           the encodings module is ASCII-only.  XXX could try wcstombs
1356           instead, if the file system encoding is the locale's
1357           encoding. */
1358        else if (Py_FileSystemDefaultEncoding &&
1359                 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1360                 !PyThreadState_GET()->interp->codecs_initialized)
1361	    return PyUnicode_AsASCIIString(unicode);
1362    }
1363
1364    /* Encode via the codec registry */
1365    v = PyCodec_Encode(unicode, encoding, errors);
1366    if (v == NULL)
1367        return NULL;
1368
1369    /* The normal path */
1370    if (PyBytes_Check(v))
1371        return v;
1372
1373    /* If the codec returns a buffer, raise a warning and convert to bytes */
1374    if (PyByteArray_Check(v)) {
1375        char msg[100];
1376        PyObject *b;
1377        PyOS_snprintf(msg, sizeof(msg),
1378                      "encoder %s returned buffer instead of bytes",
1379                      encoding);
1380        if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
1381            Py_DECREF(v);
1382            return NULL;
1383        }
1384
1385        b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1386        Py_DECREF(v);
1387        return b;
1388    }
1389
1390    PyErr_Format(PyExc_TypeError,
1391                 "encoder did not return a bytes object (type=%.400s)",
1392                 Py_TYPE(v)->tp_name);
1393    Py_DECREF(v);
1394    return NULL;
1395}
1396
1397PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1398                                     const char *encoding,
1399                                     const char *errors)
1400{
1401    PyObject *v;
1402
1403    if (!PyUnicode_Check(unicode)) {
1404        PyErr_BadArgument();
1405        goto onError;
1406    }
1407
1408    if (encoding == NULL)
1409	encoding = PyUnicode_GetDefaultEncoding();
1410
1411    /* Encode via the codec registry */
1412    v = PyCodec_Encode(unicode, encoding, errors);
1413    if (v == NULL)
1414        goto onError;
1415    if (!PyUnicode_Check(v)) {
1416        PyErr_Format(PyExc_TypeError,
1417                     "encoder did not return an str object (type=%.400s)",
1418                     Py_TYPE(v)->tp_name);
1419        Py_DECREF(v);
1420        goto onError;
1421    }
1422    return v;
1423
1424 onError:
1425    return NULL;
1426}
1427
1428PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1429					    const char *errors)
1430{
1431    PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1432    if (v)
1433        return v;
1434    if (errors != NULL)
1435        Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
1436    v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1437                             PyUnicode_GET_SIZE(unicode),
1438                             NULL);
1439    if (!v)
1440        return NULL;
1441    ((PyUnicodeObject *)unicode)->defenc = v;
1442    return v;
1443}
1444
1445PyObject*
1446PyUnicode_DecodeFSDefault(const char *s) {
1447    Py_ssize_t size = (Py_ssize_t)strlen(s);
1448    return PyUnicode_DecodeFSDefaultAndSize(s, size);
1449}
1450
1451PyObject*
1452PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1453{
1454    /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1455       can be undefined. If it is case, decode using UTF-8. The following assumes
1456       that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1457       bootstrapping process where the codecs aren't ready yet.
1458    */
1459    if (Py_FileSystemDefaultEncoding) {
1460#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1461        if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
1462            return PyUnicode_DecodeMBCS(s, size, "replace");
1463        }
1464#elif defined(__APPLE__)
1465        if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
1466            return PyUnicode_DecodeUTF8(s, size, "replace");
1467        }
1468#endif
1469        return PyUnicode_Decode(s, size,
1470                                Py_FileSystemDefaultEncoding,
1471                                "replace");
1472    }
1473    else {
1474        return PyUnicode_DecodeUTF8(s, size, "replace");
1475    }
1476}
1477
1478char*
1479_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
1480{
1481    PyObject *bytes;
1482    if (!PyUnicode_Check(unicode)) {
1483        PyErr_BadArgument();
1484        return NULL;
1485    }
1486    bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1487    if (bytes == NULL)
1488        return NULL;
1489    if (psize != NULL)
1490        *psize = PyBytes_GET_SIZE(bytes);
1491    return PyBytes_AS_STRING(bytes);
1492}
1493
1494char*
1495_PyUnicode_AsString(PyObject *unicode)
1496{
1497    return _PyUnicode_AsStringAndSize(unicode, NULL);
1498}
1499
1500Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1501{
1502    if (!PyUnicode_Check(unicode)) {
1503        PyErr_BadArgument();
1504        goto onError;
1505    }
1506    return PyUnicode_AS_UNICODE(unicode);
1507
1508 onError:
1509    return NULL;
1510}
1511
1512Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1513{
1514    if (!PyUnicode_Check(unicode)) {
1515        PyErr_BadArgument();
1516        goto onError;
1517    }
1518    return PyUnicode_GET_SIZE(unicode);
1519
1520 onError:
1521    return -1;
1522}
1523
1524const char *PyUnicode_GetDefaultEncoding(void)
1525{
1526    return unicode_default_encoding;
1527}
1528
1529int PyUnicode_SetDefaultEncoding(const char *encoding)
1530{
1531    if (strcmp(encoding, unicode_default_encoding) != 0) {
1532        PyErr_Format(PyExc_ValueError,
1533                     "Can only set default encoding to %s",
1534                     unicode_default_encoding);
1535        return -1;
1536    }
1537    return 0;
1538}
1539
1540/* error handling callback helper:
1541   build arguments, call the callback and check the arguments,
1542   if no exception occurred, copy the replacement to the output
1543   and adjust various state variables.
1544   return 0 on success, -1 on error
1545*/
1546
1547static
1548int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1549                 const char *encoding, const char *reason,
1550                 const char **input, const char **inend, Py_ssize_t *startinpos,
1551                 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1552                 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
1553{
1554    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
1555
1556    PyObject *restuple = NULL;
1557    PyObject *repunicode = NULL;
1558    Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1559    Py_ssize_t insize;
1560    Py_ssize_t requiredsize;
1561    Py_ssize_t newpos;
1562    Py_UNICODE *repptr;
1563    PyObject *inputobj = NULL;
1564    Py_ssize_t repsize;
1565    int res = -1;
1566
1567    if (*errorHandler == NULL) {
1568	*errorHandler = PyCodec_LookupError(errors);
1569	if (*errorHandler == NULL)
1570	   goto onError;
1571    }
1572
1573    if (*exceptionObject == NULL) {
1574    	*exceptionObject = PyUnicodeDecodeError_Create(
1575	    encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
1576	if (*exceptionObject == NULL)
1577	   goto onError;
1578    }
1579    else {
1580	if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1581	    goto onError;
1582	if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1583	    goto onError;
1584	if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1585	    goto onError;
1586    }
1587
1588    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1589    if (restuple == NULL)
1590	goto onError;
1591    if (!PyTuple_Check(restuple)) {
1592	PyErr_Format(PyExc_TypeError, &argparse[4]);
1593	goto onError;
1594    }
1595    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1596	goto onError;
1597
1598    /* Copy back the bytes variables, which might have been modified by the
1599       callback */
1600    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1601    if (!inputobj)
1602        goto onError;
1603    if (!PyBytes_Check(inputobj)) {
1604	PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1605    }
1606    *input = PyBytes_AS_STRING(inputobj);
1607    insize = PyBytes_GET_SIZE(inputobj);
1608    *inend = *input + insize;
1609    /* we can DECREF safely, as the exception has another reference,
1610       so the object won't go away. */
1611    Py_DECREF(inputobj);
1612
1613    if (newpos<0)
1614	newpos = insize+newpos;
1615    if (newpos<0 || newpos>insize) {
1616	PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1617	goto onError;
1618    }
1619
1620    /* need more space? (at least enough for what we
1621       have+the replacement+the rest of the string (starting
1622       at the new input position), so we won't have to check space
1623       when there are no errors in the rest of the string) */
1624    repptr = PyUnicode_AS_UNICODE(repunicode);
1625    repsize = PyUnicode_GET_SIZE(repunicode);
1626    requiredsize = *outpos + repsize + insize-newpos;
1627    if (requiredsize > outsize) {
1628	if (requiredsize<2*outsize)
1629	    requiredsize = 2*outsize;
1630	if (PyUnicode_Resize(output, requiredsize) < 0)
1631	    goto onError;
1632	*outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1633    }
1634    *endinpos = newpos;
1635    *inptr = *input + newpos;
1636    Py_UNICODE_COPY(*outptr, repptr, repsize);
1637    *outptr += repsize;
1638    *outpos += repsize;
1639
1640    /* we made it! */
1641    res = 0;
1642
1643    onError:
1644    Py_XDECREF(restuple);
1645    return res;
1646}
1647
1648/* --- UTF-7 Codec -------------------------------------------------------- */
1649
1650/* see RFC2152 for details */
1651
/* Classification table for the 128 ASCII code points, indexed by
   character code; consulted through the SPECIAL() macro. */
static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
	   0 - not special
	   1 - special
	   2 - whitespace (optional)
	   3 - RFC2152 Set O (optional) */
    /* 0x00-0x0F: TAB, LF and CR are whitespace */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10-0x1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20-0x2F: space is whitespace; '+' and '/' are special */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30-0x3F: digits, then ':' ';' '<' '=' '>' '?' */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40-0x4F: '@', then 'A'-'O' */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F: 'P'-'Z', then '[' '\' ']' '^' '_' ('\' is special) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60-0x6F: '`', then 'a'-'o' */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F: 'p'-'z', then '{' '|' '}' '~' DEL ('~', DEL special) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
1670
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true  */

/* SPECIAL(c, encodeO, encodeWS) is true when `c` cannot be written
   directly: it lies outside 1..127, or its utf7_special class is 1, or
   its class is 2 and encodeWS is set, or its class is 3 and encodeO is
   set.  Note that `c` is evaluated several times. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): the base64 alphabet character for the low 6 bits of `n`. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* B64CHAR(c): true when `c` is alphanumeric (per ISALNUM), '+' or '/'. */
#define B64CHAR(c) \
    (ISALNUM(c) || (c) == '+' || (c) == '/')
/* UB64(c): inverse of B64() -- maps '+' to 62, '/' to 63, 'a'..'z' to
   26..51, 'A'..'Z' to 0..25 and everything else via (c) + 4 (which puts
   '0'..'9' at 52..61).  Only meaningful for characters that satisfy
   B64CHAR(). */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least 6 bits are pending in the bit
   buffer `ch`, write the base64 character for the topmost 6 pending bits
   to *out, advance `out` and reduce `bits` by 6.  `ch` itself is not
   modified. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* DECODE(out, ch, bits, surrogate): while at least 16 bits are pending
   in the bit buffer `ch`, take the topmost 16 pending bits as one code
   unit and reduce `bits` by 16.  If `surrogate` is set, the unit is
   dropped and the flag cleared.  Otherwise a unit in 0xDC00..0xDFFF sets
   `surrogate`, assigns `errmsg` and jumps to the `utf7Error` label; any
   other unit is appended to `out`.

   The expanding scope must therefore provide a variable named `errmsg`
   and a label named `utf7Error`. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
1713
1714PyObject *PyUnicode_DecodeUTF7(const char *s,
1715			       Py_ssize_t size,
1716			       const char *errors)
1717{
1718    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1719}
1720
/* Decode `size` bytes of UTF-7 data starting at `s`.

   s, size    input buffer and its length in bytes
   errors     error handler name, passed on to
              unicode_decode_call_errorhandler()
   consumed   if NULL, input ending inside a shift sequence is reported
              as an "unterminated shift sequence" error; if non-NULL, no
              such error is raised and *consumed receives the number of
              bytes the caller should treat as processed (see the end of
              the function)

   The output object is allocated with room for `size` code units up
   front and trimmed to the number actually written before returning.

   Returns a new unicode object, or NULL with an exception set.

   NOTE(review): startinpos is only assigned in the '+' and the
   "unexpected special character" branches outside a shift sequence;
   errors raised while inside a shift sequence report whatever value was
   stored there last. */
PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
			       Py_ssize_t size,
			       const char *errors,
			       Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const char *errmsg = "";
    int inShift = 0;
    /* bitsleft counts the pending bits held in the bit buffer charsleft */
    unsigned int bitsleft = 0;
    unsigned long charsleft = 0;
    int surrogate = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    p = unicode->str;
    e = s + size;

    while (s < e) {
        Py_UNICODE ch;
        /* re-entry point used after the "unterminated shift sequence"
           error handler below moved s back into the input */
        restart:
        ch = (unsigned char) *s;

        if (inShift) {
            /* '-' or any non-base64 character ends the shift sequence */
            if ((ch == '-') || !B64CHAR(ch)) {
                inShift = 0;
                s++;

                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
                if (bitsleft >= 6) {
                    /* The shift sequence has a partial character in it. If
                       bitsleft < 6 then we could just classify it as padding
                       but that is not the case here */

                    errmsg = "partial character in shift sequence";
                    goto utf7Error;
                }
                /* According to RFC2152 the remaining bits should be zero. We
                   choose to signal an error/insert a replacement character
                   here to indicate the potential of a misencoded character. */

                /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
                if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
                    errmsg = "non-zero padding bits in shift sequence";
                    goto utf7Error;
                }

                if (ch == '-') {
                    /* a '-' terminator directly followed by another '-':
                       emit a literal '-' and stay in shift mode */
                    if ((s < e) && (*(s) == '-')) {
                        *p++ = '-';
                        inShift = 1;
                    }
                } else if (SPECIAL(ch,0,0)) {
                    errmsg = "unexpected special character";
	                goto utf7Error;
                } else  {
                    /* the terminating character is itself literal output */
                    *p++ = ch;
                }
            } else {
                /* base64 character: append its 6 bits to the bit buffer and
                   emit any complete 16-bit units */
                charsleft = (charsleft << 6) | UB64(ch);
                bitsleft += 6;
                s++;
                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
            }
        }
        else if ( ch == '+' ) {
            startinpos = s-starts;
            s++;
            /* "+-" stands for a literal '+'; any other '+' opens a shift
               sequence */
            if (s < e && *s == '-') {
                s++;
                *p++ = '+';
            } else
            {
                inShift = 1;
                bitsleft = 0;
            }
        }
        else if (SPECIAL(ch,0,0)) {
            startinpos = s-starts;
            errmsg = "unexpected special character";
            s++;
            goto utf7Error;
        }
        else {
            /* directly encoded character */
            *p++ = ch;
            s++;
        }
        continue;
    utf7Error:
        /* also the jump target of the DECODE() macro */
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler(
             errors, &errorHandler,
             "utf7", errmsg,
             &starts, &e, &startinpos, &endinpos, &exc, &s,
             (PyObject **)&unicode, &outpos, &p))
        goto onError;
    }

    /* input exhausted inside a shift sequence: an error only when the
       caller did not ask for the consumed count */
    if (inShift && !consumed) {
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = size;
        if (unicode_decode_call_errorhandler(
             errors, &errorHandler,
             "utf7", "unterminated shift sequence",
             &starts, &e, &startinpos, &endinpos, &exc, &s,
             (PyObject **)&unicode, &outpos, &p))
            goto onError;
        /* the error handler may have repositioned s inside the input */
        if (s < e)
           goto restart;
    }
    if (consumed) {
        /* an unfinished shift sequence is reported via startinpos rather
           than the current position */
        if(inShift)
            *consumed = startinpos;
        else
            *consumed = s-starts;
    }

    /* trim the over-allocated output to the number of units written */
    if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
1866
1867
/* Encode `size` Py_UNICODE code units starting at `s` as UTF-7.

   s, size            input buffer and its length in code units
   encodeSetO         passed to SPECIAL(): when true, characters of
                      class 3 in utf7_special (RFC2152 Set O) are
                      base64-encoded instead of written directly
   encodeWhiteSpace   passed to SPECIAL(): likewise for class 2
                      (whitespace)
   errors             not referenced by this function

   The output is assembled in a scratch bytearray of 5 * size bytes and
   then copied into a bytes object of the exact length.

   Returns a new bytes object, or NULL with an exception set. */
PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
                   Py_ssize_t size,
                   int encodeSetO,
                   int encodeWhiteSpace,
                   const char *errors)
{
    PyObject *v, *result;
    /* It might be possible to tighten this worst case */
    Py_ssize_t cbAllocated = 5 * size;
    int inShift = 0;
    Py_ssize_t i = 0;
    /* bitsleft counts the pending bits held in the bit buffer charsleft */
    unsigned int bitsleft = 0;
    unsigned long charsleft = 0;
    char * out;
    char * start;

    if (size == 0)
       return PyBytes_FromStringAndSize(NULL, 0);

    /* NOTE(review): this overflow check runs after the multiplication
       above has already been carried out. */
    if (cbAllocated / 5 != size)
        return PyErr_NoMemory();

    v = PyByteArray_FromStringAndSize(NULL, cbAllocated);
    if (v == NULL)
        return NULL;

    start = out = PyByteArray_AS_STRING(v);
    for (;i < size; ++i) {
        Py_UNICODE ch = s[i];

        if (!inShift) {
            if (ch == '+') {
                /* a literal '+' is written as "+-" */
                *out++ = '+';
                *out++ = '-';
            } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
                /* open a shift sequence and start emitting base64 */
                charsleft = ch;
                bitsleft = 16;
                *out++ = '+';
                /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
                inShift = bitsleft > 0;
            } else {
                *out++ = (char) ch;
            }
        } else {
            if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
                /* flush the pending bits, padded on the right to a full
                   base64 character */
                *out++ = B64(charsleft << (6-bitsleft));
                charsleft = 0;
                bitsleft = 0;
                /* Characters not in the BASE64 set implicitly unshift the sequence
                   so no '-' is required, except if the character is itself a '-' */
                if (B64CHAR(ch) || ch == '-') {
                    *out++ = '-';
                }
                inShift = 0;
                *out++ = (char) ch;
            } else {
                bitsleft += 16;
                charsleft = (charsleft << 16) | ch;
                /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);

                /* If the next character is special then we don't need to terminate
                   the shift sequence. If the next character is not a BASE64 character
                   or '-' then the shift sequence will be terminated implicitly and we
                   don't have to insert a '-'. */

                if (bitsleft == 0) {
                    if (i + 1 < size) {
                        Py_UNICODE ch2 = s[i+1];

                        if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {

                        } else if (B64CHAR(ch2) || ch2 == '-') {
                            *out++ = '-';
                            inShift = 0;
                        } else {
                            inShift = 0;
                        }

                    }
                    else {
                        /* end of input: close the shift sequence */
                        *out++ = '-';
                        inShift = 0;
                    }
                }
            }
        }
    }
    /* input ended with bits still pending: flush them and close the
       shift sequence */
    if (bitsleft) {
        *out++= B64(charsleft << (6-bitsleft) );
        *out++ = '-';
    }

    /* copy exactly the bytes written into the final bytes object */
    result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), out - start);
    Py_DECREF(v);
    return result;
}
1964
1965#undef SPECIAL
1966#undef B64
1967#undef B64CHAR
1968#undef UB64
1969#undef ENCODE
1970#undef DECODE
1971
1972/* --- UTF-8 Codec -------------------------------------------------------- */
1973
/* Lookup table indexed by the value of the first byte of a UTF-8
   sequence. */
static
char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details */
    /* 0x00-0x7F: single-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: not valid as the first byte of a sequence */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xF7: four, 0xF8-0xFB: five, 0xFC-0xFD: six bytes;
       0xFE and 0xFF are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1995
1996PyObject *PyUnicode_DecodeUTF8(const char *s,
1997			       Py_ssize_t size,
1998			       const char *errors)
1999{
2000    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2001}
2002
2003PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
2004			                Py_ssize_t size,
2005			                const char *errors,
2006			                Py_ssize_t *consumed)
2007{
2008    const char *starts = s;
2009    int n;
2010    Py_ssize_t startinpos;
2011    Py_ssize_t endinpos;
2012    Py_ssize_t outpos;
2013    const char *e;
2014    PyUnicodeObject *unicode;
2015    Py_UNICODE *p;
2016    const char *errmsg = "";
2017    PyObject *errorHandler = NULL;
2018    PyObject *exc = NULL;
2019
2020    /* Note: size will always be longer than the resulting Unicode
2021       character count */
2022    unicode = _PyUnicode_New(size);
2023    if (!unicode)
2024        return NULL;
2025    if (size == 0) {
2026        if (consumed)
2027            *consumed = 0;
2028        return (PyObject *)unicode;
2029    }
2030
2031    /* Unpack UTF-8 encoded data */
2032    p = unicode->str;
2033    e = s + size;
2034
2035    while (s < e) {
2036        Py_UCS4 ch = (unsigned char)*s;
2037
2038        if (ch < 0x80) {
2039            *p++ = (Py_UNICODE)ch;
2040            s++;
2041            continue;
2042        }
2043
2044        n = utf8_code_length[ch];
2045
2046        if (s + n > e) {
2047	    if (consumed)
2048		break;
2049	    else {
2050		errmsg = "unexpected end of data";
2051		startinpos = s-starts;
2052		endinpos = size;
2053		goto utf8Error;
2054	    }
2055	}
2056
2057        switch (n) {
2058
2059        case 0:
2060            errmsg = "unexpected code byte";
2061	    startinpos = s-starts;
2062	    endinpos = startinpos+1;
2063	    goto utf8Error;
2064
2065        case 1:
2066            errmsg = "internal error";
2067	    startinpos = s-starts;
2068	    endinpos = startinpos+1;
2069	    goto utf8Error;
2070
2071        case 2:
2072            if ((s[1] & 0xc0) != 0x80) {
2073                errmsg = "invalid data";
2074		startinpos = s-starts;
2075		endinpos = startinpos+2;
2076		goto utf8Error;
2077	    }
2078            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
2079            if (ch < 0x80) {
2080		startinpos = s-starts;
2081		endinpos = startinpos+2;
2082                errmsg = "illegal encoding";
2083		goto utf8Error;
2084	    }
2085	    else
2086		*p++ = (Py_UNICODE)ch;
2087            break;
2088
2089        case 3:
2090            if ((s[1] & 0xc0) != 0x80 ||
2091                (s[2] & 0xc0) != 0x80) {
2092                errmsg = "invalid data";
2093		startinpos = s-starts;
2094		endinpos = startinpos+3;
2095		goto utf8Error;
2096	    }
2097            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
2098            if (ch < 0x0800) {
2099		/* Note: UTF-8 encodings of surrogates are considered
2100		   legal UTF-8 sequences;
2101
2102		   XXX For wide builds (UCS-4) we should probably try
2103		       to recombine the surrogates into a single code
2104		       unit.
2105		*/
2106                errmsg = "illegal encoding";
2107		startinpos = s-starts;
2108		endinpos = startinpos+3;
2109		goto utf8Error;
2110	    }
2111	    else
2112		*p++ = (Py_UNICODE)ch;
2113            break;
2114
2115        case 4:
2116            if ((s[1] & 0xc0) != 0x80 ||
2117                (s[2] & 0xc0) != 0x80 ||
2118                (s[3] & 0xc0) != 0x80) {
2119                errmsg = "invalid data";
2120		startinpos = s-starts;
2121		endinpos = startinpos+4;
2122		goto utf8Error;
2123	    }
2124            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2125                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2126            /* validate and convert to UTF-16 */
2127            if ((ch < 0x10000)        /* minimum value allowed for 4
2128					 byte encoding */
2129                || (ch > 0x10ffff))   /* maximum value allowed for
2130					 UTF-16 */
2131	    {
2132                errmsg = "illegal encoding";
2133		startinpos = s-starts;
2134		endinpos = startinpos+4;
2135		goto utf8Error;
2136	    }
2137#ifdef Py_UNICODE_WIDE
2138	    *p++ = (Py_UNICODE)ch;
2139#else
2140            /*  compute and append the two surrogates: */
2141
2142            /*  translate from 10000..10FFFF to 0..FFFF */
2143            ch -= 0x10000;
2144
2145            /*  high surrogate = top 10 bits added to D800 */
2146            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
2147
2148            /*  low surrogate = bottom 10 bits added to DC00 */
2149            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
2150#endif
2151            break;
2152
2153        default:
2154            /* Other sizes are only needed for UCS-4 */
2155            errmsg = "unsupported Unicode code range";
2156	    startinpos = s-starts;
2157	    endinpos = startinpos+n;
2158	    goto utf8Error;
2159        }
2160        s += n;
2161	continue;
2162
2163    utf8Error:
2164    outpos = p-PyUnicode_AS_UNICODE(unicode);
2165    if (unicode_decode_call_errorhandler(
2166	     errors, &errorHandler,
2167	     "utf8", errmsg,
2168	     &starts, &e, &startinpos, &endinpos, &exc, &s,
2169	     (PyObject **)&unicode, &outpos, &p))
2170	goto onError;
2171    }
2172    if (consumed)
2173	*consumed = s-starts;
2174
2175    /* Adjust length */
2176    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2177        goto onError;
2178
2179    Py_XDECREF(errorHandler);
2180    Py_XDECREF(exc);
2181    return (PyObject *)unicode;
2182
2183onError:
2184    Py_XDECREF(errorHandler);
2185    Py_XDECREF(exc);
2186    Py_DECREF(unicode);
2187    return NULL;
2188}
2189
2190/* Allocation strategy:  if the string is short, convert into a stack buffer
2191   and allocate exactly as much space needed at the end.  Else allocate the
2192   maximum possible needed (4 result bytes per Unicode character), and return
2193   the excess memory at the end.
2194*/
2195PyObject *
2196PyUnicode_EncodeUTF8(const Py_UNICODE *s,
2197		     Py_ssize_t size,
2198		     const char *errors)
2199{
2200#define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
2201
2202    Py_ssize_t i;                /* index into s of next input byte */
2203    PyObject *result;            /* result string object */
2204    char *p;                     /* next free byte in output buffer */
2205    Py_ssize_t nallocated;      /* number of result bytes allocated */
2206    Py_ssize_t nneeded;            /* number of result bytes needed */
2207    char stackbuf[MAX_SHORT_UNICHARS * 4];
2208
2209    assert(s != NULL);
2210    assert(size >= 0);
2211
2212    if (size <= MAX_SHORT_UNICHARS) {
2213        /* Write into the stack buffer; nallocated can't overflow.
2214         * At the end, we'll allocate exactly as much heap space as it
2215         * turns out we need.
2216         */
2217        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2218        result = NULL;   /* will allocate after we're done */
2219        p = stackbuf;
2220    }
2221    else {
2222        /* Overallocate on the heap, and give the excess back at the end. */
2223        nallocated = size * 4;
2224        if (nallocated / 4 != size)  /* overflow! */
2225            return PyErr_NoMemory();
2226        result = PyBytes_FromStringAndSize(NULL, nallocated);
2227        if (result == NULL)
2228            return NULL;
2229        p = PyBytes_AS_STRING(result);
2230    }
2231
2232    for (i = 0; i < size;) {
2233        Py_UCS4 ch = s[i++];
2234
2235        if (ch < 0x80)
2236            /* Encode ASCII */
2237            *p++ = (char) ch;
2238
2239        else if (ch < 0x0800) {
2240            /* Encode Latin-1 */
2241            *p++ = (char)(0xc0 | (ch >> 6));
2242            *p++ = (char)(0x80 | (ch & 0x3f));
2243        }
2244        else {
2245            /* Encode UCS2 Unicode ordinals */
2246            if (ch < 0x10000) {
2247                /* Special case: check for high surrogate */
2248                if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2249                    Py_UCS4 ch2 = s[i];
2250                    /* Check for low surrogate and combine the two to
2251                       form a UCS4 value */
2252                    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2253                        ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2254                        i++;
2255                        goto encodeUCS4;
2256                    }
2257                    /* Fall through: handles isolated high surrogates */
2258                }
2259                *p++ = (char)(0xe0 | (ch >> 12));
2260                *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2261                *p++ = (char)(0x80 | (ch & 0x3f));
2262                continue;
2263    	    }
2264encodeUCS4:
2265            /* Encode UCS4 Unicode ordinals */
2266            *p++ = (char)(0xf0 | (ch >> 18));
2267            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2268            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2269            *p++ = (char)(0x80 | (ch & 0x3f));
2270        }
2271    }
2272
2273    if (result == NULL) {
2274        /* This was stack allocated. */
2275        nneeded = p - stackbuf;
2276        assert(nneeded <= nallocated);
2277        result = PyBytes_FromStringAndSize(stackbuf, nneeded);
2278    }
2279    else {
2280        /* Cut back to size actually needed. */
2281        nneeded = p - PyBytes_AS_STRING(result);
2282        assert(nneeded <= nallocated);
2283        _PyBytes_Resize(&result, nneeded);
2284    }
2285    return result;
2286
2287#undef MAX_SHORT_UNICHARS
2288}
2289
2290PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2291{
2292    if (!PyUnicode_Check(unicode)) {
2293        PyErr_BadArgument();
2294        return NULL;
2295    }
2296    return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2297				PyUnicode_GET_SIZE(unicode),
2298				NULL);
2299}
2300
2301/* --- UTF-32 Codec ------------------------------------------------------- */
2302
2303PyObject *
2304PyUnicode_DecodeUTF32(const char *s,
2305		      Py_ssize_t size,
2306		      const char *errors,
2307		      int *byteorder)
2308{
2309    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2310}
2311
2312PyObject *
2313PyUnicode_DecodeUTF32Stateful(const char *s,
2314			      Py_ssize_t size,
2315			      const char *errors,
2316			      int *byteorder,
2317			      Py_ssize_t *consumed)
2318{
2319    const char *starts = s;
2320    Py_ssize_t startinpos;
2321    Py_ssize_t endinpos;
2322    Py_ssize_t outpos;
2323    PyUnicodeObject *unicode;
2324    Py_UNICODE *p;
2325#ifndef Py_UNICODE_WIDE
2326    int i, pairs;
2327#else
2328    const int pairs = 0;
2329#endif
2330    const unsigned char *q, *e;
2331    int bo = 0;       /* assume native ordering by default */
2332    const char *errmsg = "";
2333    /* Offsets from q for retrieving bytes in the right order. */
2334#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2335    int iorder[] = {0, 1, 2, 3};
2336#else
2337    int iorder[] = {3, 2, 1, 0};
2338#endif
2339    PyObject *errorHandler = NULL;
2340    PyObject *exc = NULL;
2341    /* On narrow builds we split characters outside the BMP into two
2342       codepoints => count how much extra space we need. */
2343#ifndef Py_UNICODE_WIDE
2344    for (i = pairs = 0; i < size/4; i++)
2345	if (((Py_UCS4 *)s)[i] >= 0x10000)
2346	    pairs++;
2347#endif
2348
2349    /* This might be one to much, because of a BOM */
2350    unicode = _PyUnicode_New((size+3)/4+pairs);
2351    if (!unicode)
2352        return NULL;
2353    if (size == 0)
2354        return (PyObject *)unicode;
2355
2356    /* Unpack UTF-32 encoded data */
2357    p = unicode->str;
2358    q = (unsigned char *)s;
2359    e = q + size;
2360
2361    if (byteorder)
2362        bo = *byteorder;
2363
2364    /* Check for BOM marks (U+FEFF) in the input and adjust current
2365       byte order setting accordingly. In native mode, the leading BOM
2366       mark is skipped, in all other modes, it is copied to the output
2367       stream as-is (giving a ZWNBSP character). */
2368    if (bo == 0) {
2369        if (size >= 4) {
2370            const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2371                                (q[iorder[1]] << 8) | q[iorder[0]];
2372#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2373	    if (bom == 0x0000FEFF) {
2374		q += 4;
2375		bo = -1;
2376	    }
2377	    else if (bom == 0xFFFE0000) {
2378		q += 4;
2379		bo = 1;
2380	    }
2381#else
2382	    if (bom == 0x0000FEFF) {
2383		q += 4;
2384		bo = 1;
2385	    }
2386	    else if (bom == 0xFFFE0000) {
2387		q += 4;
2388		bo = -1;
2389	    }
2390#endif
2391	}
2392    }
2393
2394    if (bo == -1) {
2395        /* force LE */
2396        iorder[0] = 0;
2397        iorder[1] = 1;
2398        iorder[2] = 2;
2399        iorder[3] = 3;
2400    }
2401    else if (bo == 1) {
2402        /* force BE */
2403        iorder[0] = 3;
2404        iorder[1] = 2;
2405        iorder[2] = 1;
2406        iorder[3] = 0;
2407    }
2408
2409    while (q < e) {
2410	Py_UCS4 ch;
2411	/* remaining bytes at the end? (size should be divisible by 4) */
2412	if (e-q<4) {
2413	    if (consumed)
2414		break;
2415	    errmsg = "truncated data";
2416	    startinpos = ((const char *)q)-starts;
2417	    endinpos = ((const char *)e)-starts;
2418	    goto utf32Error;
2419	    /* The remaining input chars are ignored if the callback
2420	       chooses to skip the input */
2421	}
2422	ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2423	     (q[iorder[1]] << 8) | q[iorder[0]];
2424
2425	if (ch >= 0x110000)
2426	{
2427	    errmsg = "codepoint not in range(0x110000)";
2428	    startinpos = ((const char *)q)-starts;
2429	    endinpos = startinpos+4;
2430	    goto utf32Error;
2431	}
2432#ifndef Py_UNICODE_WIDE
2433	if (ch >= 0x10000)
2434	{
2435	    *p++ = 0xD800 | ((ch-0x10000) >> 10);
2436	    *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2437	}
2438	else
2439#endif
2440	    *p++ = ch;
2441	q += 4;
2442	continue;
2443    utf32Error:
2444	outpos = p-PyUnicode_AS_UNICODE(unicode);
2445	if (unicode_decode_call_errorhandler(
2446	         errors, &errorHandler,
2447	         "utf32", errmsg,
2448	         &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2449	         (PyObject **)&unicode, &outpos, &p))
2450	    goto onError;
2451    }
2452
2453    if (byteorder)
2454        *byteorder = bo;
2455
2456    if (consumed)
2457	*consumed = (const char *)q-starts;
2458
2459    /* Adjust length */
2460    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2461        goto onError;
2462
2463    Py_XDECREF(errorHandler);
2464    Py_XDECREF(exc);
2465    return (PyObject *)unicode;
2466
2467onError:
2468    Py_DECREF(unicode);
2469    Py_XDECREF(errorHandler);
2470    Py_XDECREF(exc);
2471    return NULL;
2472}
2473
2474PyObject *
2475PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2476		      Py_ssize_t size,
2477		      const char *errors,
2478		      int byteorder)
2479{
2480    PyObject *v, *result;
2481    unsigned char *p;
2482    Py_ssize_t nsize, bytesize;
2483#ifndef Py_UNICODE_WIDE
2484    Py_ssize_t i, pairs;
2485#else
2486    const int pairs = 0;
2487#endif
2488    /* Offsets from p for storing byte pairs in the right order. */
2489#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2490    int iorder[] = {0, 1, 2, 3};
2491#else
2492    int iorder[] = {3, 2, 1, 0};
2493#endif
2494
2495#define STORECHAR(CH)                       \
2496    do {                                    \
2497        p[iorder[3]] = ((CH) >> 24) & 0xff; \
2498        p[iorder[2]] = ((CH) >> 16) & 0xff; \
2499        p[iorder[1]] = ((CH) >> 8) & 0xff;  \
2500        p[iorder[0]] = (CH) & 0xff;         \
2501        p += 4;                             \
2502    } while(0)
2503
2504    /* In narrow builds we can output surrogate pairs as one codepoint,
2505       so we need less space. */
2506#ifndef Py_UNICODE_WIDE
2507    for (i = pairs = 0; i < size-1; i++)
2508	if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2509	    0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2510	    pairs++;
2511#endif
2512    nsize = (size - pairs + (byteorder == 0));
2513    bytesize = nsize * 4;
2514    if (bytesize / 4 != nsize)
2515	return PyErr_NoMemory();
2516    v = PyByteArray_FromStringAndSize(NULL, bytesize);
2517    if (v == NULL)
2518        return NULL;
2519
2520    p = (unsigned char *)PyByteArray_AS_STRING(v);
2521    if (byteorder == 0)
2522	STORECHAR(0xFEFF);
2523    if (size == 0)
2524        goto done;
2525
2526    if (byteorder == -1) {
2527        /* force LE */
2528        iorder[0] = 0;
2529        iorder[1] = 1;
2530        iorder[2] = 2;
2531        iorder[3] = 3;
2532    }
2533    else if (byteorder == 1) {
2534        /* force BE */
2535        iorder[0] = 3;
2536        iorder[1] = 2;
2537        iorder[2] = 1;
2538        iorder[3] = 0;
2539    }
2540
2541    while (size-- > 0) {
2542	Py_UCS4 ch = *s++;
2543#ifndef Py_UNICODE_WIDE
2544	if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2545	    Py_UCS4 ch2 = *s;
2546	    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2547		ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2548		s++;
2549		size--;
2550	    }
2551	}
2552#endif
2553        STORECHAR(ch);
2554    }
2555
2556  done:
2557    result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2558    Py_DECREF(v);
2559    return result;
2560#undef STORECHAR
2561}
2562
2563PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2564{
2565    if (!PyUnicode_Check(unicode)) {
2566        PyErr_BadArgument();
2567        return NULL;
2568    }
2569    return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2570				 PyUnicode_GET_SIZE(unicode),
2571				 NULL,
2572				 0);
2573}
2574
2575/* --- UTF-16 Codec ------------------------------------------------------- */
2576
2577PyObject *
2578PyUnicode_DecodeUTF16(const char *s,
2579		      Py_ssize_t size,
2580		      const char *errors,
2581		      int *byteorder)
2582{
2583    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2584}
2585
2586PyObject *
2587PyUnicode_DecodeUTF16Stateful(const char *s,
2588			      Py_ssize_t size,
2589			      const char *errors,
2590			      int *byteorder,
2591			      Py_ssize_t *consumed)
2592{
2593    const char *starts = s;
2594    Py_ssize_t startinpos;
2595    Py_ssize_t endinpos;
2596    Py_ssize_t outpos;
2597    PyUnicodeObject *unicode;
2598    Py_UNICODE *p;
2599    const unsigned char *q, *e;
2600    int bo = 0;       /* assume native ordering by default */
2601    const char *errmsg = "";
2602    /* Offsets from q for retrieving byte pairs in the right order. */
2603#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2604    int ihi = 1, ilo = 0;
2605#else
2606    int ihi = 0, ilo = 1;
2607#endif
2608    PyObject *errorHandler = NULL;
2609    PyObject *exc = NULL;
2610
2611    /* Note: size will always be longer than the resulting Unicode
2612       character count */
2613    unicode = _PyUnicode_New(size);
2614    if (!unicode)
2615        return NULL;
2616    if (size == 0)
2617        return (PyObject *)unicode;
2618
2619    /* Unpack UTF-16 encoded data */
2620    p = unicode->str;
2621    q = (unsigned char *)s;
2622    e = q + size;
2623
2624    if (byteorder)
2625        bo = *byteorder;
2626
2627    /* Check for BOM marks (U+FEFF) in the input and adjust current
2628       byte order setting accordingly. In native mode, the leading BOM
2629       mark is skipped, in all other modes, it is copied to the output
2630       stream as-is (giving a ZWNBSP character). */
2631    if (bo == 0) {
2632        if (size >= 2) {
2633            const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
2634#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2635	    if (bom == 0xFEFF) {
2636		q += 2;
2637		bo = -1;
2638	    }
2639	    else if (bom == 0xFFFE) {
2640		q += 2;
2641		bo = 1;
2642	    }
2643#else
2644	    if (bom == 0xFEFF) {
2645		q += 2;
2646		bo = 1;
2647	    }
2648	    else if (bom == 0xFFFE) {
2649		q += 2;
2650		bo = -1;
2651	    }
2652#endif
2653	}
2654    }
2655
2656    if (bo == -1) {
2657        /* force LE */
2658        ihi = 1;
2659        ilo = 0;
2660    }
2661    else if (bo == 1) {
2662        /* force BE */
2663        ihi = 0;
2664        ilo = 1;
2665    }
2666
2667    while (q < e) {
2668	Py_UNICODE ch;
2669	/* remaining bytes at the end? (size should be even) */
2670	if (e-q<2) {
2671	    if (consumed)
2672		break;
2673	    errmsg = "truncated data";
2674	    startinpos = ((const char *)q)-starts;
2675	    endinpos = ((const char *)e)-starts;
2676	    goto utf16Error;
2677	    /* The remaining input chars are ignored if the callback
2678	       chooses to skip the input */
2679	}
2680	ch = (q[ihi] << 8) | q[ilo];
2681
2682	q += 2;
2683
2684	if (ch < 0xD800 || ch > 0xDFFF) {
2685	    *p++ = ch;
2686	    continue;
2687	}
2688
2689	/* UTF-16 code pair: */
2690	if (q >= e) {
2691	    errmsg = "unexpected end of data";
2692	    startinpos = (((const char *)q)-2)-starts;
2693	    endinpos = ((const char *)e)-starts;
2694	    goto utf16Error;
2695	}
2696	if (0xD800 <= ch && ch <= 0xDBFF) {
2697	    Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2698	    q += 2;
2699	    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2700#ifndef Py_UNICODE_WIDE
2701		*p++ = ch;
2702		*p++ = ch2;
2703#else
2704		*p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2705#endif
2706		continue;
2707	    }
2708	    else {
2709                errmsg = "illegal UTF-16 surrogate";
2710		startinpos = (((const char *)q)-4)-starts;
2711		endinpos = startinpos+2;
2712		goto utf16Error;
2713	    }
2714
2715	}
2716	errmsg = "illegal encoding";
2717	startinpos = (((const char *)q)-2)-starts;
2718	endinpos = startinpos+2;
2719	/* Fall through to report the error */
2720
2721    utf16Error:
2722	outpos = p-PyUnicode_AS_UNICODE(unicode);
2723	if (unicode_decode_call_errorhandler(
2724	         errors, &errorHandler,
2725	         "utf16", errmsg,
2726	         &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2727	         (PyObject **)&unicode, &outpos, &p))
2728	    goto onError;
2729    }
2730
2731    if (byteorder)
2732        *byteorder = bo;
2733
2734    if (consumed)
2735	*consumed = (const char *)q-starts;
2736
2737    /* Adjust length */
2738    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2739        goto onError;
2740
2741    Py_XDECREF(errorHandler);
2742    Py_XDECREF(exc);
2743    return (PyObject *)unicode;
2744
2745onError:
2746    Py_DECREF(unicode);
2747    Py_XDECREF(errorHandler);
2748    Py_XDECREF(exc);
2749    return NULL;
2750}
2751
2752PyObject *
2753PyUnicode_EncodeUTF16(const Py_UNICODE *s,
2754		      Py_ssize_t size,
2755		      const char *errors,
2756		      int byteorder)
2757{
2758    PyObject *v, *result;
2759    unsigned char *p;
2760    Py_ssize_t nsize, bytesize;
2761#ifdef Py_UNICODE_WIDE
2762    Py_ssize_t i, pairs;
2763#else
2764    const int pairs = 0;
2765#endif
2766    /* Offsets from p for storing byte pairs in the right order. */
2767#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2768    int ihi = 1, ilo = 0;
2769#else
2770    int ihi = 0, ilo = 1;
2771#endif
2772
2773#define STORECHAR(CH)                   \
2774    do {                                \
2775        p[ihi] = ((CH) >> 8) & 0xff;    \
2776        p[ilo] = (CH) & 0xff;           \
2777        p += 2;                         \
2778    } while(0)
2779
2780#ifdef Py_UNICODE_WIDE
2781    for (i = pairs = 0; i < size; i++)
2782	if (s[i] >= 0x10000)
2783	    pairs++;
2784#endif
2785    /* 2 * (size + pairs + (byteorder == 0)) */
2786    if (size > PY_SSIZE_T_MAX ||
2787        size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
2788	return PyErr_NoMemory();
2789    nsize = size + pairs + (byteorder == 0);
2790    bytesize = nsize * 2;
2791    if (bytesize / 2 != nsize)
2792	return PyErr_NoMemory();
2793    v = PyByteArray_FromStringAndSize(NULL, bytesize);
2794    if (v == NULL)
2795        return NULL;
2796
2797    p = (unsigned char *)PyByteArray_AS_STRING(v);
2798    if (byteorder == 0)
2799	STORECHAR(0xFEFF);
2800    if (size == 0)
2801        goto done;
2802
2803    if (byteorder == -1) {
2804        /* force LE */
2805        ihi = 1;
2806        ilo = 0;
2807    }
2808    else if (byteorder == 1) {
2809        /* force BE */
2810        ihi = 0;
2811        ilo = 1;
2812    }
2813
2814    while (size-- > 0) {
2815	Py_UNICODE ch = *s++;
2816	Py_UNICODE ch2 = 0;
2817#ifdef Py_UNICODE_WIDE
2818	if (ch >= 0x10000) {
2819	    ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2820	    ch  = 0xD800 | ((ch-0x10000) >> 10);
2821	}
2822#endif
2823        STORECHAR(ch);
2824        if (ch2)
2825            STORECHAR(ch2);
2826    }
2827
2828  done:
2829    result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2830    Py_DECREF(v);
2831    return result;
2832#undef STORECHAR
2833}
2834
2835PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2836{
2837    if (!PyUnicode_Check(unicode)) {
2838        PyErr_BadArgument();
2839        return NULL;
2840    }
2841    return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2842				 PyUnicode_GET_SIZE(unicode),
2843				 NULL,
2844				 0);
2845}
2846
2847/* --- Unicode Escape Codec ----------------------------------------------- */
2848
2849static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
2850
2851PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
2852					Py_ssize_t size,
2853					const char *errors)
2854{
2855    const char *starts = s;
2856    Py_ssize_t startinpos;
2857    Py_ssize_t endinpos;
2858    Py_ssize_t outpos;
2859    int i;
2860    PyUnicodeObject *v;
2861    Py_UNICODE *p;
2862    const char *end;
2863    char* message;
2864    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
2865    PyObject *errorHandler = NULL;
2866    PyObject *exc = NULL;
2867
2868    /* Escaped strings will always be longer than the resulting
2869       Unicode string, so we start with size here and then reduce the
2870       length after conversion to the true value.
2871       (but if the error callback returns a long replacement string
2872       we'll have to allocate more space) */
2873    v = _PyUnicode_New(size);
2874    if (v == NULL)
2875        goto onError;
2876    if (size == 0)
2877        return (PyObject *)v;
2878
2879    p = PyUnicode_AS_UNICODE(v);
2880    end = s + size;
2881
2882    while (s < end) {
2883        unsigned char c;
2884        Py_UNICODE x;
2885        int digits;
2886
2887        /* Non-escape characters are interpreted as Unicode ordinals */
2888        if (*s != '\\') {
2889            *p++ = (unsigned char) *s++;
2890            continue;
2891        }
2892
2893        startinpos = s-starts;
2894        /* \ - Escapes */
2895        s++;
2896        c = *s++;
2897        if (s > end)
2898            c = '\0'; /* Invalid after \ */
2899        switch (c) {
2900
2901        /* \x escapes */
2902        case '\n': break;
2903        case '\\': *p++ = '\\'; break;
2904        case '\'': *p++ = '\''; break;
2905        case '\"': *p++ = '\"'; break;
2906        case 'b': *p++ = '\b'; break;
2907        case 'f': *p++ = '\014'; break; /* FF */
2908        case 't': *p++ = '\t'; break;
2909        case 'n': *p++ = '\n'; break;
2910        case 'r': *p++ = '\r'; break;
2911        case 'v': *p++ = '\013'; break; /* VT */
2912        case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2913
2914        /* \OOO (octal) escapes */
2915        case '0': case '1': case '2': case '3':
2916        case '4': case '5': case '6': case '7':
2917            x = s[-1] - '0';
2918            if (s < end && '0' <= *s && *s <= '7') {
2919                x = (x<<3) + *s++ - '0';
2920                if (s < end && '0' <= *s && *s <= '7')
2921                    x = (x<<3) + *s++ - '0';
2922            }
2923            *p++ = x;
2924            break;
2925
2926        /* hex escapes */
2927        /* \xXX */
2928        case 'x':
2929            digits = 2;
2930            message = "truncated \\xXX escape";
2931            goto hexescape;
2932
2933        /* \uXXXX */
2934        case 'u':
2935            digits = 4;
2936            message = "truncated \\uXXXX escape";
2937            goto hexescape;
2938
2939        /* \UXXXXXXXX */
2940        case 'U':
2941            digits = 8;
2942            message = "truncated \\UXXXXXXXX escape";
2943        hexescape:
2944            chr = 0;
2945            outpos = p-PyUnicode_AS_UNICODE(v);
2946            if (s+digits>end) {
2947                endinpos = size;
2948                if (unicode_decode_call_errorhandler(
2949                    errors, &errorHandler,
2950                    "unicodeescape", "end of string in escape sequence",
2951                    &starts, &end, &startinpos, &endinpos, &exc, &s,
2952                    (PyObject **)&v, &outpos, &p))
2953                    goto onError;
2954                goto nextByte;
2955            }
2956            for (i = 0; i < digits; ++i) {
2957                c = (unsigned char) s[i];
2958                if (!ISXDIGIT(c)) {
2959                    endinpos = (s+i+1)-starts;
2960                    if (unicode_decode_call_errorhandler(
2961                        errors, &errorHandler,
2962                        "unicodeescape", message,
2963                        &starts, &end, &startinpos, &endinpos, &exc, &s,
2964                        (PyObject **)&v, &outpos, &p))
2965                        goto onError;
2966                    goto nextByte;
2967                }
2968                chr = (chr<<4) & ~0xF;
2969                if (c >= '0' && c <= '9')
2970                    chr += c - '0';
2971                else if (c >= 'a' && c <= 'f')
2972                    chr += 10 + c - 'a';
2973                else
2974                    chr += 10 + c - 'A';
2975            }
2976            s += i;
2977            if (chr == 0xffffffff && PyErr_Occurred())
2978                /* _decoding_error will have already written into the
2979                   target buffer. */
2980                break;
2981        store:
2982            /* when we get here, chr is a 32-bit unicode character */
2983            if (chr <= 0xffff)
2984                /* UCS-2 character */
2985                *p++ = (Py_UNICODE) chr;
2986            else if (chr <= 0x10ffff) {
2987                /* UCS-4 character. Either store directly, or as
2988                   surrogate pair. */
2989#ifdef Py_UNICODE_WIDE
2990                *p++ = chr;
2991#else
2992                chr -= 0x10000L;
2993                *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
2994                *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
2995#endif
2996            } else {
2997                endinpos = s-starts;
2998                outpos = p-PyUnicode_AS_UNICODE(v);
2999                if (unicode_decode_call_errorhandler(
3000                    errors, &errorHandler,
3001                    "unicodeescape", "illegal Unicode character",
3002                    &starts, &end, &startinpos, &endinpos, &exc, &s,
3003                    (PyObject **)&v, &outpos, &p))
3004                    goto onError;
3005            }
3006            break;
3007
3008        /* \N{name} */
3009        case 'N':
3010            message = "malformed \\N character escape";
3011            if (ucnhash_CAPI == NULL) {
3012                /* load the unicode data module */
3013                PyObject *m, *api;
3014                m = PyImport_ImportModuleNoBlock("unicodedata");
3015                if (m == NULL)
3016                    goto ucnhashError;
3017                api = PyObject_GetAttrString(m, "ucnhash_CAPI");
3018                Py_DECREF(m);
3019                if (api == NULL)
3020                    goto ucnhashError;
3021                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
3022                Py_DECREF(api);
3023                if (ucnhash_CAPI == NULL)
3024                    goto ucnhashError;
3025            }
3026            if (*s == '{') {
3027                const char *start = s+1;
3028                /* look for the closing brace */
3029                while (*s != '}' && s < end)
3030                    s++;
3031                if (s > start && s < end && *s == '}') {
3032                    /* found a name.  look it up in the unicode database */
3033                    message = "unknown Unicode character name";
3034                    s++;
3035                    if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
3036                        goto store;
3037                }
3038            }
3039            endinpos = s-starts;
3040            outpos = p-PyUnicode_AS_UNICODE(v);
3041            if (unicode_decode_call_errorhandler(
3042                errors, &errorHandler,
3043                "unicodeescape", message,
3044                &starts, &end, &startinpos, &endinpos, &exc, &s,
3045                (PyObject **)&v, &outpos, &p))
3046                goto onError;
3047            break;
3048
3049        default:
3050            if (s > end) {
3051                message = "\\ at end of string";
3052                s--;
3053                endinpos = s-starts;
3054                outpos = p-PyUnicode_AS_UNICODE(v);
3055                if (unicode_decode_call_errorhandler(
3056                    errors, &errorHandler,
3057                    "unicodeescape", message,
3058                    &starts, &end, &startinpos, &endinpos, &exc, &s,
3059                    (PyObject **)&v, &outpos, &p))
3060                    goto onError;
3061            }
3062            else {
3063                *p++ = '\\';
3064                *p++ = (unsigned char)s[-1];
3065            }
3066            break;
3067        }
3068        nextByte:
3069        ;
3070    }
3071    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3072        goto onError;
3073    Py_XDECREF(errorHandler);
3074    Py_XDECREF(exc);
3075    return (PyObject *)v;
3076
3077ucnhashError:
3078    PyErr_SetString(
3079        PyExc_UnicodeError,
3080        "\\N escapes not supported (can't load unicodedata module)"
3081        );
3082    Py_XDECREF(v);
3083    Py_XDECREF(errorHandler);
3084    Py_XDECREF(exc);
3085    return NULL;
3086
3087onError:
3088    Py_XDECREF(v);
3089    Py_XDECREF(errorHandler);
3090    Py_XDECREF(exc);
3091    return NULL;
3092}
3093
3094/* Return a Unicode-Escape string version of the Unicode object.
3095
3096   If quotes is true, the string is enclosed in u"" or u'' quotes as
3097   appropriate.
3098
3099*/
3100
3101Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
3102                                      Py_ssize_t size,
3103                                      Py_UNICODE ch)
3104{
3105    /* like wcschr, but doesn't stop at NULL characters */
3106
3107    while (size-- > 0) {
3108        if (*s == ch)
3109            return s;
3110        s++;
3111    }
3112
3113    return NULL;
3114}
3115
3116static const char *hexdigits = "0123456789abcdef";
3117
3118PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3119					Py_ssize_t size)
3120{
3121    PyObject *repr, *result;
3122    char *p;
3123
3124#ifdef Py_UNICODE_WIDE
3125    const Py_ssize_t expandsize = 10;
3126#else
3127    const Py_ssize_t expandsize = 6;
3128#endif
3129
3130    /* XXX(nnorwitz): rather than over-allocating, it would be
3131       better to choose a different scheme.  Perhaps scan the
3132       first N-chars of the string and allocate based on that size.
3133    */
3134    /* Initial allocation is based on the longest-possible unichr
3135       escape.
3136
3137       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3138       unichr, so in this case it's the longest unichr escape. In
3139       narrow (UTF-16) builds this is five chars per source unichr
3140       since there are two unichrs in the surrogate pair, so in narrow
3141       (UTF-16) builds it's not the longest unichr escape.
3142
3143       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3144       so in the narrow (UTF-16) build case it's the longest unichr
3145       escape.
3146    */
3147
3148    if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
3149	return PyErr_NoMemory();
3150
3151    repr = PyByteArray_FromStringAndSize(NULL,
3152        2
3153        + expandsize*size
3154        + 1);
3155    if (repr == NULL)
3156        return NULL;
3157
3158    p = PyByteArray_AS_STRING(repr);
3159
3160    while (size-- > 0) {
3161        Py_UNICODE ch = *s++;
3162
3163        /* Escape backslashes */
3164        if (ch == '\\') {
3165            *p++ = '\\';
3166            *p++ = (char) ch;
3167            continue;
3168        }
3169
3170#ifdef Py_UNICODE_WIDE
3171        /* Map 21-bit characters to '\U00xxxxxx' */
3172        else if (ch >= 0x10000) {
3173            *p++ = '\\';
3174            *p++ = 'U';
3175            *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3176            *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3177            *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3178            *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3179            *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3180            *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3181            *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3182            *p++ = hexdigits[ch & 0x0000000F];
3183	    continue;
3184        }
3185#else
3186	/* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3187	else if (ch >= 0xD800 && ch < 0xDC00) {
3188	    Py_UNICODE ch2;
3189	    Py_UCS4 ucs;
3190
3191	    ch2 = *s++;
3192	    size--;
3193	    if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3194		ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3195		*p++ = '\\';
3196		*p++ = 'U';
3197		*p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3198		*p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3199		*p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3200		*p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3201		*p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3202		*p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3203		*p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3204		*p++ = hexdigits[ucs & 0x0000000F];
3205		continue;
3206	    }
3207	    /* Fall through: isolated surrogates are copied as-is */
3208	    s--;
3209	    size++;
3210	}
3211#endif
3212
3213        /* Map 16-bit characters to '\uxxxx' */
3214        if (ch >= 256) {
3215            *p++ = '\\';
3216            *p++ = 'u';
3217            *p++ = hexdigits[(ch >> 12) & 0x000F];
3218            *p++ = hexdigits[(ch >> 8) & 0x000F];
3219            *p++ = hexdigits[(ch >> 4) & 0x000F];
3220            *p++ = hexdigits[ch & 0x000F];
3221        }
3222
3223        /* Map special whitespace to '\t', \n', '\r' */
3224        else if (ch == '\t') {
3225            *p++ = '\\';
3226            *p++ = 't';
3227        }
3228        else if (ch == '\n') {
3229            *p++ = '\\';
3230            *p++ = 'n';
3231        }
3232        else if (ch == '\r') {
3233            *p++ = '\\';
3234            *p++ = 'r';
3235        }
3236
3237        /* Map non-printable US ASCII to '\xhh' */
3238        else if (ch < ' ' || ch >= 0x7F) {
3239            *p++ = '\\';
3240            *p++ = 'x';
3241            *p++ = hexdigits[(ch >> 4) & 0x000F];
3242            *p++ = hexdigits[ch & 0x000F];
3243        }
3244
3245        /* Copy everything else as-is */
3246        else
3247            *p++ = (char) ch;
3248    }
3249
3250    result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr),
3251                                        p - PyByteArray_AS_STRING(repr));
3252    Py_DECREF(repr);
3253    return result;
3254}
3255
3256PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3257{
3258    PyObject *s, *result;
3259    if (!PyUnicode_Check(unicode)) {
3260        PyErr_BadArgument();
3261        return NULL;
3262    }
3263    s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3264                                      PyUnicode_GET_SIZE(unicode));
3265
3266    if (!s)
3267        return NULL;
3268    result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(s),
3269                                        PyByteArray_GET_SIZE(s));
3270    Py_DECREF(s);
3271    return result;
3272}
3273
3274/* --- Raw Unicode Escape Codec ------------------------------------------- */
3275
3276PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
3277					   Py_ssize_t size,
3278					   const char *errors)
3279{
3280    const char *starts = s;
3281    Py_ssize_t startinpos;
3282    Py_ssize_t endinpos;
3283    Py_ssize_t outpos;
3284    PyUnicodeObject *v;
3285    Py_UNICODE *p;
3286    const char *end;
3287    const char *bs;
3288    PyObject *errorHandler = NULL;
3289    PyObject *exc = NULL;
3290
3291    /* Escaped strings will always be longer than the resulting
3292       Unicode string, so we start with size here and then reduce the
3293       length after conversion to the true value. (But decoding error
3294       handler might have to resize the string) */
3295    v = _PyUnicode_New(size);
3296    if (v == NULL)
3297	goto onError;
3298    if (size == 0)
3299	return (PyObject *)v;
3300    p = PyUnicode_AS_UNICODE(v);
3301    end = s + size;
3302    while (s < end) {
3303	unsigned char c;
3304	Py_UCS4 x;
3305	int i;
3306        int count;
3307
3308	/* Non-escape characters are interpreted as Unicode ordinals */
3309	if (*s != '\\') {
3310	    *p++ = (unsigned char)*s++;
3311	    continue;
3312	}
3313	startinpos = s-starts;
3314
3315	/* \u-escapes are only interpreted iff the number of leading
3316	   backslashes if odd */
3317	bs = s;
3318	for (;s < end;) {
3319	    if (*s != '\\')
3320		break;
3321	    *p++ = (unsigned char)*s++;
3322	}
3323	if (((s - bs) & 1) == 0 ||
3324	    s >= end ||
3325	    (*s != 'u' && *s != 'U')) {
3326	    continue;
3327	}
3328	p--;
3329        count = *s=='u' ? 4 : 8;
3330	s++;
3331
3332	/* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3333	outpos = p-PyUnicode_AS_UNICODE(v);
3334	for (x = 0, i = 0; i < count; ++i, ++s) {
3335	    c = (unsigned char)*s;
3336	    if (!ISXDIGIT(c)) {
3337		endinpos = s-starts;
3338		if (unicode_decode_call_errorhandler(
3339		    errors, &errorHandler,
3340		    "rawunicodeescape", "truncated \\uXXXX",
3341		    &starts, &end, &startinpos, &endinpos, &exc, &s,
3342		    (PyObject **)&v, &outpos, &p))
3343		    goto onError;
3344		goto nextByte;
3345	    }
3346	    x = (x<<4) & ~0xF;
3347	    if (c >= '0' && c <= '9')
3348		x += c - '0';
3349	    else if (c >= 'a' && c <= 'f')
3350		x += 10 + c - 'a';
3351	    else
3352		x += 10 + c - 'A';
3353	}
3354        if (x <= 0xffff)
3355                /* UCS-2 character */
3356                *p++ = (Py_UNICODE) x;
3357        else if (x <= 0x10ffff) {
3358                /* UCS-4 character. Either store directly, or as
3359                   surrogate pair. */
3360#ifdef Py_UNICODE_WIDE
3361                *p++ = (Py_UNICODE) x;
3362#else
3363                x -= 0x10000L;
3364                *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3365                *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3366#endif
3367        } else {
3368            endinpos = s-starts;
3369            outpos = p-PyUnicode_AS_UNICODE(v);
3370            if (unicode_decode_call_errorhandler(
3371                    errors, &errorHandler,
3372                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
3373		    &starts, &end, &startinpos, &endinpos, &exc, &s,
3374		    (PyObject **)&v, &outpos, &p))
3375		    goto onError;
3376        }
3377	nextByte:
3378	;
3379    }
3380    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3381	goto onError;
3382    Py_XDECREF(errorHandler);
3383    Py_XDECREF(exc);
3384    return (PyObject *)v;
3385
3386 onError:
3387    Py_XDECREF(v);
3388    Py_XDECREF(errorHandler);
3389    Py_XDECREF(exc);
3390    return NULL;
3391}
3392
3393PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
3394					   Py_ssize_t size)
3395{
3396    PyObject *repr, *result;
3397    char *p;
3398    char *q;
3399
3400#ifdef Py_UNICODE_WIDE
3401    const Py_ssize_t expandsize = 10;
3402#else
3403    const Py_ssize_t expandsize = 6;
3404#endif
3405
3406    if (size > PY_SSIZE_T_MAX / expandsize)
3407	return PyErr_NoMemory();
3408
3409    repr = PyByteArray_FromStringAndSize(NULL, expandsize * size);
3410    if (repr == NULL)
3411        return NULL;
3412    if (size == 0)
3413        goto done;
3414
3415    p = q = PyByteArray_AS_STRING(repr);
3416    while (size-- > 0) {
3417        Py_UNICODE ch = *s++;
3418#ifdef Py_UNICODE_WIDE
3419	/* Map 32-bit characters to '\Uxxxxxxxx' */
3420	if (ch >= 0x10000) {
3421            *p++ = '\\';
3422            *p++ = 'U';
3423            *p++ = hexdigits[(ch >> 28) & 0xf];
3424            *p++ = hexdigits[(ch >> 24) & 0xf];
3425            *p++ = hexdigits[(ch >> 20) & 0xf];
3426            *p++ = hexdigits[(ch >> 16) & 0xf];
3427            *p++ = hexdigits[(ch >> 12) & 0xf];
3428            *p++ = hexdigits[(ch >> 8) & 0xf];
3429            *p++ = hexdigits[(ch >> 4) & 0xf];
3430            *p++ = hexdigits[ch & 15];
3431        }
3432        else
3433#else
3434	/* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3435	if (ch >= 0xD800 && ch < 0xDC00) {
3436	    Py_UNICODE ch2;
3437	    Py_UCS4 ucs;
3438
3439	    ch2 = *s++;
3440	    size--;
3441	    if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3442		ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3443		*p++ = '\\';
3444		*p++ = 'U';
3445		*p++ = hexdigits[(ucs >> 28) & 0xf];
3446		*p++ = hexdigits[(ucs >> 24) & 0xf];
3447		*p++ = hexdigits[(ucs >> 20) & 0xf];
3448		*p++ = hexdigits[(ucs >> 16) & 0xf];
3449		*p++ = hexdigits[(ucs >> 12) & 0xf];
3450		*p++ = hexdigits[(ucs >> 8) & 0xf];
3451		*p++ = hexdigits[(ucs >> 4) & 0xf];
3452		*p++ = hexdigits[ucs & 0xf];
3453		continue;
3454	    }
3455	    /* Fall through: isolated surrogates are copied as-is */
3456	    s--;
3457	    size++;
3458	}
3459#endif
3460	/* Map 16-bit characters to '\uxxxx' */
3461	if (ch >= 256) {
3462            *p++ = '\\';
3463            *p++ = 'u';
3464            *p++ = hexdigits[(ch >> 12) & 0xf];
3465            *p++ = hexdigits[(ch >> 8) & 0xf];
3466            *p++ = hexdigits[(ch >> 4) & 0xf];
3467            *p++ = hexdigits[ch & 15];
3468        }
3469	/* Copy everything else as-is */
3470	else
3471            *p++ = (char) ch;
3472    }
3473    size = p - q;
3474
3475  done:
3476    result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr), size);
3477    Py_DECREF(repr);
3478    return result;
3479}
3480
3481PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3482{
3483    PyObject *s, *result;
3484    if (!PyUnicode_Check(unicode)) {
3485        PyErr_BadArgument();
3486        return NULL;
3487    }
3488    s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3489                                         PyUnicode_GET_SIZE(unicode));
3490
3491    if (!s)
3492        return NULL;
3493    result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(s),
3494                                        PyByteArray_GET_SIZE(s));
3495    Py_DECREF(s);
3496    return result;
3497}
3498
3499/* --- Unicode Internal Codec ------------------------------------------- */
3500
3501PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
3502					   Py_ssize_t size,
3503					   const char *errors)
3504{
3505    const char *starts = s;
3506    Py_ssize_t startinpos;
3507    Py_ssize_t endinpos;
3508    Py_ssize_t outpos;
3509    PyUnicodeObject *v;
3510    Py_UNICODE *p;
3511    const char *end;
3512    const char *reason;
3513    PyObject *errorHandler = NULL;
3514    PyObject *exc = NULL;
3515
3516#ifdef Py_UNICODE_WIDE
3517    Py_UNICODE unimax = PyUnicode_GetMax();
3518#endif
3519
3520    /* XXX overflow detection missing */
3521    v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3522    if (v == NULL)
3523	goto onError;
3524    if (PyUnicode_GetSize((PyObject *)v) == 0)
3525	return (PyObject *)v;
3526    p = PyUnicode_AS_UNICODE(v);
3527    end = s + size;
3528
3529    while (s < end) {
3530        memcpy(p, s, sizeof(Py_UNICODE));
3531        /* We have to sanity check the raw data, otherwise doom looms for
3532           some malformed UCS-4 data. */
3533        if (
3534            #ifdef Py_UNICODE_WIDE
3535            *p > unimax || *p < 0 ||
3536            #endif
3537            end-s < Py_UNICODE_SIZE
3538            )
3539            {
3540            startinpos = s - starts;
3541            if (end-s < Py_UNICODE_SIZE) {
3542                endinpos = end-starts;
3543                reason = "truncated input";
3544            }
3545            else {
3546                endinpos = s - starts + Py_UNICODE_SIZE;
3547                reason = "illegal code point (> 0x10FFFF)";
3548            }
3549            outpos = p - PyUnicode_AS_UNICODE(v);
3550            if (unicode_decode_call_errorhandler(
3551                    errors, &errorHandler,
3552                    "unicode_internal", reason,
3553                    &starts, &end, &startinpos, &endinpos, &exc, &s,
3554                    (PyObject **)&v, &outpos, &p)) {
3555                goto onError;
3556            }
3557        }
3558        else {
3559            p++;
3560            s += Py_UNICODE_SIZE;
3561        }
3562    }
3563
3564    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3565        goto onError;
3566    Py_XDECREF(errorHandler);
3567    Py_XDECREF(exc);
3568    return (PyObject *)v;
3569
3570 onError:
3571    Py_XDECREF(v);
3572    Py_XDECREF(errorHandler);
3573    Py_XDECREF(exc);
3574    return NULL;
3575}
3576
3577/* --- Latin-1 Codec ------------------------------------------------------ */
3578
3579PyObject *PyUnicode_DecodeLatin1(const char *s,
3580				 Py_ssize_t size,
3581				 const char *errors)
3582{
3583    PyUnicodeObject *v;
3584    Py_UNICODE *p;
3585
3586    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3587    if (size == 1) {
3588	Py_UNICODE r = *(unsigned char*)s;
3589	return PyUnicode_FromUnicode(&r, 1);
3590    }
3591
3592    v = _PyUnicode_New(size);
3593    if (v == NULL)
3594	goto onError;
3595    if (size == 0)
3596	return (PyObject *)v;
3597    p = PyUnicode_AS_UNICODE(v);
3598    while (size-- > 0)
3599	*p++ = (unsigned char)*s++;
3600    return (PyObject *)v;
3601
3602 onError:
3603    Py_XDECREF(v);
3604    return NULL;
3605}
3606
3607/* create or adjust a UnicodeEncodeError */
3608static void make_encode_exception(PyObject **exceptionObject,
3609    const char *encoding,
3610    const Py_UNICODE *unicode, Py_ssize_t size,
3611    Py_ssize_t startpos, Py_ssize_t endpos,
3612    const char *reason)
3613{
3614    if (*exceptionObject == NULL) {
3615	*exceptionObject = PyUnicodeEncodeError_Create(
3616	    encoding, unicode, size, startpos, endpos, reason);
3617    }
3618    else {
3619	if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3620	    goto onError;
3621	if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3622	    goto onError;
3623	if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3624	    goto onError;
3625	return;
3626	onError:
3627	Py_DECREF(*exceptionObject);
3628	*exceptionObject = NULL;
3629    }
3630}
3631
3632/* raises a UnicodeEncodeError */
3633static void raise_encode_exception(PyObject **exceptionObject,
3634    const char *encoding,
3635    const Py_UNICODE *unicode, Py_ssize_t size,
3636    Py_ssize_t startpos, Py_ssize_t endpos,
3637    const char *reason)
3638{
3639    make_encode_exception(exceptionObject,
3640	encoding, unicode, size, startpos, endpos, reason);
3641    if (*exceptionObject != NULL)
3642	PyCodec_StrictErrors(*exceptionObject);
3643}
3644
3645/* error handling callback helper:
3646   build arguments, call the callback and check the arguments,
3647   put the result into newpos and return the replacement string, which
3648   has to be freed by the caller */
3649static PyObject *unicode_encode_call_errorhandler(const char *errors,
3650    PyObject **errorHandler,
3651    const char *encoding, const char *reason,
3652    const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3653    Py_ssize_t startpos, Py_ssize_t endpos,
3654    Py_ssize_t *newpos)
3655{
3656    static char *argparse = "O!n;encoding error handler must return (str, int) tuple";
3657
3658    PyObject *restuple;
3659    PyObject *resunicode;
3660
3661    if (*errorHandler == NULL) {
3662	*errorHandler = PyCodec_LookupError(errors);
3663        if (*errorHandler == NULL)
3664	    return NULL;
3665    }
3666
3667    make_encode_exception(exceptionObject,
3668	encoding, unicode, size, startpos, endpos, reason);
3669    if (*exceptionObject == NULL)
3670	return NULL;
3671
3672    restuple = PyObject_CallFunctionObjArgs(
3673	*errorHandler, *exceptionObject, NULL);
3674    if (restuple == NULL)
3675	return NULL;
3676    if (!PyTuple_Check(restuple)) {
3677	PyErr_Format(PyExc_TypeError, &argparse[4]);
3678	Py_DECREF(restuple);
3679	return NULL;
3680    }
3681    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3682	&resunicode, newpos)) {
3683	Py_DECREF(restuple);
3684	return NULL;
3685    }
3686    if (*newpos<0)
3687	*newpos = size+*newpos;
3688    if (*newpos<0 || *newpos>size) {
3689	PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3690	Py_DECREF(restuple);
3691	return NULL;
3692    }
3693    Py_INCREF(resunicode);
3694    Py_DECREF(restuple);
3695    return resunicode;
3696}
3697
3698static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
3699				 Py_ssize_t size,
3700				 const char *errors,
3701				 int limit)
3702{
3703    /* output object */
3704    PyObject *res;
3705    /* pointers to the beginning and end+1 of input */
3706    const Py_UNICODE *startp = p;
3707    const Py_UNICODE *endp = p + size;
3708    /* pointer to the beginning of the unencodable characters */
3709    /* const Py_UNICODE *badp = NULL; */
3710    /* pointer into the output */
3711    char *str;
3712    /* current output position */
3713    Py_ssize_t ressize;
3714    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3715    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3716    PyObject *errorHandler = NULL;
3717    PyObject *exc = NULL;
3718    PyObject *result = NULL;
3719    /* the following variable is used for caching string comparisons
3720     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3721    int known_errorHandler = -1;
3722
3723    /* allocate enough for a simple encoding without
3724       replacements, if we need more, we'll resize */
3725    if (size == 0)
3726        return PyBytes_FromStringAndSize(NULL, 0);
3727    res = PyByteArray_FromStringAndSize(NULL, size);
3728    if (res == NULL)
3729        return NULL;
3730    str = PyByteArray_AS_STRING(res);
3731    ressize = size;
3732
3733    while (p<endp) {
3734	Py_UNICODE c = *p;
3735
3736	/* can we encode this? */
3737	if (c<limit) {
3738	    /* no overflow check, because we know that the space is enough */
3739	    *str++ = (char)c;
3740	    ++p;
3741	}
3742	else {
3743	    Py_ssize_t unicodepos = p-startp;
3744	    Py_ssize_t requiredsize;
3745	    PyObject *repunicode;
3746	    Py_ssize_t repsize;
3747	    Py_ssize_t newpos;
3748	    Py_ssize_t respos;
3749	    Py_UNICODE *uni2;
3750	    /* startpos for collecting unencodable chars */
3751	    const Py_UNICODE *collstart = p;
3752	    const Py_UNICODE *collend = p;
3753	    /* find all unecodable characters */
3754	    while ((collend < endp) && ((*collend)>=limit))
3755		++collend;
3756	    /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3757	    if (known_errorHandler==-1) {
3758		if ((errors==NULL) || (!strcmp(errors, "strict")))
3759		    known_errorHandler = 1;
3760		else if (!strcmp(errors, "replace"))
3761		    known_errorHandler = 2;
3762		else if (!strcmp(errors, "ignore"))
3763		    known_errorHandler = 3;
3764		else if (!strcmp(errors, "xmlcharrefreplace"))
3765		    known_errorHandler = 4;
3766		else
3767		    known_errorHandler = 0;
3768	    }
3769	    switch (known_errorHandler) {
3770		case 1: /* strict */
3771		    raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3772		    goto onError;
3773		case 2: /* replace */
3774		    while (collstart++<collend)
3775			*str++ = '?'; /* fall through */
3776		case 3: /* ignore */
3777		    p = collend;
3778		    break;
3779		case 4: /* xmlcharrefreplace */
3780		    respos = str - PyByteArray_AS_STRING(res);
3781		    /* determine replacement size (temporarily (mis)uses p) */
3782		    for (p = collstart, repsize = 0; p < collend; ++p) {
3783			if (*p<10)
3784			    repsize += 2+1+1;
3785			else if (*p<100)
3786			    repsize += 2+2+1;
3787			else if (*p<1000)
3788			    repsize += 2+3+1;
3789			else if (*p<10000)
3790			    repsize += 2+4+1;
3791#ifndef Py_UNICODE_WIDE
3792			else
3793			    repsize += 2+5+1;
3794#else
3795			else if (*p<100000)
3796			    repsize += 2+5+1;
3797			else if (*p<1000000)
3798			    repsize += 2+6+1;
3799			else
3800			    repsize += 2+7+1;
3801#endif
3802		    }
3803		    requiredsize = respos+repsize+(endp-collend);
3804		    if (requiredsize > ressize) {
3805			if (requiredsize<2*ressize)
3806			    requiredsize = 2*ressize;
3807			if (PyByteArray_Resize(res, requiredsize))
3808			    goto onError;
3809			str = PyByteArray_AS_STRING(res) + respos;
3810			ressize = requiredsize;
3811		    }
3812		    /* generate replacement (temporarily (mis)uses p) */
3813		    for (p = collstart; p < collend; ++p) {
3814			str += sprintf(str, "&#%d;", (int)*p);
3815		    }
3816		    p = collend;
3817		    break;
3818		default:
3819		    repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3820			encoding, reason, startp, size, &exc,
3821			collstart-startp, collend-startp, &newpos);
3822		    if (repunicode == NULL)
3823			goto onError;
3824		    /* need more space? (at least enough for what we
3825		       have+the replacement+the rest of the string, so
3826		       we won't have to check space for encodable characters) */
3827		    respos = str - PyByteArray_AS_STRING(res);
3828		    repsize = PyUnicode_GET_SIZE(repunicode);
3829		    requiredsize = respos+repsize+(endp-collend);
3830		    if (requiredsize > ressize) {
3831			if (requiredsize<2*ressize)
3832			    requiredsize = 2*ressize;
3833			if (PyByteArray_Resize(res, requiredsize)) {
3834			    Py_DECREF(repunicode);
3835			    goto onError;
3836			}
3837			str = PyByteArray_AS_STRING(res) + respos;
3838			ressize = requiredsize;
3839		    }
3840		    /* check if there is anything unencodable in the replacement
3841		       and copy it to the output */
3842		    for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3843			c = *uni2;
3844			if (c >= limit) {
3845			    raise_encode_exception(&exc, encoding, startp, size,
3846				unicodepos, unicodepos+1, reason);
3847			    Py_DECREF(repunicode);
3848			    goto onError;
3849			}
3850			*str = (char)c;
3851		    }
3852		    p = startp + newpos;
3853		    Py_DECREF(repunicode);
3854	    }
3855	}
3856    }
3857    result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(res),
3858                                        str - PyByteArray_AS_STRING(res));
3859  onError:
3860    Py_DECREF(res);
3861    Py_XDECREF(errorHandler);
3862    Py_XDECREF(exc);
3863    return result;
3864}
3865
3866PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
3867				 Py_ssize_t size,
3868				 const char *errors)
3869{
3870    return unicode_encode_ucs1(p, size, errors, 256);
3871}
3872
3873PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3874{
3875    if (!PyUnicode_Check(unicode)) {
3876	PyErr_BadArgument();
3877	return NULL;
3878    }
3879    return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3880				  PyUnicode_GET_SIZE(unicode),
3881				  NULL);
3882}
3883
3884/* --- 7-bit ASCII Codec -------------------------------------------------- */
3885
3886PyObject *PyUnicode_DecodeASCII(const char *s,
3887				Py_ssize_t size,
3888				const char *errors)
3889{
3890    const char *starts = s;
3891    PyUnicodeObject *v;
3892    Py_UNICODE *p;
3893    Py_ssize_t startinpos;
3894    Py_ssize_t endinpos;
3895    Py_ssize_t outpos;
3896    const char *e;
3897    PyObject *errorHandler = NULL;
3898    PyObject *exc = NULL;
3899
3900    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3901    if (size == 1 && *(unsigned char*)s < 128) {
3902	Py_UNICODE r = *(unsigned char*)s;
3903	return PyUnicode_FromUnicode(&r, 1);
3904    }
3905
3906    v = _PyUnicode_New(size);
3907    if (v == NULL)
3908	goto onError;
3909    if (size == 0)
3910	return (PyObject *)v;
3911    p = PyUnicode_AS_UNICODE(v);
3912    e = s + size;
3913    while (s < e) {
3914	register unsigned char c = (unsigned char)*s;
3915	if (c < 128) {
3916	    *p++ = c;
3917	    ++s;
3918	}
3919	else {
3920	    startinpos = s-starts;
3921	    endinpos = startinpos + 1;
3922	    outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3923	    if (unicode_decode_call_errorhandler(
3924		 errors, &errorHandler,
3925		 "ascii", "ordinal not in range(128)",
3926		 &starts, &e, &startinpos, &endinpos, &exc, &s,
3927		 (PyObject **)&v, &outpos, &p))
3928		goto onError;
3929	}
3930    }
3931    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
3932	if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3933	    goto onError;
3934    Py_XDECREF(errorHandler);
3935    Py_XDECREF(exc);
3936    return (PyObject *)v;
3937
3938 onError:
3939    Py_XDECREF(v);
3940    Py_XDECREF(errorHandler);
3941    Py_XDECREF(exc);
3942    return NULL;
3943}
3944
3945PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
3946				Py_ssize_t size,
3947				const char *errors)
3948{
3949    return unicode_encode_ucs1(p, size, errors, 128);
3950}
3951
3952PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3953{
3954    if (!PyUnicode_Check(unicode)) {
3955	PyErr_BadArgument();
3956	return NULL;
3957    }
3958    return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3959				 PyUnicode_GET_SIZE(unicode),
3960				 NULL);
3961}
3962
3963#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3964
3965/* --- MBCS codecs for Windows -------------------------------------------- */
3966
3967#if SIZEOF_INT < SIZEOF_SSIZE_T
3968#define NEED_RETRY
3969#endif
3970
3971/* XXX This code is limited to "true" double-byte encodings, as
3972   a) it assumes an incomplete character consists of a single byte, and
3973   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3974      encodings, see IsDBCSLeadByteEx documentation. */
3975
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.

   A byte that IsDBCSLeadByte() accepts only counts when the character
   before it (found with CharPrev) does not claim it as a trail byte:
   there is no previous character, the previous character does not start
   with a lead byte, or the previous character is a complete two-byte
   character. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3986
3987/*
3988 * Decode MBCS string into unicode object. If 'final' is set, converts
3989 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3990 */
3991static int decode_mbcs(PyUnicodeObject **v,
3992			const char *s, /* MBCS string */
3993			int size, /* sizeof MBCS string */
3994			int final)
3995{
3996    Py_UNICODE *p;
3997    Py_ssize_t n = 0;
3998    int usize = 0;
3999
4000    assert(size >= 0);
4001
4002    /* Skip trailing lead-byte unless 'final' is set */
4003    if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
4004	--size;
4005
4006    /* First get the size of the result */
4007    if (size > 0) {
4008	usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
4009	if (usize == 0) {
4010	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
4011	    return -1;
4012	}
4013    }
4014
4015    if (*v == NULL) {
4016	/* Create unicode object */
4017	*v = _PyUnicode_New(usize);
4018	if (*v == NULL)
4019	    return -1;
4020    }
4021    else {
4022	/* Extend unicode object */
4023	n = PyUnicode_GET_SIZE(*v);
4024	if (_PyUnicode_Resize(v, n + usize) < 0)
4025	    return -1;
4026    }
4027
4028    /* Do the conversion */
4029    if (size > 0) {
4030	p = PyUnicode_AS_UNICODE(*v) + n;
4031	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
4032	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
4033	    return -1;
4034	}
4035    }
4036
4037    return size;
4038}
4039
4040PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
4041					Py_ssize_t size,
4042					const char *errors,
4043					Py_ssize_t *consumed)
4044{
4045    PyUnicodeObject *v = NULL;
4046    int done;
4047
4048    if (consumed)
4049	*consumed = 0;
4050
4051#ifdef NEED_RETRY
4052  retry:
4053    if (size > INT_MAX)
4054	done = decode_mbcs(&v, s, INT_MAX, 0);
4055    else
4056#endif
4057	done = decode_mbcs(&v, s, (int)size, !consumed);
4058
4059    if (done < 0) {
4060        Py_XDECREF(v);
4061	return NULL;
4062    }
4063
4064    if (consumed)
4065	*consumed += done;
4066
4067#ifdef NEED_RETRY
4068    if (size > INT_MAX) {
4069	s += done;
4070	size -= done;
4071	goto retry;
4072    }
4073#endif
4074
4075    return (PyObject *)v;
4076}
4077
4078PyObject *PyUnicode_DecodeMBCS(const char *s,
4079				Py_ssize_t size,
4080				const char *errors)
4081{
4082    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4083}
4084
4085/*
4086 * Convert unicode into string object (MBCS).
4087 * Returns 0 if succeed, -1 otherwise.
4088 */
4089static int encode_mbcs(PyObject **repr,
4090			const Py_UNICODE *p, /* unicode */
4091			int size) /* size of unicode */
4092{
4093    int mbcssize = 0;
4094    Py_ssize_t n = 0;
4095
4096    assert(size >= 0);
4097
4098    /* First get the size of the result */
4099    if (size > 0) {
4100	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4101	if (mbcssize == 0) {
4102	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
4103	    return -1;
4104	}
4105    }
4106
4107    if (*repr == NULL) {
4108	/* Create string object */
4109	*repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4110	if (*repr == NULL)
4111	    return -1;
4112    }
4113    else {
4114	/* Extend string object */
4115	n = PyBytes_Size(*repr);
4116	if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4117	    return -1;
4118    }
4119
4120    /* Do the conversion */
4121    if (size > 0) {
4122	char *s = PyBytes_AS_STRING(*repr) + n;
4123	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4124	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
4125	    return -1;
4126	}
4127    }
4128
4129    return 0;
4130}
4131
4132PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
4133				Py_ssize_t size,
4134				const char *errors)
4135{
4136    PyObject *repr = NULL;
4137    int ret;
4138
4139#ifdef NEED_RETRY
4140 retry:
4141    if (size > INT_MAX)
4142	ret = encode_mbcs(&repr, p, INT_MAX);
4143    else
4144#endif
4145	ret = encode_mbcs(&repr, p, (int)size);
4146
4147    if (ret < 0) {
4148	Py_XDECREF(repr);
4149	return NULL;
4150    }
4151
4152#ifdef NEED_RETRY
4153    if (size > INT_MAX) {
4154	p += INT_MAX;
4155	size -= INT_MAX;
4156	goto retry;
4157    }
4158#endif
4159
4160    return repr;
4161}
4162
4163PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4164{
4165    if (!PyUnicode_Check(unicode)) {
4166        PyErr_BadArgument();
4167        return NULL;
4168    }
4169    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
4170				PyUnicode_GET_SIZE(unicode),
4171				NULL);
4172}
4173
4174#undef NEED_RETRY
4175
4176#endif /* MS_WINDOWS */
4177
4178/* --- Character Mapping Codec -------------------------------------------- */
4179
/* Decode 'size' bytes at 's' through a character mapping.

   mapping == NULL      -> the input is decoded as Latin-1.
   mapping is exact str -> each byte indexes the string directly; a byte
                           beyond the string's length, or one that maps to
                           0xfffe, is treated as undefined.
   any other object     -> mapping[byte] is looked up with PyObject_GetItem;
                           the item may be an int, a str (of any length) or
                           None (undefined).  A LookupError is treated like
                           None.

   Undefined bytes are passed to the error handler selected by 'errors'.
   Returns a new unicode object, or NULL on error. */
PyObject *PyUnicode_DecodeCharmap(const char *s,
				  Py_ssize_t size,
				  PyObject *mapping,
				  const char *errors)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *v;
    Py_UNICODE *p;
    /* spare output slots gained from earlier resizes in the 1-n path */
    Py_ssize_t extrachars = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    Py_UNICODE *mapstring = NULL;
    Py_ssize_t maplen = 0;

    /* Default to Latin-1 */
    if (mapping == NULL)
	return PyUnicode_DecodeLatin1(s, size, errors);

    /* start with one output character per input byte */
    v = _PyUnicode_New(size);
    if (v == NULL)
	goto onError;
    if (size == 0)
	return (PyObject *)v;
    p = PyUnicode_AS_UNICODE(v);
    e = s + size;
    if (PyUnicode_CheckExact(mapping)) {
	/* fast path: the mapping is a plain string used as a lookup table */
	mapstring = PyUnicode_AS_UNICODE(mapping);
	maplen = PyUnicode_GET_SIZE(mapping);
	while (s < e) {
	    unsigned char ch = *s;
	    Py_UNICODE x = 0xfffe; /* illegal value */

	    if (ch < maplen)
		x = mapstring[ch];

	    if (x == 0xfffe) {
		/* undefined mapping */
		outpos = p-PyUnicode_AS_UNICODE(v);
		startinpos = s-starts;
		endinpos = startinpos+1;
		/* the handler receives the addresses of s, p and v and
		   may update them, so just loop again on success */
		if (unicode_decode_call_errorhandler(
		     errors, &errorHandler,
		     "charmap", "character maps to <undefined>",
		     &starts, &e, &startinpos, &endinpos, &exc, &s,
		     (PyObject **)&v, &outpos, &p)) {
		    goto onError;
		}
		continue;
	    }
	    *p++ = x;
	    ++s;
	}
    }
    else {
	/* generic path: look every byte up in an arbitrary mapping object */
	while (s < e) {
	    unsigned char ch = *s;
	    PyObject *w, *x;

	    /* Get mapping (char ordinal -> integer, Unicode char or None) */
	    w = PyLong_FromLong((long)ch);
	    if (w == NULL)
		goto onError;
	    x = PyObject_GetItem(mapping, w);
	    Py_DECREF(w);
	    if (x == NULL) {
		if (PyErr_ExceptionMatches(PyExc_LookupError)) {
		    /* No mapping found means: mapping is undefined. */
		    PyErr_Clear();
		    x = Py_None;
		    Py_INCREF(x);
		} else
		    goto onError;
	    }

	    /* Apply mapping */
	    if (PyLong_Check(x)) {
		/* integer: a single code point, limited to 16 bits here */
		long value = PyLong_AS_LONG(x);
		if (value < 0 || value > 65535) {
		    PyErr_SetString(PyExc_TypeError,
				    "character mapping must be in range(65536)");
		    Py_DECREF(x);
		    goto onError;
		}
		*p++ = (Py_UNICODE)value;
	    }
	    else if (x == Py_None) {
		/* undefined mapping */
		outpos = p-PyUnicode_AS_UNICODE(v);
		startinpos = s-starts;
		endinpos = startinpos+1;
		if (unicode_decode_call_errorhandler(
		     errors, &errorHandler,
		     "charmap", "character maps to <undefined>",
		     &starts, &e, &startinpos, &endinpos, &exc, &s,
		     (PyObject **)&v, &outpos, &p)) {
		    Py_DECREF(x);
		    goto onError;
		}
		Py_DECREF(x);
		continue;
	    }
	    else if (PyUnicode_Check(x)) {
		Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);

		if (targetsize == 1)
		    /* 1-1 mapping */
		    *p++ = *PyUnicode_AS_UNICODE(x);

		else if (targetsize > 1) {
		    /* 1-n mapping */
		    if (targetsize > extrachars) {
			/* resize first */
			/* remember the write offset: the buffer may move */
			Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
			/* grow by the shortfall plus 4*targetsize of slack */
			Py_ssize_t needed = (targetsize - extrachars) + \
				     (targetsize << 2);
			extrachars += needed;
			/* XXX overflow detection missing */
			if (_PyUnicode_Resize(&v,
					     PyUnicode_GET_SIZE(v) + needed) < 0) {
			    Py_DECREF(x);
			    goto onError;
			}
			p = PyUnicode_AS_UNICODE(v) + oldpos;
		    }
		    Py_UNICODE_COPY(p,
				    PyUnicode_AS_UNICODE(x),
				    targetsize);
		    p += targetsize;
		    extrachars -= targetsize;
		}
		/* 1-0 mapping: skip the character */
	    }
	    else {
		/* wrong return value */
		PyErr_SetString(PyExc_TypeError,
		      "character mapping must return integer, None or str");
		Py_DECREF(x);
		goto onError;
	    }
	    Py_DECREF(x);
	    ++s;
	}
    }
    /* trim the result down to the number of characters actually written */
    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
	if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
	    goto onError;
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)v;

 onError:
    /* common failure exit: release everything acquired so far */
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
}
4340
4341/* Charmap encoding: the lookup table */
4342
/* Three-level trie keyed on a 16-bit code point: bits 15-11 select a
   level-1 entry, bits 10-7 an entry in a 16-byte level-2 block, and
   bits 6-0 an entry in a 128-byte level-3 block holding the byte value. */
struct encoding_map{
  PyObject_HEAD
  /* indexed by c>>11; 0xFF marks an entry with no level-2 block */
  unsigned char level1[32];
  /* number of level-2 blocks and level-3 blocks stored in level23 */
  int count2, count3;
  /* variable-length tail (declared [1], allocated larger): 16*count2 bytes
     of level-2 data followed by 128*count3 bytes of level-3 data */
  unsigned char level23[1];
};
4349
4350static PyObject*
4351encoding_map_size(PyObject *obj, PyObject* args)
4352{
4353    struct encoding_map *map = (struct encoding_map*)obj;
4354    return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4355                          128*map->count3);
4356}
4357
/* Method table for EncodingMap objects; it exposes a single method,
   size(), which takes no arguments. */
static PyMethodDef encoding_map_methods[] = {
	{"size", encoding_map_size, METH_NOARGS,
         PyDoc_STR("Return the size (in bytes) of this object") },
        /* zero-filled sentinel terminating the table */
        { 0 }
};
4363
4364static void
4365encoding_map_dealloc(PyObject* o)
4366{
4367	PyObject_FREE(o);
4368}
4369
/* Type object for the trie built by PyUnicode_BuildEncodingMap().
   Only tp_dealloc and tp_methods are populated; all other slots are 0,
   and tp_itemsize is 0 even though instances carry a variable-length
   tail (they are allocated by hand in PyUnicode_BuildEncodingMap). */
static PyTypeObject EncodingMapType = {
	PyVarObject_HEAD_INIT(NULL, 0)
        "EncodingMap",          /*tp_name*/
        sizeof(struct encoding_map),   /*tp_basicsize*/
        0,                      /*tp_itemsize*/
        /* methods */
        encoding_map_dealloc,   /*tp_dealloc*/
        0,                      /*tp_print*/
        0,                      /*tp_getattr*/
        0,                      /*tp_setattr*/
        0,                      /*tp_compare*/
        0,                      /*tp_repr*/
        0,                      /*tp_as_number*/
        0,                      /*tp_as_sequence*/
        0,                      /*tp_as_mapping*/
        0,                      /*tp_hash*/
        0,                      /*tp_call*/
        0,                      /*tp_str*/
        0,                      /*tp_getattro*/
        0,                      /*tp_setattro*/
        0,                      /*tp_as_buffer*/
        Py_TPFLAGS_DEFAULT,     /*tp_flags*/
        0,                      /*tp_doc*/
        0,                      /*tp_traverse*/
        0,                      /*tp_clear*/
        0,                      /*tp_richcompare*/
        0,                      /*tp_weaklistoffset*/
        0,                      /*tp_iter*/
        0,                      /*tp_iternext*/
        encoding_map_methods,   /*tp_methods*/
        0,                      /*tp_members*/
        0,                      /*tp_getset*/
        0,                      /*tp_base*/
        0,                      /*tp_dict*/
        0,                      /*tp_descr_get*/
        0,                      /*tp_descr_set*/
        0,                      /*tp_dictoffset*/
        0,                      /*tp_init*/
        0,                      /*tp_alloc*/
        0,                      /*tp_new*/
        0,                      /*tp_free*/
        0,                      /*tp_is_gc*/
};
4413
4414PyObject*
4415PyUnicode_BuildEncodingMap(PyObject* string)
4416{
4417    Py_UNICODE *decode;
4418    PyObject *result;
4419    struct encoding_map *mresult;
4420    int i;
4421    int need_dict = 0;
4422    unsigned char level1[32];
4423    unsigned char level2[512];
4424    unsigned char *mlevel1, *mlevel2, *mlevel3;
4425    int count2 = 0, count3 = 0;
4426
4427    if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4428        PyErr_BadArgument();
4429        return NULL;
4430    }
4431    decode = PyUnicode_AS_UNICODE(string);
4432    memset(level1, 0xFF, sizeof level1);
4433    memset(level2, 0xFF, sizeof level2);
4434
4435    /* If there isn't a one-to-one mapping of NULL to \0,
4436       or if there are non-BMP characters, we need to use
4437       a mapping dictionary. */
4438    if (decode[0] != 0)
4439        need_dict = 1;
4440    for (i = 1; i < 256; i++) {
4441        int l1, l2;
4442        if (decode[i] == 0
4443            #ifdef Py_UNICODE_WIDE
4444            || decode[i] > 0xFFFF
4445            #endif
4446        ) {
4447            need_dict = 1;
4448            break;
4449        }
4450        if (decode[i] == 0xFFFE)
4451            /* unmapped character */
4452            continue;
4453        l1 = decode[i] >> 11;
4454        l2 = decode[i] >> 7;
4455        if (level1[l1] == 0xFF)
4456            level1[l1] = count2++;
4457        if (level2[l2] == 0xFF)
4458            level2[l2] = count3++;
4459    }
4460
4461    if (count2 >= 0xFF || count3 >= 0xFF)
4462        need_dict = 1;
4463
4464    if (need_dict) {
4465        PyObject *result = PyDict_New();
4466        PyObject *key, *value;
4467        if (!result)
4468            return NULL;
4469        for (i = 0; i < 256; i++) {
4470            key = value = NULL;
4471            key = PyLong_FromLong(decode[i]);
4472            value = PyLong_FromLong(i);
4473            if (!key || !value)
4474                goto failed1;
4475            if (PyDict_SetItem(result, key, value) == -1)
4476                goto failed1;
4477            Py_DECREF(key);
4478            Py_DECREF(value);
4479        }
4480        return result;
4481      failed1:
4482        Py_XDECREF(key);
4483        Py_XDECREF(value);
4484        Py_DECREF(result);
4485        return NULL;
4486    }
4487
4488    /* Create a three-level trie */
4489    result = PyObject_MALLOC(sizeof(struct encoding_map) +
4490                             16*count2 + 128*count3 - 1);
4491    if (!result)
4492        return PyErr_NoMemory();
4493    PyObject_Init(result, &EncodingMapType);
4494    mresult = (struct encoding_map*)result;
4495    mresult->count2 = count2;
4496    mresult->count3 = count3;
4497    mlevel1 = mresult->level1;
4498    mlevel2 = mresult->level23;
4499    mlevel3 = mresult->level23 + 16*count2;
4500    memcpy(mlevel1, level1, 32);
4501    memset(mlevel2, 0xFF, 16*count2);
4502    memset(mlevel3, 0, 128*count3);
4503    count3 = 0;
4504    for (i = 1; i < 256; i++) {
4505        int o1, o2, o3, i2, i3;
4506        if (decode[i] == 0xFFFE)
4507            /* unmapped character */
4508            continue;
4509        o1 = decode[i]>>11;
4510        o2 = (decode[i]>>7) & 0xF;
4511        i2 = 16*mlevel1[o1] + o2;
4512        if (mlevel2[i2] == 0xFF)
4513            mlevel2[i2] = count3++;
4514        o3 = decode[i] & 0x7F;
4515        i3 = 128*mlevel2[i2] + o3;
4516        mlevel3[i3] = i;
4517    }
4518    return result;
4519}
4520
4521static int
4522encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4523{
4524    struct encoding_map *map = (struct encoding_map*)mapping;
4525    int l1 = c>>11;
4526    int l2 = (c>>7) & 0xF;
4527    int l3 = c & 0x7F;
4528    int i;
4529
4530#ifdef Py_UNICODE_WIDE
4531    if (c > 0xFFFF) {
4532	return -1;
4533    }
4534#endif
4535    if (c == 0)
4536        return 0;
4537    /* level 1*/
4538    i = map->level1[l1];
4539    if (i == 0xFF) {
4540        return -1;
4541    }
4542    /* level 2*/
4543    i = map->level23[16*i+l2];
4544    if (i == 0xFF) {
4545        return -1;
4546    }
4547    /* level 3 */
4548    i = map->level23[16*map->count2 + 128*i + l3];
4549    if (i == 0) {
4550        return -1;
4551    }
4552    return i;
4553}
4554
4555/* Lookup the character ch in the mapping. If the character
4556   can't be found, Py_None is returned (or NULL, if another
4557   error occurred). */
4558static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
4559{
4560    PyObject *w = PyLong_FromLong((long)c);
4561    PyObject *x;
4562
4563    if (w == NULL)
4564	 return NULL;
4565    x = PyObject_GetItem(mapping, w);
4566    Py_DECREF(w);
4567    if (x == NULL) {
4568	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4569	    /* No mapping found means: mapping is undefined. */
4570	    PyErr_Clear();
4571	    x = Py_None;
4572	    Py_INCREF(x);
4573	    return x;
4574	} else
4575	    return NULL;
4576    }
4577    else if (x == Py_None)
4578	return x;
4579    else if (PyLong_Check(x)) {
4580	long value = PyLong_AS_LONG(x);
4581	if (value < 0 || value > 255) {
4582	    PyErr_SetString(PyExc_TypeError,
4583			     "character mapping must be in range(256)");
4584	    Py_DECREF(x);
4585	    return NULL;
4586	}
4587	return x;
4588    }
4589    else if (PyBytes_Check(x))
4590	return x;
4591    else {
4592	/* wrong return value */
4593	PyErr_Format(PyExc_TypeError,
4594                "character mapping must return integer, bytes or None, not %.400s",
4595                x->ob_type->tp_name);
4596	Py_DECREF(x);
4597	return NULL;
4598    }
4599}
4600
4601static int
4602charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4603{
4604	Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
4605	/* exponentially overallocate to minimize reallocations */
4606	if (requiredsize < 2*outsize)
4607	    requiredsize = 2*outsize;
4608	if (_PyBytes_Resize(outobj, requiredsize))
4609	    return -1;
4610	return 0;
4611}
4612
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
  enc_SUCCESS,    /* the character was encoded and written to the output */
  enc_FAILED,     /* the mapping has no entry for the character */
  enc_EXCEPTION   /* the lookup or an output resize failed */
}charmapencode_result;
4616/* lookup the character, put the result in the output string and adjust
4617   various state variables. Resize the output bytes object if not enough
4618   space is available. Return a new reference to the object that
4619   was put in the output buffer, or Py_None, if the mapping was undefined
4620   (in which case no character was written) or NULL, if a
4621   reallocation error occurred. The caller must decref the result */
4622static
4623charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
4624    PyObject **outobj, Py_ssize_t *outpos)
4625{
4626    PyObject *rep;
4627    char *outstart;
4628    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
4629
4630    if (Py_TYPE(mapping) == &EncodingMapType) {
4631        int res = encoding_map_lookup(c, mapping);
4632	Py_ssize_t requiredsize = *outpos+1;
4633        if (res == -1)
4634            return enc_FAILED;
4635	if (outsize<requiredsize)
4636	    if (charmapencode_resize(outobj, outpos, requiredsize))
4637		return enc_EXCEPTION;
4638        outstart = PyBytes_AS_STRING(*outobj);
4639	outstart[(*outpos)++] = (char)res;
4640	return enc_SUCCESS;
4641    }
4642
4643    rep = charmapencode_lookup(c, mapping);
4644    if (rep==NULL)
4645	return enc_EXCEPTION;
4646    else if (rep==Py_None) {
4647	Py_DECREF(rep);
4648	return enc_FAILED;
4649    } else {
4650	if (PyLong_Check(rep)) {
4651	    Py_ssize_t requiredsize = *outpos+1;
4652	    if (outsize<requiredsize)
4653		if (charmapencode_resize(outobj, outpos, requiredsize)) {
4654		    Py_DECREF(rep);
4655		    return enc_EXCEPTION;
4656		}
4657            outstart = PyBytes_AS_STRING(*outobj);
4658	    outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
4659	}
4660	else {
4661	    const char *repchars = PyBytes_AS_STRING(rep);
4662	    Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
4663	    Py_ssize_t requiredsize = *outpos+repsize;
4664	    if (outsize<requiredsize)
4665		if (charmapencode_resize(outobj, outpos, requiredsize)) {
4666		    Py_DECREF(rep);
4667		    return enc_EXCEPTION;
4668		}
4669            outstart = PyBytes_AS_STRING(*outobj);
4670	    memcpy(outstart + *outpos, repchars, repsize);
4671	    *outpos += repsize;
4672	}
4673    }
4674    Py_DECREF(rep);
4675    return enc_SUCCESS;
4676}
4677
4678/* handle an error in PyUnicode_EncodeCharmap
4679   Return 0 on success, -1 on error */
4680static
4681int charmap_encoding_error(
4682    const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
4683    PyObject **exceptionObject,
4684    int *known_errorHandler, PyObject **errorHandler, const char *errors,
4685    PyObject **res, Py_ssize_t *respos)
4686{
4687    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4688    Py_ssize_t repsize;
4689    Py_ssize_t newpos;
4690    Py_UNICODE *uni2;
4691    /* startpos for collecting unencodable chars */
4692    Py_ssize_t collstartpos = *inpos;
4693    Py_ssize_t collendpos = *inpos+1;
4694    Py_ssize_t collpos;
4695    char *encoding = "charmap";
4696    char *reason = "character maps to <undefined>";
4697    charmapencode_result x;
4698
4699    /* find all unencodable characters */
4700    while (collendpos < size) {
4701        PyObject *rep;
4702        if (Py_TYPE(mapping) == &EncodingMapType) {
4703	    int res = encoding_map_lookup(p[collendpos], mapping);
4704	    if (res != -1)
4705		break;
4706	    ++collendpos;
4707	    continue;
4708	}
4709
4710	rep = charmapencode_lookup(p[collendpos], mapping);
4711	if (rep==NULL)
4712	    return -1;
4713	else if (rep!=Py_None) {
4714	    Py_DECREF(rep);
4715	    break;
4716	}
4717	Py_DECREF(rep);
4718	++collendpos;
4719    }
4720    /* cache callback name lookup
4721     * (if not done yet, i.e. it's the first error) */
4722    if (*known_errorHandler==-1) {
4723	if ((errors==NULL) || (!strcmp(errors, "strict")))
4724	    *known_errorHandler = 1;
4725	else if (!strcmp(errors, "replace"))
4726	    *known_errorHandler = 2;
4727	else if (!strcmp(errors, "ignore"))
4728	    *known_errorHandler = 3;
4729	else if (!strcmp(errors, "xmlcharrefreplace"))
4730	    *known_errorHandler = 4;
4731	else
4732	    *known_errorHandler = 0;
4733    }
4734    switch (*known_errorHandler) {
4735	case 1: /* strict */
4736	    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4737	    return -1;
4738	case 2: /* replace */
4739	    for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4740		x = charmapencode_output('?', mapping, res, respos);
4741		if (x==enc_EXCEPTION) {
4742		    return -1;
4743		}
4744		else if (x==enc_FAILED) {
4745		    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4746		    return -1;
4747		}
4748	    }
4749	    /* fall through */
4750	case 3: /* ignore */
4751	    *inpos = collendpos;
4752	    break;
4753	case 4: /* xmlcharrefreplace */
4754	    /* generate replacement (temporarily (mis)uses p) */
4755	    for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4756		char buffer[2+29+1+1];
4757		char *cp;
4758		sprintf(buffer, "&#%d;", (int)p[collpos]);
4759		for (cp = buffer; *cp; ++cp) {
4760		    x = charmapencode_output(*cp, mapping, res, respos);
4761		    if (x==enc_EXCEPTION)
4762			return -1;
4763		    else if (x==enc_FAILED) {
4764			raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4765			return -1;
4766		    }
4767		}
4768	    }
4769	    *inpos = collendpos;
4770	    break;
4771	default:
4772	    repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
4773		encoding, reason, p, size, exceptionObject,
4774		collstartpos, collendpos, &newpos);
4775	    if (repunicode == NULL)
4776		return -1;
4777	    /* generate replacement  */
4778	    repsize = PyUnicode_GET_SIZE(repunicode);
4779	    for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4780		x = charmapencode_output(*uni2, mapping, res, respos);
4781		if (x==enc_EXCEPTION) {
4782		    return -1;
4783		}
4784		else if (x==enc_FAILED) {
4785		    Py_DECREF(repunicode);
4786		    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4787		    return -1;
4788		}
4789	    }
4790	    *inpos = newpos;
4791	    Py_DECREF(repunicode);
4792    }
4793    return 0;
4794}
4795
4796PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
4797				  Py_ssize_t size,
4798				  PyObject *mapping,
4799				  const char *errors)
4800{
4801    /* output object */
4802    PyObject *res = NULL;
4803    /* current input position */
4804    Py_ssize_t inpos = 0;
4805    /* current output position */
4806    Py_ssize_t respos = 0;
4807    PyObject *errorHandler = NULL;
4808    PyObject *exc = NULL;
4809    /* the following variable is used for caching string comparisons
4810     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4811     * 3=ignore, 4=xmlcharrefreplace */
4812    int known_errorHandler = -1;
4813
4814    /* Default to Latin-1 */
4815    if (mapping == NULL)
4816	return PyUnicode_EncodeLatin1(p, size, errors);
4817
4818    /* allocate enough for a simple encoding without
4819       replacements, if we need more, we'll resize */
4820    res = PyBytes_FromStringAndSize(NULL, size);
4821    if (res == NULL)
4822        goto onError;
4823    if (size == 0)
4824	return res;
4825
4826    while (inpos<size) {
4827	/* try to encode it */
4828	charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4829	if (x==enc_EXCEPTION) /* error */
4830	    goto onError;
4831	if (x==enc_FAILED) { /* unencodable character */
4832	    if (charmap_encoding_error(p, size, &inpos, mapping,
4833		&exc,
4834		&known_errorHandler, &errorHandler, errors,
4835		&res, &respos)) {
4836		goto onError;
4837	    }
4838	}
4839	else
4840	    /* done with this character => adjust input position */
4841	    ++inpos;
4842    }
4843
4844    /* Resize if we allocated to much */
4845    if (respos<PyBytes_GET_SIZE(res))
4846	_PyBytes_Resize(&res, respos);
4847
4848    Py_XDECREF(exc);
4849    Py_XDECREF(errorHandler);
4850    return res;
4851
4852    onError:
4853    Py_XDECREF(res);
4854    Py_XDECREF(exc);
4855    Py_XDECREF(errorHandler);
4856    return NULL;
4857}
4858
4859PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4860				    PyObject *mapping)
4861{
4862    if (!PyUnicode_Check(unicode) || mapping == NULL) {
4863	PyErr_BadArgument();
4864	return NULL;
4865    }
4866    return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4867				   PyUnicode_GET_SIZE(unicode),
4868				   mapping,
4869				   NULL);
4870}
4871
4872/* create or adjust a UnicodeTranslateError */
4873static void make_translate_exception(PyObject **exceptionObject,
4874    const Py_UNICODE *unicode, Py_ssize_t size,
4875    Py_ssize_t startpos, Py_ssize_t endpos,
4876    const char *reason)
4877{
4878    if (*exceptionObject == NULL) {
4879    	*exceptionObject = PyUnicodeTranslateError_Create(
4880	    unicode, size, startpos, endpos, reason);
4881    }
4882    else {
4883	if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4884	    goto onError;
4885	if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4886	    goto onError;
4887	if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4888	    goto onError;
4889	return;
4890	onError:
4891	Py_DECREF(*exceptionObject);
4892	*exceptionObject = NULL;
4893    }
4894}
4895
4896/* raises a UnicodeTranslateError */
4897static void raise_translate_exception(PyObject **exceptionObject,
4898    const Py_UNICODE *unicode, Py_ssize_t size,
4899    Py_ssize_t startpos, Py_ssize_t endpos,
4900    const char *reason)
4901{
4902    make_translate_exception(exceptionObject,
4903	unicode, size, startpos, endpos, reason);
4904    if (*exceptionObject != NULL)
4905	PyCodec_StrictErrors(*exceptionObject);
4906}
4907
4908/* error handling callback helper:
4909   build arguments, call the callback and check the arguments,
4910   put the result into newpos and return the replacement string, which
4911   has to be freed by the caller */
4912static PyObject *unicode_translate_call_errorhandler(const char *errors,
4913    PyObject **errorHandler,
4914    const char *reason,
4915    const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4916    Py_ssize_t startpos, Py_ssize_t endpos,
4917    Py_ssize_t *newpos)
4918{
4919    static char *argparse = "O!n;translating error handler must return (str, int) tuple";
4920
4921    Py_ssize_t i_newpos;
4922    PyObject *restuple;
4923    PyObject *resunicode;
4924
4925    if (*errorHandler == NULL) {
4926	*errorHandler = PyCodec_LookupError(errors);
4927        if (*errorHandler == NULL)
4928	    return NULL;
4929    }
4930
4931    make_translate_exception(exceptionObject,
4932	unicode, size, startpos, endpos, reason);
4933    if (*exceptionObject == NULL)
4934	return NULL;
4935
4936    restuple = PyObject_CallFunctionObjArgs(
4937	*errorHandler, *exceptionObject, NULL);
4938    if (restuple == NULL)
4939	return NULL;
4940    if (!PyTuple_Check(restuple)) {
4941	PyErr_Format(PyExc_TypeError, &argparse[4]);
4942	Py_DECREF(restuple);
4943	return NULL;
4944    }
4945    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
4946	&resunicode, &i_newpos)) {
4947	Py_DECREF(restuple);
4948	return NULL;
4949    }
4950    if (i_newpos<0)
4951	*newpos = size+i_newpos;
4952    else
4953        *newpos = i_newpos;
4954    if (*newpos<0 || *newpos>size) {
4955	PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4956	Py_DECREF(restuple);
4957	return NULL;
4958    }
4959    Py_INCREF(resunicode);
4960    Py_DECREF(restuple);
4961    return resunicode;
4962}
4963
4964/* Lookup the character ch in the mapping and put the result in result,
4965   which must be decrefed by the caller.
4966   Return 0 on success, -1 on error */
4967static
4968int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4969{
4970    PyObject *w = PyLong_FromLong((long)c);
4971    PyObject *x;
4972
4973    if (w == NULL)
4974	 return -1;
4975    x = PyObject_GetItem(mapping, w);
4976    Py_DECREF(w);
4977    if (x == NULL) {
4978	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4979	    /* No mapping found means: use 1:1 mapping. */
4980	    PyErr_Clear();
4981	    *result = NULL;
4982	    return 0;
4983	} else
4984	    return -1;
4985    }
4986    else if (x == Py_None) {
4987	*result = x;
4988	return 0;
4989    }
4990    else if (PyLong_Check(x)) {
4991	long value = PyLong_AS_LONG(x);
4992	long max = PyUnicode_GetMax();
4993	if (value < 0 || value > max) {
4994	    PyErr_Format(PyExc_TypeError,
4995                         "character mapping must be in range(0x%x)", max+1);
4996	    Py_DECREF(x);
4997	    return -1;
4998	}
4999	*result = x;
5000	return 0;
5001    }
5002    else if (PyUnicode_Check(x)) {
5003	*result = x;
5004	return 0;
5005    }
5006    else {
5007	/* wrong return value */
5008	PyErr_SetString(PyExc_TypeError,
5009	      "character mapping must return integer, None or str");
5010	Py_DECREF(x);
5011	return -1;
5012    }
5013}
5014/* ensure that *outobj is at least requiredsize characters long,
5015if not reallocate and adjust various state variables.
5016Return 0 on success, -1 on error */
5017static
5018int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
5019    Py_ssize_t requiredsize)
5020{
5021    Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
5022    if (requiredsize > oldsize) {
5023	/* remember old output position */
5024	Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5025	/* exponentially overallocate to minimize reallocations */
5026	if (requiredsize < 2 * oldsize)
5027	    requiredsize = 2 * oldsize;
5028	if (_PyUnicode_Resize(outobj, requiredsize) < 0)
5029	    return -1;
5030	*outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
5031    }
5032    return 0;
5033}
5034/* lookup the character, put the result in the output string and adjust
5035   various state variables. Return a new reference to the object that
5036   was put in the output buffer in *result, or Py_None, if the mapping was
5037   undefined (in which case no character was written).
5038   The called must decref result.
5039   Return 0 on success, -1 on error. */
5040static
5041int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
5042    Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5043    PyObject **res)
5044{
5045    if (charmaptranslate_lookup(*curinp, mapping, res))
5046	return -1;
5047    if (*res==NULL) {
5048	/* not found => default to 1:1 mapping */
5049	*(*outp)++ = *curinp;
5050    }
5051    else if (*res==Py_None)
5052	;
5053    else if (PyLong_Check(*res)) {
5054	/* no overflow check, because we know that the space is enough */
5055	*(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
5056    }
5057    else if (PyUnicode_Check(*res)) {
5058	Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5059	if (repsize==1) {
5060	    /* no overflow check, because we know that the space is enough */
5061	    *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5062	}
5063	else if (repsize!=0) {
5064	    /* more than one character */
5065	    Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5066		(insize - (curinp-startinp)) +
5067		repsize - 1;
5068	    if (charmaptranslate_makespace(outobj, outp, requiredsize))
5069		return -1;
5070	    memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5071	    *outp += repsize;
5072	}
5073    }
5074    else
5075	return -1;
5076    return 0;
5077}
5078
5079PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
5080				     Py_ssize_t size,
5081				     PyObject *mapping,
5082				     const char *errors)
5083{
5084    /* output object */
5085    PyObject *res = NULL;
5086    /* pointers to the beginning and end+1 of input */
5087    const Py_UNICODE *startp = p;
5088    const Py_UNICODE *endp = p + size;
5089    /* pointer into the output */
5090    Py_UNICODE *str;
5091    /* current output position */
5092    Py_ssize_t respos = 0;
5093    char *reason = "character maps to <undefined>";
5094    PyObject *errorHandler = NULL;
5095    PyObject *exc = NULL;
5096    /* the following variable is used for caching string comparisons
5097     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5098     * 3=ignore, 4=xmlcharrefreplace */
5099    int known_errorHandler = -1;
5100
5101    if (mapping == NULL) {
5102	PyErr_BadArgument();
5103	return NULL;
5104    }
5105
5106    /* allocate enough for a simple 1:1 translation without
5107       replacements, if we need more, we'll resize */
5108    res = PyUnicode_FromUnicode(NULL, size);
5109    if (res == NULL)
5110	goto onError;
5111    if (size == 0)
5112	return res;
5113    str = PyUnicode_AS_UNICODE(res);
5114
5115    while (p<endp) {
5116	/* try to encode it */
5117	PyObject *x = NULL;
5118	if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5119	    Py_XDECREF(x);
5120	    goto onError;
5121	}
5122	Py_XDECREF(x);
5123	if (x!=Py_None) /* it worked => adjust input pointer */
5124	    ++p;
5125	else { /* untranslatable character */
5126	    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5127	    Py_ssize_t repsize;
5128	    Py_ssize_t newpos;
5129	    Py_UNICODE *uni2;
5130	    /* startpos for collecting untranslatable chars */
5131	    const Py_UNICODE *collstart = p;
5132	    const Py_UNICODE *collend = p+1;
5133	    const Py_UNICODE *coll;
5134
5135	    /* find all untranslatable characters */
5136	    while (collend < endp) {
5137		if (charmaptranslate_lookup(*collend, mapping, &x))
5138		    goto onError;
5139		Py_XDECREF(x);
5140		if (x!=Py_None)
5141		    break;
5142		++collend;
5143	    }
5144	    /* cache callback name lookup
5145	     * (if not done yet, i.e. it's the first error) */
5146	    if (known_errorHandler==-1) {
5147		if ((errors==NULL) || (!strcmp(errors, "strict")))
5148		    known_errorHandler = 1;
5149		else if (!strcmp(errors, "replace"))
5150		    known_errorHandler = 2;
5151		else if (!strcmp(errors, "ignore"))
5152		    known_errorHandler = 3;
5153		else if (!strcmp(errors, "xmlcharrefreplace"))
5154		    known_errorHandler = 4;
5155		else
5156		    known_errorHandler = 0;
5157	    }
5158	    switch (known_errorHandler) {
5159		case 1: /* strict */
5160		    raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
5161		    goto onError;
5162		case 2: /* replace */
5163		    /* No need to check for space, this is a 1:1 replacement */
5164		    for (coll = collstart; coll<collend; ++coll)
5165			*str++ = '?';
5166		    /* fall through */
5167		case 3: /* ignore */
5168		    p = collend;
5169		    break;
5170		case 4: /* xmlcharrefreplace */
5171		    /* generate replacement (temporarily (mis)uses p) */
5172		    for (p = collstart; p < collend; ++p) {
5173			char buffer[2+29+1+1];
5174			char *cp;
5175			sprintf(buffer, "&#%d;", (int)*p);
5176			if (charmaptranslate_makespace(&res, &str,
5177			    (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5178			    goto onError;
5179			for (cp = buffer; *cp; ++cp)
5180			    *str++ = *cp;
5181		    }
5182		    p = collend;
5183		    break;
5184		default:
5185		    repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5186			reason, startp, size, &exc,
5187			collstart-startp, collend-startp, &newpos);
5188		    if (repunicode == NULL)
5189			goto onError;
5190		    /* generate replacement  */
5191		    repsize = PyUnicode_GET_SIZE(repunicode);
5192		    if (charmaptranslate_makespace(&res, &str,
5193			(str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5194			Py_DECREF(repunicode);
5195			goto onError;
5196		    }
5197		    for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5198			*str++ = *uni2;
5199		    p = startp + newpos;
5200		    Py_DECREF(repunicode);
5201	    }
5202	}
5203    }
5204    /* Resize if we allocated to much */
5205    respos = str-PyUnicode_AS_UNICODE(res);
5206    if (respos<PyUnicode_GET_SIZE(res)) {
5207	if (_PyUnicode_Resize(&res, respos) < 0)
5208	    goto onError;
5209    }
5210    Py_XDECREF(exc);
5211    Py_XDECREF(errorHandler);
5212    return res;
5213
5214    onError:
5215    Py_XDECREF(res);
5216    Py_XDECREF(exc);
5217    Py_XDECREF(errorHandler);
5218    return NULL;
5219}
5220
5221PyObject *PyUnicode_Translate(PyObject *str,
5222			      PyObject *mapping,
5223			      const char *errors)
5224{
5225    PyObject *result;
5226
5227    str = PyUnicode_FromObject(str);
5228    if (str == NULL)
5229	goto onError;
5230    result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5231					PyUnicode_GET_SIZE(str),
5232					mapping,
5233					errors);
5234    Py_DECREF(str);
5235    return result;
5236
5237 onError:
5238    Py_XDECREF(str);
5239    return NULL;
5240}
5241
5242/* --- Decimal Encoder ---------------------------------------------------- */
5243
5244int PyUnicode_EncodeDecimal(Py_UNICODE *s,
5245			    Py_ssize_t length,
5246			    char *output,
5247			    const char *errors)
5248{
5249    Py_UNICODE *p, *end;
5250    PyObject *errorHandler = NULL;
5251    PyObject *exc = NULL;
5252    const char *encoding = "decimal";
5253    const char *reason = "invalid decimal Unicode string";
5254    /* the following variable is used for caching string comparisons
5255     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5256    int known_errorHandler = -1;
5257
5258    if (output == NULL) {
5259	PyErr_BadArgument();
5260	return -1;
5261    }
5262
5263    p = s;
5264    end = s + length;
5265    while (p < end) {
5266	register Py_UNICODE ch = *p;
5267	int decimal;
5268	PyObject *repunicode;
5269	Py_ssize_t repsize;
5270	Py_ssize_t newpos;
5271	Py_UNICODE *uni2;
5272	Py_UNICODE *collstart;
5273	Py_UNICODE *collend;
5274
5275	if (Py_UNICODE_ISSPACE(ch)) {
5276	    *output++ = ' ';
5277	    ++p;
5278	    continue;
5279	}
5280	decimal = Py_UNICODE_TODECIMAL(ch);
5281	if (decimal >= 0) {
5282	    *output++ = '0' + decimal;
5283	    ++p;
5284	    continue;
5285	}
5286	if (0 < ch && ch < 256) {
5287	    *output++ = (char)ch;
5288	    ++p;
5289	    continue;
5290	}
5291	/* All other characters are considered unencodable */
5292	collstart = p;
5293	collend = p+1;
5294	while (collend < end) {
5295	    if ((0 < *collend && *collend < 256) ||
5296	        !Py_UNICODE_ISSPACE(*collend) ||
5297	        Py_UNICODE_TODECIMAL(*collend))
5298		break;
5299	}
5300	/* cache callback name lookup
5301	 * (if not done yet, i.e. it's the first error) */
5302	if (known_errorHandler==-1) {
5303	    if ((errors==NULL) || (!strcmp(errors, "strict")))
5304		known_errorHandler = 1;
5305	    else if (!strcmp(errors, "replace"))
5306		known_errorHandler = 2;
5307	    else if (!strcmp(errors, "ignore"))
5308		known_errorHandler = 3;
5309	    else if (!strcmp(errors, "xmlcharrefreplace"))
5310		known_errorHandler = 4;
5311	    else
5312		known_errorHandler = 0;
5313	}
5314	switch (known_errorHandler) {
5315	    case 1: /* strict */
5316		raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5317		goto onError;
5318	    case 2: /* replace */
5319		for (p = collstart; p < collend; ++p)
5320		    *output++ = '?';
5321		/* fall through */
5322	    case 3: /* ignore */
5323		p = collend;
5324		break;
5325	    case 4: /* xmlcharrefreplace */
5326		/* generate replacement (temporarily (mis)uses p) */
5327		for (p = collstart; p < collend; ++p)
5328		    output += sprintf(output, "&#%d;", (int)*p);
5329		p = collend;
5330		break;
5331	    default:
5332		repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5333		    encoding, reason, s, length, &exc,
5334		    collstart-s, collend-s, &newpos);
5335		if (repunicode == NULL)
5336		    goto onError;
5337		/* generate replacement  */
5338		repsize = PyUnicode_GET_SIZE(repunicode);
5339		for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5340		    Py_UNICODE ch = *uni2;
5341		    if (Py_UNICODE_ISSPACE(ch))
5342			*output++ = ' ';
5343		    else {
5344			decimal = Py_UNICODE_TODECIMAL(ch);
5345			if (decimal >= 0)
5346			    *output++ = '0' + decimal;
5347			else if (0 < ch && ch < 256)
5348			    *output++ = (char)ch;
5349			else {
5350			    Py_DECREF(repunicode);
5351			    raise_encode_exception(&exc, encoding,
5352				s, length, collstart-s, collend-s, reason);
5353			    goto onError;
5354			}
5355		    }
5356		}
5357		p = s + newpos;
5358		Py_DECREF(repunicode);
5359	}
5360    }
5361    /* 0-terminate the output string */
5362    *output++ = '\0';
5363    Py_XDECREF(exc);
5364    Py_XDECREF(errorHandler);
5365    return 0;
5366
5367 onError:
5368    Py_XDECREF(exc);
5369    Py_XDECREF(errorHandler);
5370    return -1;
5371}
5372
5373/* --- Helpers ------------------------------------------------------------ */
5374
5375#include "stringlib/unicodedefs.h"
5376#include "stringlib/fastsearch.h"
5377#include "stringlib/count.h"
5378/* Include _ParseTupleFinds from find.h */
5379#define FROM_UNICODE
5380#include "stringlib/find.h"
5381#include "stringlib/partition.h"
5382
5383#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
5384#include "stringlib/localeutil.h"
5385
/* Normalize the slice bounds held in the local variables `start` and
   `end` against (obj)->length: a negative bound is counted from the end
   and then clamped to 0, and `end` is capped at the length.  `start` is
   not capped at the length. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    else if (end < 0) {                         \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
5398
5399Py_ssize_t PyUnicode_Count(PyObject *str,
5400                           PyObject *substr,
5401                           Py_ssize_t start,
5402                           Py_ssize_t end)
5403{
5404    Py_ssize_t result;
5405    PyUnicodeObject* str_obj;
5406    PyUnicodeObject* sub_obj;
5407
5408    str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5409    if (!str_obj)
5410	return -1;
5411    sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5412    if (!sub_obj) {
5413	Py_DECREF(str_obj);
5414	return -1;
5415    }
5416
5417    FIX_START_END(str_obj);
5418
5419    result = stringlib_count(
5420        str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5421        );
5422
5423    Py_DECREF(sub_obj);
5424    Py_DECREF(str_obj);
5425
5426    return result;
5427}
5428
5429Py_ssize_t PyUnicode_Find(PyObject *str,
5430                          PyObject *sub,
5431                          Py_ssize_t start,
5432                          Py_ssize_t end,
5433                          int direction)
5434{
5435    Py_ssize_t result;
5436
5437    str = PyUnicode_FromObject(str);
5438    if (!str)
5439	return -2;
5440    sub = PyUnicode_FromObject(sub);
5441    if (!sub) {
5442	Py_DECREF(str);
5443	return -2;
5444    }
5445
5446    if (direction > 0)
5447        result = stringlib_find_slice(
5448            PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5449            PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5450            start, end
5451            );
5452    else
5453        result = stringlib_rfind_slice(
5454            PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5455            PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5456            start, end
5457            );
5458
5459    Py_DECREF(str);
5460    Py_DECREF(sub);
5461
5462    return result;
5463}
5464
5465static
5466int tailmatch(PyUnicodeObject *self,
5467	      PyUnicodeObject *substring,
5468	      Py_ssize_t start,
5469	      Py_ssize_t end,
5470	      int direction)
5471{
5472    if (substring->length == 0)
5473        return 1;
5474
5475    FIX_START_END(self);
5476
5477    end -= substring->length;
5478    if (end < start)
5479	return 0;
5480
5481    if (direction > 0) {
5482	if (Py_UNICODE_MATCH(self, end, substring))
5483	    return 1;
5484    } else {
5485        if (Py_UNICODE_MATCH(self, start, substring))
5486	    return 1;
5487    }
5488
5489    return 0;
5490}
5491
5492Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
5493			PyObject *substr,
5494			Py_ssize_t start,
5495			Py_ssize_t end,
5496			int direction)
5497{
5498    Py_ssize_t result;
5499
5500    str = PyUnicode_FromObject(str);
5501    if (str == NULL)
5502	return -1;
5503    substr = PyUnicode_FromObject(substr);
5504    if (substr == NULL) {
5505	Py_DECREF(str);
5506	return -1;
5507    }
5508
5509    result = tailmatch((PyUnicodeObject *)str,
5510		       (PyUnicodeObject *)substr,
5511		       start, end, direction);
5512    Py_DECREF(str);
5513    Py_DECREF(substr);
5514    return result;
5515}
5516
5517/* Apply fixfct filter to the Unicode object self and return a
5518   reference to the modified object */
5519
5520static
5521PyObject *fixup(PyUnicodeObject *self,
5522		int (*fixfct)(PyUnicodeObject *s))
5523{
5524
5525    PyUnicodeObject *u;
5526
5527    u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5528    if (u == NULL)
5529	return NULL;
5530
5531    Py_UNICODE_COPY(u->str, self->str, self->length);
5532
5533    if (!fixfct(u) && PyUnicode_CheckExact(self)) {
5534	/* fixfct should return TRUE if it modified the buffer. If
5535	   FALSE, return a reference to the original buffer instead
5536	   (to save space, not time) */
5537	Py_INCREF(self);
5538	Py_DECREF(u);
5539	return (PyObject*) self;
5540    }
5541    return (PyObject*) u;
5542}
5543
5544static
5545int fixupper(PyUnicodeObject *self)
5546{
5547    Py_ssize_t len = self->length;
5548    Py_UNICODE *s = self->str;
5549    int status = 0;
5550
5551    while (len-- > 0) {
5552	register Py_UNICODE ch;
5553
5554	ch = Py_UNICODE_TOUPPER(*s);
5555	if (ch != *s) {
5556            status = 1;
5557	    *s = ch;
5558	}
5559        s++;
5560    }
5561
5562    return status;
5563}
5564
5565static
5566int fixlower(PyUnicodeObject *self)
5567{
5568    Py_ssize_t len = self->length;
5569    Py_UNICODE *s = self->str;
5570    int status = 0;
5571
5572    while (len-- > 0) {
5573	register Py_UNICODE ch;
5574
5575	ch = Py_UNICODE_TOLOWER(*s);
5576	if (ch != *s) {
5577            status = 1;
5578	    *s = ch;
5579	}
5580        s++;
5581    }
5582
5583    return status;
5584}
5585
5586static
5587int fixswapcase(PyUnicodeObject *self)
5588{
5589    Py_ssize_t len = self->length;
5590    Py_UNICODE *s = self->str;
5591    int status = 0;
5592
5593    while (len-- > 0) {
5594        if (Py_UNICODE_ISUPPER(*s)) {
5595            *s = Py_UNICODE_TOLOWER(*s);
5596            status = 1;
5597        } else if (Py_UNICODE_ISLOWER(*s)) {
5598            *s = Py_UNICODE_TOUPPER(*s);
5599            status = 1;
5600        }
5601        s++;
5602    }
5603
5604    return status;
5605}
5606
5607static
5608int fixcapitalize(PyUnicodeObject *self)
5609{
5610    Py_ssize_t len = self->length;
5611    Py_UNICODE *s = self->str;
5612    int status = 0;
5613
5614    if (len == 0)
5615	return 0;
5616    if (Py_UNICODE_ISLOWER(*s)) {
5617	*s = Py_UNICODE_TOUPPER(*s);
5618	status = 1;
5619    }
5620    s++;
5621    while (--len > 0) {
5622        if (Py_UNICODE_ISUPPER(*s)) {
5623            *s = Py_UNICODE_TOLOWER(*s);
5624            status = 1;
5625        }
5626        s++;
5627    }
5628    return status;
5629}
5630
5631static
5632int fixtitle(PyUnicodeObject *self)
5633{
5634    register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5635    register Py_UNICODE *e;
5636    int previous_is_cased;
5637
5638    /* Shortcut for single character strings */
5639    if (PyUnicode_GET_SIZE(self) == 1) {
5640	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5641	if (*p != ch) {
5642	    *p = ch;
5643	    return 1;
5644	}
5645	else
5646	    return 0;
5647    }
5648
5649    e = p + PyUnicode_GET_SIZE(self);
5650    previous_is_cased = 0;
5651    for (; p < e; p++) {
5652	register const Py_UNICODE ch = *p;
5653
5654	if (previous_is_cased)
5655	    *p = Py_UNICODE_TOLOWER(ch);
5656	else
5657	    *p = Py_UNICODE_TOTITLE(ch);
5658
5659	if (Py_UNICODE_ISLOWER(ch) ||
5660	    Py_UNICODE_ISUPPER(ch) ||
5661	    Py_UNICODE_ISTITLE(ch))
5662	    previous_is_cased = 1;
5663	else
5664	    previous_is_cased = 0;
5665    }
5666    return 1;
5667}
5668
5669PyObject *
5670PyUnicode_Join(PyObject *separator, PyObject *seq)
5671{
5672    const Py_UNICODE blank = ' ';
5673    const Py_UNICODE *sep = &blank;
5674    Py_ssize_t seplen = 1;
5675    PyUnicodeObject *res = NULL; /* the result */
5676    Py_UNICODE *res_p;       /* pointer to free byte in res's string area */
5677    PyObject *fseq;          /* PySequence_Fast(seq) */
5678    Py_ssize_t seqlen;       /* len(fseq) -- number of items in sequence */
5679    PyObject **items;
5680    PyObject *item;
5681    Py_ssize_t sz, i;
5682
5683    fseq = PySequence_Fast(seq, "");
5684    if (fseq == NULL) {
5685    	return NULL;
5686    }
5687
5688    /* NOTE: the following code can't call back into Python code,
5689     * so we are sure that fseq won't be mutated.
5690     */
5691
5692    seqlen = PySequence_Fast_GET_SIZE(fseq);
5693    /* If empty sequence, return u"". */
5694    if (seqlen == 0) {
5695    	res = _PyUnicode_New(0);  /* empty sequence; return u"" */
5696    	goto Done;
5697    }
5698    items = PySequence_Fast_ITEMS(fseq);
5699    /* If singleton sequence with an exact Unicode, return that. */
5700    if (seqlen == 1) {
5701	item = items[0];
5702	if (PyUnicode_CheckExact(item)) {
5703	    Py_INCREF(item);
5704	    res = (PyUnicodeObject *)item;
5705	    goto Done;
5706	}
5707    }
5708    else {
5709        /* Set up sep and seplen */
5710        if (separator == NULL) {
5711            sep = &blank;
5712            seplen = 1;
5713        }
5714        else {
5715            if (!PyUnicode_Check(separator)) {
5716                PyErr_Format(PyExc_TypeError,
5717                             "separator: expected str instance,"
5718                             " %.80s found",
5719                             Py_TYPE(separator)->tp_name);
5720                goto onError;
5721            }
5722            sep = PyUnicode_AS_UNICODE(separator);
5723            seplen = PyUnicode_GET_SIZE(separator);
5724        }
5725    }
5726
5727    /* There are at least two things to join, or else we have a subclass
5728     * of str in the sequence.
5729     * Do a pre-pass to figure out the total amount of space we'll
5730     * need (sz), and see whether all argument are strings.
5731     */
5732    sz = 0;
5733    for (i = 0; i < seqlen; i++) {
5734        const Py_ssize_t old_sz = sz;
5735        item = items[i];
5736	if (!PyUnicode_Check(item)) {
5737	    PyErr_Format(PyExc_TypeError,
5738			 "sequence item %zd: expected str instance,"
5739			 " %.80s found",
5740			 i, Py_TYPE(item)->tp_name);
5741	    goto onError;
5742	}
5743        sz += PyUnicode_GET_SIZE(item);
5744        if (i != 0)
5745            sz += seplen;
5746        if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
5747            PyErr_SetString(PyExc_OverflowError,
5748                "join() result is too long for a Python string");
5749            goto onError;
5750        }
5751    }
5752
5753    res = _PyUnicode_New(sz);
5754    if (res == NULL)
5755        goto onError;
5756
5757    /* Catenate everything. */
5758    res_p = PyUnicode_AS_UNICODE(res);
5759    for (i = 0; i < seqlen; ++i) {
5760        Py_ssize_t itemlen;
5761        item = items[i];
5762        itemlen = PyUnicode_GET_SIZE(item);
5763	/* Copy item, and maybe the separator. */
5764	if (i) {
5765	    Py_UNICODE_COPY(res_p, sep, seplen);
5766	    res_p += seplen;
5767	}
5768	Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5769	res_p += itemlen;
5770    }
5771
5772 Done:
5773    Py_DECREF(fseq);
5774    return (PyObject *)res;
5775
5776 onError:
5777    Py_DECREF(fseq);
5778    Py_XDECREF(res);
5779    return NULL;
5780}
5781
5782static
5783PyUnicodeObject *pad(PyUnicodeObject *self,
5784		     Py_ssize_t left,
5785		     Py_ssize_t right,
5786		     Py_UNICODE fill)
5787{
5788    PyUnicodeObject *u;
5789
5790    if (left < 0)
5791        left = 0;
5792    if (right < 0)
5793        right = 0;
5794
5795    if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
5796        Py_INCREF(self);
5797        return self;
5798    }
5799
5800    if (left > PY_SSIZE_T_MAX - self->length ||
5801        right > PY_SSIZE_T_MAX - (left + self->length)) {
5802        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5803        return NULL;
5804    }
5805    u = _PyUnicode_New(left + self->length + right);
5806    if (u) {
5807        if (left)
5808            Py_UNICODE_FILL(u->str, fill, left);
5809        Py_UNICODE_COPY(u->str + left, self->str, self->length);
5810        if (right)
5811            Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5812    }
5813
5814    return u;
5815}
5816
/* Append the slice data[left:right] to `list` as a new Unicode object.
   Relies on the enclosing function providing a `PyObject *str` scratch
   variable, the `list` being filled, and an `onError` label, which is
   jumped to if the slice cannot be created or appended.  Must be used as
   a statement terminated by ';'. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        int split_append_rc_;                                           \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (str == NULL)                                                \
            goto onError;                                               \
        split_append_rc_ = PyList_Append(list, str);                    \
        /* the list holds its own reference on success */               \
        Py_DECREF(str);                                                 \
        if (split_append_rc_)                                           \
            goto onError;                                               \
    } while (0)
5827
5828static
5829PyObject *split_whitespace(PyUnicodeObject *self,
5830			   PyObject *list,
5831			   Py_ssize_t maxcount)
5832{
5833    register Py_ssize_t i;
5834    register Py_ssize_t j;
5835    Py_ssize_t len = self->length;
5836    PyObject *str;
5837    register const Py_UNICODE *buf = self->str;
5838
5839    for (i = j = 0; i < len; ) {
5840	/* find a token */
5841	while (i < len && Py_UNICODE_ISSPACE(buf[i]))
5842	    i++;
5843	j = i;
5844	while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
5845	    i++;
5846	if (j < i) {
5847	    if (maxcount-- <= 0)
5848		break;
5849	    SPLIT_APPEND(buf, j, i);
5850	    while (i < len && Py_UNICODE_ISSPACE(buf[i]))
5851		i++;
5852	    j = i;
5853	}
5854    }
5855    if (j < len) {
5856	SPLIT_APPEND(buf, j, len);
5857    }
5858    return list;
5859
5860 onError:
5861    Py_DECREF(list);
5862    return NULL;
5863}
5864
5865PyObject *PyUnicode_Splitlines(PyObject *string,
5866			       int keepends)
5867{
5868    register Py_ssize_t i;
5869    register Py_ssize_t j;
5870    Py_ssize_t len;
5871    PyObject *list;
5872    PyObject *str;
5873    Py_UNICODE *data;
5874
5875    string = PyUnicode_FromObject(string);
5876    if (string == NULL)
5877	return NULL;
5878    data = PyUnicode_AS_UNICODE(string);
5879    len = PyUnicode_GET_SIZE(string);
5880
5881    list = PyList_New(0);
5882    if (!list)
5883        goto onError;
5884
5885    for (i = j = 0; i < len; ) {
5886	Py_ssize_t eol;
5887
5888	/* Find a line and append it */
5889	while (i < len && !BLOOM_LINEBREAK(data[i]))
5890	    i++;
5891
5892	/* Skip the line break reading CRLF as one line break */
5893	eol = i;
5894	if (i < len) {
5895	    if (data[i] == '\r' && i + 1 < len &&
5896		data[i+1] == '\n')
5897		i += 2;
5898	    else
5899		i++;
5900	    if (keepends)
5901		eol = i;
5902	}
5903	SPLIT_APPEND(data, j, eol);
5904	j = i;
5905    }
5906    if (j < len) {
5907	SPLIT_APPEND(data, j, len);
5908    }
5909
5910    Py_DECREF(string);
5911    return list;
5912
5913 onError:
5914    Py_XDECREF(list);
5915    Py_DECREF(string);
5916    return NULL;
5917}
5918
5919static
5920PyObject *split_char(PyUnicodeObject *self,
5921		     PyObject *list,
5922		     Py_UNICODE ch,
5923		     Py_ssize_t maxcount)
5924{
5925    register Py_ssize_t i;
5926    register Py_ssize_t j;
5927    Py_ssize_t len = self->length;
5928    PyObject *str;
5929    register const Py_UNICODE *buf = self->str;
5930
5931    for (i = j = 0; i < len; ) {
5932	if (buf[i] == ch) {
5933	    if (maxcount-- <= 0)
5934		break;
5935	    SPLIT_APPEND(buf, j, i);
5936	    i = j = i + 1;
5937	} else
5938	    i++;
5939    }
5940    if (j <= len) {
5941	SPLIT_APPEND(buf, j, len);
5942    }
5943    return list;
5944
5945 onError:
5946    Py_DECREF(list);
5947    return NULL;
5948}
5949
5950static
5951PyObject *split_substring(PyUnicodeObject *self,
5952			  PyObject *list,
5953			  PyUnicodeObject *substring,
5954			  Py_ssize_t maxcount)
5955{
5956    register Py_ssize_t i;
5957    register Py_ssize_t j;
5958    Py_ssize_t len = self->length;
5959    Py_ssize_t sublen = substring->length;
5960    PyObject *str;
5961
5962    for (i = j = 0; i <= len - sublen; ) {
5963	if (Py_UNICODE_MATCH(self, i, substring)) {
5964	    if (maxcount-- <= 0)
5965		break;
5966	    SPLIT_APPEND(self->str, j, i);
5967	    i = j = i + sublen;
5968	} else
5969	    i++;
5970    }
5971    if (j <= len) {
5972	SPLIT_APPEND(self->str, j, len);
5973    }
5974    return list;
5975
5976 onError:
5977    Py_DECREF(list);
5978    return NULL;
5979}
5980
5981static
5982PyObject *rsplit_whitespace(PyUnicodeObject *self,
5983			    PyObject *list,
5984			    Py_ssize_t maxcount)
5985{
5986    register Py_ssize_t i;
5987    register Py_ssize_t j;
5988    Py_ssize_t len = self->length;
5989    PyObject *str;
5990    register const Py_UNICODE *buf = self->str;
5991
5992    for (i = j = len - 1; i >= 0; ) {
5993	/* find a token */
5994	while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
5995	    i--;
5996	j = i;
5997	while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
5998	    i--;
5999	if (j > i) {
6000	    if (maxcount-- <= 0)
6001		break;
6002	    SPLIT_APPEND(buf, i + 1, j + 1);
6003	    while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
6004		i--;
6005	    j = i;
6006	}
6007    }
6008    if (j >= 0) {
6009	SPLIT_APPEND(buf, 0, j + 1);
6010    }
6011    if (PyList_Reverse(list) < 0)
6012        goto onError;
6013    return list;
6014
6015 onError:
6016    Py_DECREF(list);
6017    return NULL;
6018}
6019
6020static
6021PyObject *rsplit_char(PyUnicodeObject *self,
6022		      PyObject *list,
6023		      Py_UNICODE ch,
6024		      Py_ssize_t maxcount)
6025{
6026    register Py_ssize_t i;
6027    register Py_ssize_t j;
6028    Py_ssize_t len = self->length;
6029    PyObject *str;
6030    register const Py_UNICODE *buf = self->str;
6031
6032    for (i = j = len - 1; i >= 0; ) {
6033	if (buf[i] == ch) {
6034	    if (maxcount-- <= 0)
6035		break;
6036	    SPLIT_APPEND(buf, i + 1, j + 1);
6037	    j = i = i - 1;
6038	} else
6039	    i--;
6040    }
6041    if (j >= -1) {
6042	SPLIT_APPEND(buf, 0, j + 1);
6043    }
6044    if (PyList_Reverse(list) < 0)
6045        goto onError;
6046    return list;
6047
6048 onError:
6049    Py_DECREF(list);
6050    return NULL;
6051}
6052
6053static
6054PyObject *rsplit_substring(PyUnicodeObject *self,
6055			   PyObject *list,
6056			   PyUnicodeObject *substring,
6057			   Py_ssize_t maxcount)
6058{
6059    register Py_ssize_t i;
6060    register Py_ssize_t j;
6061    Py_ssize_t len = self->length;
6062    Py_ssize_t sublen = substring->length;
6063    PyObject *str;
6064
6065    for (i = len - sublen, j = len; i >= 0; ) {
6066	if (Py_UNICODE_MATCH(self, i, substring)) {
6067	    if (maxcount-- <= 0)
6068		break;
6069	    SPLIT_APPEND(self->str, i + sublen, j);
6070	    j = i;
6071	    i -= sublen;
6072	} else
6073	    i--;
6074    }
6075    if (j >= 0) {
6076	SPLIT_APPEND(self->str, 0, j);
6077    }
6078    if (PyList_Reverse(list) < 0)
6079        goto onError;
6080    return list;
6081
6082 onError:
6083    Py_DECREF(list);
6084    return NULL;
6085}
6086
6087#undef SPLIT_APPEND
6088
6089static
6090PyObject *split(PyUnicodeObject *self,
6091		PyUnicodeObject *substring,
6092		Py_ssize_t maxcount)
6093{
6094    PyObject *list;
6095
6096    if (maxcount < 0)
6097        maxcount = PY_SSIZE_T_MAX;
6098
6099    list = PyList_New(0);
6100    if (!list)
6101        return NULL;
6102
6103    if (substring == NULL)
6104	return split_whitespace(self,list,maxcount);
6105
6106    else if (substring->length == 1)
6107	return split_char(self,list,substring->str[0],maxcount);
6108
6109    else if (substring->length == 0) {
6110	Py_DECREF(list);
6111	PyErr_SetString(PyExc_ValueError, "empty separator");
6112	return NULL;
6113    }
6114    else
6115	return split_substring(self,list,substring,maxcount);
6116}
6117
6118static
6119PyObject *rsplit(PyUnicodeObject *self,
6120		 PyUnicodeObject *substring,
6121		 Py_ssize_t maxcount)
6122{
6123    PyObject *list;
6124
6125    if (maxcount < 0)
6126        maxcount = PY_SSIZE_T_MAX;
6127
6128    list = PyList_New(0);
6129    if (!list)
6130        return NULL;
6131
6132    if (substring == NULL)
6133	return rsplit_whitespace(self,list,maxcount);
6134
6135    else if (substring->length == 1)
6136	return rsplit_char(self,list,substring->str[0],maxcount);
6137
6138    else if (substring->length == 0) {
6139	Py_DECREF(list);
6140	PyErr_SetString(PyExc_ValueError, "empty separator");
6141	return NULL;
6142    }
6143    else
6144	return rsplit_substring(self,list,substring,maxcount);
6145}
6146
6147static
6148PyObject *replace(PyUnicodeObject *self,
6149		  PyUnicodeObject *str1,
6150		  PyUnicodeObject *str2,
6151		  Py_ssize_t maxcount)
6152{
6153    PyUnicodeObject *u;
6154
6155    if (maxcount < 0)
6156	maxcount = PY_SSIZE_T_MAX;
6157
6158    if (str1->length == str2->length) {
6159        /* same length */
6160        Py_ssize_t i;
6161        if (str1->length == 1) {
6162            /* replace characters */
6163            Py_UNICODE u1, u2;
6164            if (!findchar(self->str, self->length, str1->str[0]))
6165                goto nothing;
6166            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6167            if (!u)
6168                return NULL;
6169            Py_UNICODE_COPY(u->str, self->str, self->length);
6170            u1 = str1->str[0];
6171            u2 = str2->str[0];
6172            for (i = 0; i < u->length; i++)
6173                if (u->str[i] == u1) {
6174                    if (--maxcount < 0)
6175                        break;
6176                    u->str[i] = u2;
6177                }
6178        } else {
6179            i = fastsearch(
6180                self->str, self->length, str1->str, str1->length, FAST_SEARCH
6181                );
6182            if (i < 0)
6183                goto nothing;
6184            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6185            if (!u)
6186                return NULL;
6187            Py_UNICODE_COPY(u->str, self->str, self->length);
6188            while (i <= self->length - str1->length)
6189                if (Py_UNICODE_MATCH(self, i, str1)) {
6190                    if (--maxcount < 0)
6191                        break;
6192                    Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6193                    i += str1->length;
6194                } else
6195                    i++;
6196        }
6197    } else {
6198
6199        Py_ssize_t n, i, j, e;
6200        Py_ssize_t product, new_size, delta;
6201        Py_UNICODE *p;
6202
6203        /* replace strings */
6204        n = stringlib_count(self->str, self->length, str1->str, str1->length);
6205        if (n > maxcount)
6206            n = maxcount;
6207        if (n == 0)
6208            goto nothing;
6209        /* new_size = self->length + n * (str2->length - str1->length)); */
6210        delta = (str2->length - str1->length);
6211        if (delta == 0) {
6212            new_size = self->length;
6213        } else {
6214            product = n * (str2->length - str1->length);
6215            if ((product / (str2->length - str1->length)) != n) {
6216                PyErr_SetString(PyExc_OverflowError,
6217                                "replace string is too long");
6218                return NULL;
6219            }
6220            new_size = self->length + product;
6221            if (new_size < 0) {
6222                PyErr_SetString(PyExc_OverflowError,
6223                                "replace string is too long");
6224                return NULL;
6225            }
6226        }
6227        u = _PyUnicode_New(new_size);
6228        if (!u)
6229            return NULL;
6230        i = 0;
6231        p = u->str;
6232        e = self->length - str1->length;
6233        if (str1->length > 0) {
6234            while (n-- > 0) {
6235                /* look for next match */
6236                j = i;
6237                while (j <= e) {
6238                    if (Py_UNICODE_MATCH(self, j, str1))
6239                        break;
6240                    j++;
6241                }
6242		if (j > i) {
6243                    if (j > e)
6244                        break;
6245                    /* copy unchanged part [i:j] */
6246                    Py_UNICODE_COPY(p, self->str+i, j-i);
6247                    p += j - i;
6248                }
6249                /* copy substitution string */
6250                if (str2->length > 0) {
6251                    Py_UNICODE_COPY(p, str2->str, str2->length);
6252                    p += str2->length;
6253                }
6254                i = j + str1->length;
6255            }
6256            if (i < self->length)
6257                /* copy tail [i:] */
6258                Py_UNICODE_COPY(p, self->str+i, self->length-i);
6259        } else {
6260            /* interleave */
6261            while (n > 0) {
6262                Py_UNICODE_COPY(p, str2->str, str2->length);
6263                p += str2->length;
6264                if (--n <= 0)
6265                    break;
6266                *p++ = self->str[i++];
6267            }
6268            Py_UNICODE_COPY(p, self->str+i, self->length-i);
6269        }
6270    }
6271    return (PyObject *) u;
6272
6273nothing:
6274    /* nothing to replace; return original string (when possible) */
6275    if (PyUnicode_CheckExact(self)) {
6276        Py_INCREF(self);
6277        return (PyObject *) self;
6278    }
6279    return PyUnicode_FromUnicode(self->str, self->length);
6280}
6281
6282/* --- Unicode Object Methods --------------------------------------------- */
6283
6284PyDoc_STRVAR(title__doc__,
6285"S.title() -> str\n\
6286\n\
6287Return a titlecased version of S, i.e. words start with title case\n\
6288characters, all remaining cased characters have lower case.");
6289
6290static PyObject*
6291unicode_title(PyUnicodeObject *self)
6292{
6293    return fixup(self, fixtitle);
6294}
6295
6296PyDoc_STRVAR(capitalize__doc__,
6297"S.capitalize() -> str\n\
6298\n\
6299Return a capitalized version of S, i.e. make the first character\n\
6300have upper case.");
6301
6302static PyObject*
6303unicode_capitalize(PyUnicodeObject *self)
6304{
6305    return fixup(self, fixcapitalize);
6306}
6307
6308#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* NOTE: this function sits inside an "#if 0" region and is therefore
   not compiled.

   Splits self into a list of words, replaces every list item with the
   result of fixup(item, fixcapitalize), and joins the list again with
   PyUnicode_Join(NULL, list).  Returns the joined object, or NULL if
   the split, a fixup() call, or the join fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *list;
    PyObject *item;
    Py_ssize_t i;

    /* Split into words */
    list = split(self, NULL, -1);
    if (!list)
        return NULL;

    /* Capitalize each word */
    for (i = 0; i < PyList_GET_SIZE(list); i++) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
		     fixcapitalize);
        if (item == NULL)
            goto onError;
        /* Drop the old word before storing its replacement in the
           same slot. */
        Py_DECREF(PyList_GET_ITEM(list, i));
        PyList_SET_ITEM(list, i, item);
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, list);

    /* Reached on success and on failure alike: on the failure path
       item is NULL, so NULL is what gets returned. */
onError:
    Py_DECREF(list);
    return (PyObject *)item;
}
6344#endif
6345
6346/* Argument converter.  Coerces to a single unicode character */
6347
6348static int
6349convert_uc(PyObject *obj, void *addr)
6350{
6351	Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6352	PyObject *uniobj;
6353	Py_UNICODE *unistr;
6354
6355	uniobj = PyUnicode_FromObject(obj);
6356	if (uniobj == NULL) {
6357		PyErr_SetString(PyExc_TypeError,
6358			"The fill character cannot be converted to Unicode");
6359		return 0;
6360	}
6361	if (PyUnicode_GET_SIZE(uniobj) != 1) {
6362		PyErr_SetString(PyExc_TypeError,
6363			"The fill character must be exactly one character long");
6364		Py_DECREF(uniobj);
6365		return 0;
6366	}
6367	unistr = PyUnicode_AS_UNICODE(uniobj);
6368	*fillcharloc = unistr[0];
6369	Py_DECREF(uniobj);
6370	return 1;
6371}
6372
6373PyDoc_STRVAR(center__doc__,
6374"S.center(width[, fillchar]) -> str\n\
6375\n\
6376Return S centered in a string of length width. Padding is\n\
6377done using the specified fill character (default is a space)");
6378
6379static PyObject *
6380unicode_center(PyUnicodeObject *self, PyObject *args)
6381{
6382    Py_ssize_t marg, left;
6383    Py_ssize_t width;
6384    Py_UNICODE fillchar = ' ';
6385
6386    if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
6387        return NULL;
6388
6389    if (self->length >= width && PyUnicode_CheckExact(self)) {
6390        Py_INCREF(self);
6391        return (PyObject*) self;
6392    }
6393
6394    marg = width - self->length;
6395    left = marg / 2 + (marg & width & 1);
6396
6397    return (PyObject*) pad(self, left, marg - left, fillchar);
6398}
6399
6400#if 0
6401
6402/* This code should go into some future Unicode collation support
6403   module. The basic comparison should compare ordinals on a naive
6404   basis (this is what Java does and thus JPython too). */
6405
6406/* speedy UTF-16 code point order comparison */
6407/* gleaned from: */
6408/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6409
/* NOTE: this table and the function below sit in the "#if 0" branch
   and are not compiled; the "#else" branch provides the live
   unicode_compare().

   Adjustment table indexed by the top five bits (c >> 11) of a code
   unit.  Index 27 (0xD800-0xDFFF) gets +0x2000 and indices 28-31
   (0xE000-0xFFFF) get -0x800; every other entry is 0. */
static short utf16Fixup[32] =
{
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
};

/* Compare str1 and str2 unit by unit after applying utf16Fixup to
   each unit.  Returns -1, 0 or 1; when one string is a prefix of the
   other, the shorter one compares as smaller. */
static int
unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
{
    Py_ssize_t len1, len2;

    Py_UNICODE *s1 = str1->str;
    Py_UNICODE *s2 = str2->str;

    len1 = str1->length;
    len2 = str2->length;

    while (len1 > 0 && len2 > 0) {
        Py_UNICODE c1, c2;

        c1 = *s1++;
        c2 = *s2++;

	/* (1<<11) * 26 is 0xD000; only units above that value are
	   looked up in the table. */
	if (c1 > (1<<11) * 26)
	    c1 += utf16Fixup[c1>>11];
	if (c2 > (1<<11) * 26)
            c2 += utf16Fixup[c2>>11];
        /* now c1 and c2 are in UTF-32-compatible order */

        if (c1 != c2)
            return (c1 < c2) ? -1 : 1;

        len1--; len2--;
    }

    /* Common prefix exhausted: -1 if str1 is shorter, 1 if it is
       longer, 0 if both ended together. */
    return (len1 < len2) ? -1 : (len1 != len2);
}
6449
6450#else
6451
6452static int
6453unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6454{
6455    register Py_ssize_t len1, len2;
6456
6457    Py_UNICODE *s1 = str1->str;
6458    Py_UNICODE *s2 = str2->str;
6459
6460    len1 = str1->length;
6461    len2 = str2->length;
6462
6463    while (len1 > 0 && len2 > 0) {
6464        Py_UNICODE c1, c2;
6465
6466        c1 = *s1++;
6467        c2 = *s2++;
6468
6469        if (c1 != c2)
6470            return (c1 < c2) ? -1 : 1;
6471
6472        len1--; len2--;
6473    }
6474
6475    return (len1 < len2) ? -1 : (len1 != len2);
6476}
6477
6478#endif
6479
6480int PyUnicode_Compare(PyObject *left,
6481		      PyObject *right)
6482{
6483    if (PyUnicode_Check(left) && PyUnicode_Check(right))
6484        return unicode_compare((PyUnicodeObject *)left,
6485                               (PyUnicodeObject *)right);
6486    PyErr_Format(PyExc_TypeError,
6487                 "Can't compare %.100s and %.100s",
6488                 left->ob_type->tp_name,
6489                 right->ob_type->tp_name);
6490    return -1;
6491}
6492
6493int
6494PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6495{
6496    int i;
6497    Py_UNICODE *id;
6498    assert(PyUnicode_Check(uni));
6499    id = PyUnicode_AS_UNICODE(uni);
6500    /* Compare Unicode string and source character set string */
6501    for (i = 0; id[i] && str[i]; i++)
6502	if (id[i] != str[i])
6503	    return ((int)id[i] < (int)str[i]) ? -1 : 1;
6504    if (id[i])
6505	return 1; /* uni is longer */
6506    if (str[i])
6507	return -1; /* str is longer */
6508    return 0;
6509}
6510
6511PyObject *PyUnicode_RichCompare(PyObject *left,
6512                                PyObject *right,
6513                                int op)
6514{
6515    int result;
6516
6517    result = PyUnicode_Compare(left, right);
6518    if (result == -1 && PyErr_Occurred())
6519        goto onError;
6520
6521    /* Convert the return value to a Boolean */
6522    switch (op) {
6523    case Py_EQ:
6524        result = (result == 0);
6525        break;
6526    case Py_NE:
6527        result = (result != 0);
6528        break;
6529    case Py_LE:
6530        result = (result <= 0);
6531        break;
6532    case Py_GE:
6533        result = (result >= 0);
6534        break;
6535    case Py_LT:
6536        result = (result == -1);
6537        break;
6538    case Py_GT:
6539        result = (result == 1);
6540        break;
6541    }
6542    return PyBool_FromLong(result);
6543
6544 onError:
6545
6546    /* Standard case
6547
6548       Type errors mean that PyUnicode_FromObject() could not convert
6549       one of the arguments (usually the right hand side) to Unicode,
6550       ie. we can't handle the comparison request. However, it is
6551       possible that the other object knows a comparison method, which
6552       is why we return Py_NotImplemented to give the other object a
6553       chance.
6554
6555    */
6556    if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6557        PyErr_Clear();
6558        Py_INCREF(Py_NotImplemented);
6559        return Py_NotImplemented;
6560    }
6561    if (op != Py_EQ && op != Py_NE)
6562        return NULL;
6563
6564    /* Equality comparison.
6565
6566       This is a special case: we silence any PyExc_UnicodeDecodeError
6567       and instead turn it into a PyErr_UnicodeWarning.
6568
6569    */
6570    if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6571        return NULL;
6572    PyErr_Clear();
6573    if (PyErr_WarnEx(PyExc_UnicodeWarning,
6574                     (op == Py_EQ) ?
6575                     "equal comparison "
6576                     "failed to convert both arguments to str - "
6577                     "interpreting them as being unequal"
6578                     :
6579                     "Unicode unequal comparison "
6580                     "failed to convert both arguments to str - "
6581                     "interpreting them as being unequal",
6582                     1) < 0)
6583        return NULL;
6584    result = (op == Py_NE);
6585    return PyBool_FromLong(result);
6586}
6587
6588int PyUnicode_Contains(PyObject *container,
6589		       PyObject *element)
6590{
6591    PyObject *str, *sub;
6592    int result;
6593
6594    /* Coerce the two arguments */
6595    sub = PyUnicode_FromObject(element);
6596    if (!sub) {
6597	PyErr_Format(PyExc_TypeError,
6598	    "'in <string>' requires string as left operand, not %s",
6599	    element->ob_type->tp_name);
6600        return -1;
6601    }
6602
6603    str = PyUnicode_FromObject(container);
6604    if (!str) {
6605        Py_DECREF(sub);
6606        return -1;
6607    }
6608
6609    result = stringlib_contains_obj(str, sub);
6610
6611    Py_DECREF(str);
6612    Py_DECREF(sub);
6613
6614    return result;
6615}
6616
6617/* Concat to string or Unicode object giving a new Unicode object. */
6618
6619PyObject *PyUnicode_Concat(PyObject *left,
6620			   PyObject *right)
6621{
6622    PyUnicodeObject *u = NULL, *v = NULL, *w;
6623
6624    /* Coerce the two arguments */
6625    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6626    if (u == NULL)
6627	goto onError;
6628    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6629    if (v == NULL)
6630	goto onError;
6631
6632    /* Shortcuts */
6633    if (v == unicode_empty) {
6634	Py_DECREF(v);
6635	return (PyObject *)u;
6636    }
6637    if (u == unicode_empty) {
6638	Py_DECREF(u);
6639	return (PyObject *)v;
6640    }
6641
6642    /* Concat the two Unicode strings */
6643    w = _PyUnicode_New(u->length + v->length);
6644    if (w == NULL)
6645	goto onError;
6646    Py_UNICODE_COPY(w->str, u->str, u->length);
6647    Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6648
6649    Py_DECREF(u);
6650    Py_DECREF(v);
6651    return (PyObject *)w;
6652
6653onError:
6654    Py_XDECREF(u);
6655    Py_XDECREF(v);
6656    return NULL;
6657}
6658
6659void
6660PyUnicode_Append(PyObject **pleft, PyObject *right)
6661{
6662	PyObject *new;
6663	if (*pleft == NULL)
6664		return;
6665	if (right == NULL || !PyUnicode_Check(*pleft)) {
6666		Py_DECREF(*pleft);
6667		*pleft = NULL;
6668		return;
6669	}
6670	new = PyUnicode_Concat(*pleft, right);
6671	Py_DECREF(*pleft);
6672	*pleft = new;
6673}
6674
6675void
6676PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6677{
6678	PyUnicode_Append(pleft, right);
6679	Py_XDECREF(right);
6680}
6681
6682PyDoc_STRVAR(count__doc__,
6683"S.count(sub[, start[, end]]) -> int\n\
6684\n\
6685Return the number of non-overlapping occurrences of substring sub in\n\
6686string S[start:end].  Optional arguments start and end are\n\
6687interpreted as in slice notation.");
6688
6689static PyObject *
6690unicode_count(PyUnicodeObject *self, PyObject *args)
6691{
6692    PyUnicodeObject *substring;
6693    Py_ssize_t start = 0;
6694    Py_ssize_t end = PY_SSIZE_T_MAX;
6695    PyObject *result;
6696
6697    if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6698		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6699        return NULL;
6700
6701    substring = (PyUnicodeObject *)PyUnicode_FromObject(
6702        (PyObject *)substring);
6703    if (substring == NULL)
6704	return NULL;
6705
6706    FIX_START_END(self);
6707
6708    result = PyLong_FromSsize_t(
6709        stringlib_count(self->str + start, end - start,
6710                        substring->str, substring->length)
6711        );
6712
6713    Py_DECREF(substring);
6714
6715    return result;
6716}
6717
6718PyDoc_STRVAR(encode__doc__,
6719"S.encode([encoding[, errors]]) -> bytes\n\
6720\n\
6721Encode S using the codec registered for encoding. encoding defaults\n\
6722to the default encoding. errors may be given to set a different error\n\
6723handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6724a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6725'xmlcharrefreplace' as well as any other name registered with\n\
6726codecs.register_error that can handle UnicodeEncodeErrors.");
6727
6728static PyObject *
6729unicode_encode(PyUnicodeObject *self, PyObject *args)
6730{
6731    char *encoding = NULL;
6732    char *errors = NULL;
6733    PyObject *v;
6734
6735    if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6736        return NULL;
6737    v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
6738    if (v == NULL)
6739        goto onError;
6740    if (!PyBytes_Check(v)) {
6741        PyErr_Format(PyExc_TypeError,
6742                     "encoder did not return a bytes object "
6743                     "(type=%.400s)",
6744                     Py_TYPE(v)->tp_name);
6745        Py_DECREF(v);
6746        return NULL;
6747    }
6748    return v;
6749
6750 onError:
6751    return NULL;
6752}
6753
6754PyDoc_STRVAR(expandtabs__doc__,
6755"S.expandtabs([tabsize]) -> str\n\
6756\n\
6757Return a copy of S where all tab characters are expanded using spaces.\n\
6758If tabsize is not given, a tab size of 8 characters is assumed.");
6759
6760static PyObject*
6761unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6762{
6763    Py_UNICODE *e;
6764    Py_UNICODE *p;
6765    Py_UNICODE *q;
6766    Py_UNICODE *qe;
6767    Py_ssize_t i, j, incr;
6768    PyUnicodeObject *u;
6769    int tabsize = 8;
6770
6771    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6772	return NULL;
6773
6774    /* First pass: determine size of output string */
6775    i = 0; /* chars up to and including most recent \n or \r */
6776    j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6777    e = self->str + self->length; /* end of input */
6778    for (p = self->str; p < e; p++)
6779        if (*p == '\t') {
6780	    if (tabsize > 0) {
6781		incr = tabsize - (j % tabsize); /* cannot overflow */
6782		if (j > PY_SSIZE_T_MAX - incr)
6783		    goto overflow1;
6784		j += incr;
6785            }
6786	}
6787        else {
6788	    if (j > PY_SSIZE_T_MAX - 1)
6789		goto overflow1;
6790            j++;
6791            if (*p == '\n' || *p == '\r') {
6792		if (i > PY_SSIZE_T_MAX - j)
6793		    goto overflow1;
6794                i += j;
6795                j = 0;
6796            }
6797        }
6798
6799    if (i > PY_SSIZE_T_MAX - j)
6800	goto overflow1;
6801
6802    /* Second pass: create output string and fill it */
6803    u = _PyUnicode_New(i + j);
6804    if (!u)
6805        return NULL;
6806
6807    j = 0; /* same as in first pass */
6808    q = u->str; /* next output char */
6809    qe = u->str + u->length; /* end of output */
6810
6811    for (p = self->str; p < e; p++)
6812        if (*p == '\t') {
6813	    if (tabsize > 0) {
6814		i = tabsize - (j % tabsize);
6815		j += i;
6816		while (i--) {
6817		    if (q >= qe)
6818			goto overflow2;
6819		    *q++ = ' ';
6820                }
6821	    }
6822	}
6823	else {
6824	    if (q >= qe)
6825		goto overflow2;
6826	    *q++ = *p;
6827            j++;
6828            if (*p == '\n' || *p == '\r')
6829                j = 0;
6830        }
6831
6832    return (PyObject*) u;
6833
6834  overflow2:
6835    Py_DECREF(u);
6836  overflow1:
6837    PyErr_SetString(PyExc_OverflowError, "new string is too long");
6838    return NULL;
6839}
6840
6841PyDoc_STRVAR(find__doc__,
6842"S.find(sub[, start[, end]]) -> int\n\
6843\n\
6844Return the lowest index in S where substring sub is found,\n\
6845such that sub is contained within s[start:end].  Optional\n\
6846arguments start and end are interpreted as in slice notation.\n\
6847\n\
6848Return -1 on failure.");
6849
6850static PyObject *
6851unicode_find(PyUnicodeObject *self, PyObject *args)
6852{
6853    PyObject *substring;
6854    Py_ssize_t start;
6855    Py_ssize_t end;
6856    Py_ssize_t result;
6857
6858    if (!_ParseTupleFinds(args, &substring, &start, &end))
6859        return NULL;
6860
6861    result = stringlib_find_slice(
6862        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6863        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6864        start, end
6865        );
6866
6867    Py_DECREF(substring);
6868
6869    return PyLong_FromSsize_t(result);
6870}
6871
6872static PyObject *
6873unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
6874{
6875    if (index < 0 || index >= self->length) {
6876        PyErr_SetString(PyExc_IndexError, "string index out of range");
6877        return NULL;
6878    }
6879
6880    return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6881}
6882
6883/* Believe it or not, this produces the same value for ASCII strings
6884   as string_hash(). */
6885static long
6886unicode_hash(PyUnicodeObject *self)
6887{
6888    Py_ssize_t len;
6889    Py_UNICODE *p;
6890    long x;
6891
6892    if (self->hash != -1)
6893        return self->hash;
6894    len = Py_SIZE(self);
6895    p = self->str;
6896    x = *p << 7;
6897    while (--len >= 0)
6898        x = (1000003*x) ^ *p++;
6899    x ^= Py_SIZE(self);
6900    if (x == -1)
6901        x = -2;
6902    self->hash = x;
6903    return x;
6904}
6905
6906PyDoc_STRVAR(index__doc__,
6907"S.index(sub[, start[, end]]) -> int\n\
6908\n\
6909Like S.find() but raise ValueError when the substring is not found.");
6910
6911static PyObject *
6912unicode_index(PyUnicodeObject *self, PyObject *args)
6913{
6914    Py_ssize_t result;
6915    PyObject *substring;
6916    Py_ssize_t start;
6917    Py_ssize_t end;
6918
6919    if (!_ParseTupleFinds(args, &substring, &start, &end))
6920        return NULL;
6921
6922    result = stringlib_find_slice(
6923        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6924        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6925        start, end
6926        );
6927
6928    Py_DECREF(substring);
6929
6930    if (result < 0) {
6931        PyErr_SetString(PyExc_ValueError, "substring not found");
6932        return NULL;
6933    }
6934
6935    return PyLong_FromSsize_t(result);
6936}
6937
6938PyDoc_STRVAR(islower__doc__,
6939"S.islower() -> bool\n\
6940\n\
6941Return True if all cased characters in S are lowercase and there is\n\
6942at least one cased character in S, False otherwise.");
6943
6944static PyObject*
6945unicode_islower(PyUnicodeObject *self)
6946{
6947    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6948    register const Py_UNICODE *e;
6949    int cased;
6950
6951    /* Shortcut for single character strings */
6952    if (PyUnicode_GET_SIZE(self) == 1)
6953	return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
6954
6955    /* Special case for empty strings */
6956    if (PyUnicode_GET_SIZE(self) == 0)
6957	return PyBool_FromLong(0);
6958
6959    e = p + PyUnicode_GET_SIZE(self);
6960    cased = 0;
6961    for (; p < e; p++) {
6962	register const Py_UNICODE ch = *p;
6963
6964	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6965	    return PyBool_FromLong(0);
6966	else if (!cased && Py_UNICODE_ISLOWER(ch))
6967	    cased = 1;
6968    }
6969    return PyBool_FromLong(cased);
6970}
6971
6972PyDoc_STRVAR(isupper__doc__,
6973"S.isupper() -> bool\n\
6974\n\
6975Return True if all cased characters in S are uppercase and there is\n\
6976at least one cased character in S, False otherwise.");
6977
6978static PyObject*
6979unicode_isupper(PyUnicodeObject *self)
6980{
6981    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6982    register const Py_UNICODE *e;
6983    int cased;
6984
6985    /* Shortcut for single character strings */
6986    if (PyUnicode_GET_SIZE(self) == 1)
6987	return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
6988
6989    /* Special case for empty strings */
6990    if (PyUnicode_GET_SIZE(self) == 0)
6991	return PyBool_FromLong(0);
6992
6993    e = p + PyUnicode_GET_SIZE(self);
6994    cased = 0;
6995    for (; p < e; p++) {
6996	register const Py_UNICODE ch = *p;
6997
6998	if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6999	    return PyBool_FromLong(0);
7000	else if (!cased && Py_UNICODE_ISUPPER(ch))
7001	    cased = 1;
7002    }
7003    return PyBool_FromLong(cased);
7004}
7005
7006PyDoc_STRVAR(istitle__doc__,
7007"S.istitle() -> bool\n\
7008\n\
7009Return True if S is a titlecased string and there is at least one\n\
7010character in S, i.e. upper- and titlecase characters may only\n\
7011follow uncased characters and lowercase characters only cased ones.\n\
7012Return False otherwise.");
7013
7014static PyObject*
7015unicode_istitle(PyUnicodeObject *self)
7016{
7017    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7018    register const Py_UNICODE *e;
7019    int cased, previous_is_cased;
7020
7021    /* Shortcut for single character strings */
7022    if (PyUnicode_GET_SIZE(self) == 1)
7023	return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7024			       (Py_UNICODE_ISUPPER(*p) != 0));
7025
7026    /* Special case for empty strings */
7027    if (PyUnicode_GET_SIZE(self) == 0)
7028	return PyBool_FromLong(0);
7029
7030    e = p + PyUnicode_GET_SIZE(self);
7031    cased = 0;
7032    previous_is_cased = 0;
7033    for (; p < e; p++) {
7034	register const Py_UNICODE ch = *p;
7035
7036	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7037	    if (previous_is_cased)
7038		return PyBool_FromLong(0);
7039	    previous_is_cased = 1;
7040	    cased = 1;
7041	}
7042	else if (Py_UNICODE_ISLOWER(ch)) {
7043	    if (!previous_is_cased)
7044		return PyBool_FromLong(0);
7045	    previous_is_cased = 1;
7046	    cased = 1;
7047	}
7048	else
7049	    previous_is_cased = 0;
7050    }
7051    return PyBool_FromLong(cased);
7052}
7053
7054PyDoc_STRVAR(isspace__doc__,
7055"S.isspace() -> bool\n\
7056\n\
7057Return True if all characters in S are whitespace\n\
7058and there is at least one character in S, False otherwise.");
7059
7060static PyObject*
7061unicode_isspace(PyUnicodeObject *self)
7062{
7063    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7064    register const Py_UNICODE *e;
7065
7066    /* Shortcut for single character strings */
7067    if (PyUnicode_GET_SIZE(self) == 1 &&
7068	Py_UNICODE_ISSPACE(*p))
7069	return PyBool_FromLong(1);
7070
7071    /* Special case for empty strings */
7072    if (PyUnicode_GET_SIZE(self) == 0)
7073	return PyBool_FromLong(0);
7074
7075    e = p + PyUnicode_GET_SIZE(self);
7076    for (; p < e; p++) {
7077	if (!Py_UNICODE_ISSPACE(*p))
7078	    return PyBool_FromLong(0);
7079    }
7080    return PyBool_FromLong(1);
7081}
7082
7083PyDoc_STRVAR(isalpha__doc__,
7084"S.isalpha() -> bool\n\
7085\n\
7086Return True if all characters in S are alphabetic\n\
7087and there is at least one character in S, False otherwise.");
7088
7089static PyObject*
7090unicode_isalpha(PyUnicodeObject *self)
7091{
7092    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7093    register const Py_UNICODE *e;
7094
7095    /* Shortcut for single character strings */
7096    if (PyUnicode_GET_SIZE(self) == 1 &&
7097	Py_UNICODE_ISALPHA(*p))
7098	return PyBool_FromLong(1);
7099
7100    /* Special case for empty strings */
7101    if (PyUnicode_GET_SIZE(self) == 0)
7102	return PyBool_FromLong(0);
7103
7104    e = p + PyUnicode_GET_SIZE(self);
7105    for (; p < e; p++) {
7106	if (!Py_UNICODE_ISALPHA(*p))
7107	    return PyBool_FromLong(0);
7108    }
7109    return PyBool_FromLong(1);
7110}
7111
7112PyDoc_STRVAR(isalnum__doc__,
7113"S.isalnum() -> bool\n\
7114\n\
7115Return True if all characters in S are alphanumeric\n\
7116and there is at least one character in S, False otherwise.");
7117
7118static PyObject*
7119unicode_isalnum(PyUnicodeObject *self)
7120{
7121    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7122    register const Py_UNICODE *e;
7123
7124    /* Shortcut for single character strings */
7125    if (PyUnicode_GET_SIZE(self) == 1 &&
7126	Py_UNICODE_ISALNUM(*p))
7127	return PyBool_FromLong(1);
7128
7129    /* Special case for empty strings */
7130    if (PyUnicode_GET_SIZE(self) == 0)
7131	return PyBool_FromLong(0);
7132
7133    e = p + PyUnicode_GET_SIZE(self);
7134    for (; p < e; p++) {
7135	if (!Py_UNICODE_ISALNUM(*p))
7136	    return PyBool_FromLong(0);
7137    }
7138    return PyBool_FromLong(1);
7139}
7140
7141PyDoc_STRVAR(isdecimal__doc__,
7142"S.isdecimal() -> bool\n\
7143\n\
7144Return True if there are only decimal characters in S,\n\
7145False otherwise.");
7146
7147static PyObject*
7148unicode_isdecimal(PyUnicodeObject *self)
7149{
7150    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7151    register const Py_UNICODE *e;
7152
7153    /* Shortcut for single character strings */
7154    if (PyUnicode_GET_SIZE(self) == 1 &&
7155	Py_UNICODE_ISDECIMAL(*p))
7156	return PyBool_FromLong(1);
7157
7158    /* Special case for empty strings */
7159    if (PyUnicode_GET_SIZE(self) == 0)
7160	return PyBool_FromLong(0);
7161
7162    e = p + PyUnicode_GET_SIZE(self);
7163    for (; p < e; p++) {
7164	if (!Py_UNICODE_ISDECIMAL(*p))
7165	    return PyBool_FromLong(0);
7166    }
7167    return PyBool_FromLong(1);
7168}
7169
7170PyDoc_STRVAR(isdigit__doc__,
7171"S.isdigit() -> bool\n\
7172\n\
7173Return True if all characters in S are digits\n\
7174and there is at least one character in S, False otherwise.");
7175
7176static PyObject*
7177unicode_isdigit(PyUnicodeObject *self)
7178{
7179    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7180    register const Py_UNICODE *e;
7181
7182    /* Shortcut for single character strings */
7183    if (PyUnicode_GET_SIZE(self) == 1 &&
7184	Py_UNICODE_ISDIGIT(*p))
7185	return PyBool_FromLong(1);
7186
7187    /* Special case for empty strings */
7188    if (PyUnicode_GET_SIZE(self) == 0)
7189	return PyBool_FromLong(0);
7190
7191    e = p + PyUnicode_GET_SIZE(self);
7192    for (; p < e; p++) {
7193	if (!Py_UNICODE_ISDIGIT(*p))
7194	    return PyBool_FromLong(0);
7195    }
7196    return PyBool_FromLong(1);
7197}
7198
7199PyDoc_STRVAR(isnumeric__doc__,
7200"S.isnumeric() -> bool\n\
7201\n\
7202Return True if there are only numeric characters in S,\n\
7203False otherwise.");
7204
7205static PyObject*
7206unicode_isnumeric(PyUnicodeObject *self)
7207{
7208    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7209    register const Py_UNICODE *e;
7210
7211    /* Shortcut for single character strings */
7212    if (PyUnicode_GET_SIZE(self) == 1 &&
7213	Py_UNICODE_ISNUMERIC(*p))
7214	return PyBool_FromLong(1);
7215
7216    /* Special case for empty strings */
7217    if (PyUnicode_GET_SIZE(self) == 0)
7218	return PyBool_FromLong(0);
7219
7220    e = p + PyUnicode_GET_SIZE(self);
7221    for (; p < e; p++) {
7222	if (!Py_UNICODE_ISNUMERIC(*p))
7223	    return PyBool_FromLong(0);
7224    }
7225    return PyBool_FromLong(1);
7226}
7227
7228int
7229PyUnicode_IsIdentifier(PyObject *self)
7230{
7231    register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7232    register const Py_UNICODE *e;
7233
7234    /* Special case for empty strings */
7235    if (PyUnicode_GET_SIZE(self) == 0)
7236	return 0;
7237
7238    /* PEP 3131 says that the first character must be in
7239       XID_Start and subsequent characters in XID_Continue,
7240       and for the ASCII range, the 2.x rules apply (i.e
7241       start with letters and underscore, continue with
7242       letters, digits, underscore). However, given the current
7243       definition of XID_Start and XID_Continue, it is sufficient
7244       to check just for these, except that _ must be allowed
7245       as starting an identifier.  */
7246    if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7247        return 0;
7248
7249    e = p + PyUnicode_GET_SIZE(self);
7250    for (p++; p < e; p++) {
7251	if (!_PyUnicode_IsXidContinue(*p))
7252	    return 0;
7253    }
7254    return 1;
7255}
7256
7257PyDoc_STRVAR(isidentifier__doc__,
7258"S.isidentifier() -> bool\n\
7259\n\
7260Return True if S is a valid identifier according\n\
7261to the language definition.");
7262
7263static PyObject*
7264unicode_isidentifier(PyObject *self)
7265{
7266    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7267}
7268
7269PyDoc_STRVAR(isprintable__doc__,
7270"S.isprintable() -> bool\n\
7271\n\
7272Return True if all characters in S are considered\n\
7273printable in repr() or S is empty, False otherwise.");
7274
7275static PyObject*
7276unicode_isprintable(PyObject *self)
7277{
7278    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7279    register const Py_UNICODE *e;
7280
7281    /* Shortcut for single character strings */
7282    if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7283        Py_RETURN_TRUE;
7284    }
7285
7286    e = p + PyUnicode_GET_SIZE(self);
7287    for (; p < e; p++) {
7288        if (!Py_UNICODE_ISPRINTABLE(*p)) {
7289            Py_RETURN_FALSE;
7290        }
7291    }
7292    Py_RETURN_TRUE;
7293}
7294
7295PyDoc_STRVAR(join__doc__,
7296"S.join(sequence) -> str\n\
7297\n\
7298Return a string which is the concatenation of the strings in the\n\
7299sequence.  The separator between elements is S.");
7300
7301static PyObject*
7302unicode_join(PyObject *self, PyObject *data)
7303{
7304    return PyUnicode_Join(self, data);
7305}
7306
7307static Py_ssize_t
7308unicode_length(PyUnicodeObject *self)
7309{
7310    return self->length;
7311}
7312
7313PyDoc_STRVAR(ljust__doc__,
7314"S.ljust(width[, fillchar]) -> str\n\
7315\n\
7316Return S left-justified in a Unicode string of length width. Padding is\n\
7317done using the specified fill character (default is a space).");
7318
7319static PyObject *
7320unicode_ljust(PyUnicodeObject *self, PyObject *args)
7321{
7322    Py_ssize_t width;
7323    Py_UNICODE fillchar = ' ';
7324
7325    if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
7326        return NULL;
7327
7328    if (self->length >= width && PyUnicode_CheckExact(self)) {
7329        Py_INCREF(self);
7330        return (PyObject*) self;
7331    }
7332
7333    return (PyObject*) pad(self, 0, width - self->length, fillchar);
7334}
7335
7336PyDoc_STRVAR(lower__doc__,
7337"S.lower() -> str\n\
7338\n\
7339Return a copy of the string S converted to lowercase.");
7340
7341static PyObject*
7342unicode_lower(PyUnicodeObject *self)
7343{
7344    return fixup(self, fixlower);
7345}
7346
/* Strip modes.  The values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, one per strip mode */
static const char *stripformat[] = {
    "|O:lstrip",        /* LEFTSTRIP */
    "|O:rstrip",        /* RIGHTSTRIP */
    "|O:strip",         /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string with its
   three-character "|O:" prefix skipped. */
#define STRIPNAME(i) (&stripformat[i][3])
7355
7356/* externally visible for str.strip(unicode) */
7357PyObject *
7358_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7359{
7360	Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7361	Py_ssize_t len = PyUnicode_GET_SIZE(self);
7362	Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7363	Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7364	Py_ssize_t i, j;
7365
7366        BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7367
7368	i = 0;
7369	if (striptype != RIGHTSTRIP) {
7370            while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7371                i++;
7372            }
7373	}
7374
7375	j = len;
7376	if (striptype != LEFTSTRIP) {
7377            do {
7378                j--;
7379            } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7380            j++;
7381	}
7382
7383	if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7384            Py_INCREF(self);
7385            return (PyObject*)self;
7386	}
7387	else
7388            return PyUnicode_FromUnicode(s+i, j-i);
7389}
7390
7391
7392static PyObject *
7393do_strip(PyUnicodeObject *self, int striptype)
7394{
7395	Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7396	Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
7397
7398	i = 0;
7399	if (striptype != RIGHTSTRIP) {
7400		while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7401			i++;
7402		}
7403	}
7404
7405	j = len;
7406	if (striptype != LEFTSTRIP) {
7407		do {
7408			j--;
7409		} while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7410		j++;
7411	}
7412
7413	if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7414		Py_INCREF(self);
7415		return (PyObject*)self;
7416	}
7417	else
7418		return PyUnicode_FromUnicode(s+i, j-i);
7419}
7420
7421
7422static PyObject *
7423do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7424{
7425	PyObject *sep = NULL;
7426
7427	if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7428		return NULL;
7429
7430	if (sep != NULL && sep != Py_None) {
7431		if (PyUnicode_Check(sep))
7432			return _PyUnicode_XStrip(self, striptype, sep);
7433		else {
7434			PyErr_Format(PyExc_TypeError,
7435				     "%s arg must be None or str",
7436				     STRIPNAME(striptype));
7437			return NULL;
7438		}
7439	}
7440
7441	return do_strip(self, striptype);
7442}
7443
7444
7445PyDoc_STRVAR(strip__doc__,
7446"S.strip([chars]) -> str\n\
7447\n\
7448Return a copy of the string S with leading and trailing\n\
7449whitespace removed.\n\
7450If chars is given and not None, remove characters in chars instead.");
7451
7452static PyObject *
7453unicode_strip(PyUnicodeObject *self, PyObject *args)
7454{
7455	if (PyTuple_GET_SIZE(args) == 0)
7456		return do_strip(self, BOTHSTRIP); /* Common case */
7457	else
7458		return do_argstrip(self, BOTHSTRIP, args);
7459}
7460
7461
7462PyDoc_STRVAR(lstrip__doc__,
7463"S.lstrip([chars]) -> str\n\
7464\n\
7465Return a copy of the string S with leading whitespace removed.\n\
7466If chars is given and not None, remove characters in chars instead.");
7467
7468static PyObject *
7469unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7470{
7471	if (PyTuple_GET_SIZE(args) == 0)
7472		return do_strip(self, LEFTSTRIP); /* Common case */
7473	else
7474		return do_argstrip(self, LEFTSTRIP, args);
7475}
7476
7477
7478PyDoc_STRVAR(rstrip__doc__,
7479"S.rstrip([chars]) -> str\n\
7480\n\
7481Return a copy of the string S with trailing whitespace removed.\n\
7482If chars is given and not None, remove characters in chars instead.");
7483
7484static PyObject *
7485unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7486{
7487	if (PyTuple_GET_SIZE(args) == 0)
7488		return do_strip(self, RIGHTSTRIP); /* Common case */
7489	else
7490		return do_argstrip(self, RIGHTSTRIP, args);
7491}
7492
7493
7494static PyObject*
7495unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
7496{
7497    PyUnicodeObject *u;
7498    Py_UNICODE *p;
7499    Py_ssize_t nchars;
7500    size_t nbytes;
7501
7502    if (len < 0)
7503        len = 0;
7504
7505    if (len == 1 && PyUnicode_CheckExact(str)) {
7506        /* no repeat, return original string */
7507        Py_INCREF(str);
7508        return (PyObject*) str;
7509    }
7510
7511    /* ensure # of chars needed doesn't overflow int and # of bytes
7512     * needed doesn't overflow size_t
7513     */
7514    nchars = len * str->length;
7515    if (len && nchars / len != str->length) {
7516        PyErr_SetString(PyExc_OverflowError,
7517                        "repeated string is too long");
7518        return NULL;
7519    }
7520    nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7521    if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7522        PyErr_SetString(PyExc_OverflowError,
7523                        "repeated string is too long");
7524        return NULL;
7525    }
7526    u = _PyUnicode_New(nchars);
7527    if (!u)
7528        return NULL;
7529
7530    p = u->str;
7531
7532    if (str->length == 1 && len > 0) {
7533        Py_UNICODE_FILL(p, str->str[0], len);
7534    } else {
7535	Py_ssize_t done = 0; /* number of characters copied this far */
7536	if (done < nchars) {
7537            Py_UNICODE_COPY(p, str->str, str->length);
7538            done = str->length;
7539	}
7540	while (done < nchars) {
7541            Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
7542            Py_UNICODE_COPY(p+done, p, n);
7543            done += n;
7544	}
7545    }
7546
7547    return (PyObject*) u;
7548}
7549
7550PyObject *PyUnicode_Replace(PyObject *obj,
7551			    PyObject *subobj,
7552			    PyObject *replobj,
7553			    Py_ssize_t maxcount)
7554{
7555    PyObject *self;
7556    PyObject *str1;
7557    PyObject *str2;
7558    PyObject *result;
7559
7560    self = PyUnicode_FromObject(obj);
7561    if (self == NULL)
7562	return NULL;
7563    str1 = PyUnicode_FromObject(subobj);
7564    if (str1 == NULL) {
7565	Py_DECREF(self);
7566	return NULL;
7567    }
7568    str2 = PyUnicode_FromObject(replobj);
7569    if (str2 == NULL) {
7570	Py_DECREF(self);
7571	Py_DECREF(str1);
7572	return NULL;
7573    }
7574    result = replace((PyUnicodeObject *)self,
7575		     (PyUnicodeObject *)str1,
7576		     (PyUnicodeObject *)str2,
7577		     maxcount);
7578    Py_DECREF(self);
7579    Py_DECREF(str1);
7580    Py_DECREF(str2);
7581    return result;
7582}
7583
7584PyDoc_STRVAR(replace__doc__,
7585"S.replace (old, new[, count]) -> str\n\
7586\n\
7587Return a copy of S with all occurrences of substring\n\
7588old replaced by new.  If the optional argument count is\n\
7589given, only the first count occurrences are replaced.");
7590
7591static PyObject*
7592unicode_replace(PyUnicodeObject *self, PyObject *args)
7593{
7594    PyUnicodeObject *str1;
7595    PyUnicodeObject *str2;
7596    Py_ssize_t maxcount = -1;
7597    PyObject *result;
7598
7599    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
7600        return NULL;
7601    str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7602    if (str1 == NULL)
7603	return NULL;
7604    str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
7605    if (str2 == NULL) {
7606	Py_DECREF(str1);
7607	return NULL;
7608    }
7609
7610    result = replace(self, str1, str2, maxcount);
7611
7612    Py_DECREF(str1);
7613    Py_DECREF(str2);
7614    return result;
7615}
7616
7617static
7618PyObject *unicode_repr(PyObject *unicode)
7619{
7620    PyObject *repr;
7621    Py_UNICODE *p;
7622    Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7623    Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7624
7625    /* XXX(nnorwitz): rather than over-allocating, it would be
7626       better to choose a different scheme.  Perhaps scan the
7627       first N-chars of the string and allocate based on that size.
7628    */
7629    /* Initial allocation is based on the longest-possible unichr
7630       escape.
7631
7632       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7633       unichr, so in this case it's the longest unichr escape. In
7634       narrow (UTF-16) builds this is five chars per source unichr
7635       since there are two unichrs in the surrogate pair, so in narrow
7636       (UTF-16) builds it's not the longest unichr escape.
7637
7638       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7639       so in the narrow (UTF-16) build case it's the longest unichr
7640       escape.
7641    */
7642
7643    repr = PyUnicode_FromUnicode(NULL,
7644        2 /* quotes */
7645#ifdef Py_UNICODE_WIDE
7646        + 10*size
7647#else
7648        + 6*size
7649#endif
7650        + 1);
7651    if (repr == NULL)
7652        return NULL;
7653
7654    p = PyUnicode_AS_UNICODE(repr);
7655
7656    /* Add quote */
7657    *p++ = (findchar(s, size, '\'') &&
7658            !findchar(s, size, '"')) ? '"' : '\'';
7659    while (size-- > 0) {
7660        Py_UNICODE ch = *s++;
7661
7662        /* Escape quotes and backslashes */
7663        if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
7664            *p++ = '\\';
7665            *p++ = ch;
7666            continue;
7667        }
7668
7669	/* Map special whitespace to '\t', \n', '\r' */
7670        if (ch == '\t') {
7671            *p++ = '\\';
7672            *p++ = 't';
7673        }
7674        else if (ch == '\n') {
7675            *p++ = '\\';
7676            *p++ = 'n';
7677        }
7678        else if (ch == '\r') {
7679            *p++ = '\\';
7680            *p++ = 'r';
7681        }
7682
7683        /* Map non-printable US ASCII to '\xhh' */
7684        else if (ch < ' ' || ch == 0x7F) {
7685            *p++ = '\\';
7686            *p++ = 'x';
7687            *p++ = hexdigits[(ch >> 4) & 0x000F];
7688            *p++ = hexdigits[ch & 0x000F];
7689        }
7690
7691        /* Copy ASCII characters as-is */
7692        else if (ch < 0x7F) {
7693            *p++ = ch;
7694        }
7695
7696	/* Non-ASCII characters */
7697        else {
7698            Py_UCS4 ucs = ch;
7699
7700#ifndef Py_UNICODE_WIDE
7701            Py_UNICODE ch2 = 0;
7702            /* Get code point from surrogate pair */
7703            if (size > 0) {
7704                ch2 = *s;
7705                if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
7706                            && ch2 <= 0xDFFF) {
7707                    ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
7708                            + 0x00010000;
7709                    s++;
7710                    size--;
7711                }
7712            }
7713#endif
7714            /* Map Unicode whitespace and control characters
7715               (categories Z* and C* except ASCII space)
7716            */
7717            if (!Py_UNICODE_ISPRINTABLE(ucs)) {
7718                /* Map 8-bit characters to '\xhh' */
7719                if (ucs <= 0xff) {
7720                    *p++ = '\\';
7721                    *p++ = 'x';
7722                    *p++ = hexdigits[(ch >> 4) & 0x000F];
7723                    *p++ = hexdigits[ch & 0x000F];
7724                }
7725                /* Map 21-bit characters to '\U00xxxxxx' */
7726                else if (ucs >= 0x10000) {
7727                    *p++ = '\\';
7728                    *p++ = 'U';
7729                    *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7730                    *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7731                    *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7732                    *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7733                    *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7734                    *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7735                    *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7736                    *p++ = hexdigits[ucs & 0x0000000F];
7737                }
7738                /* Map 16-bit characters to '\uxxxx' */
7739                else {
7740                    *p++ = '\\';
7741                    *p++ = 'u';
7742                    *p++ = hexdigits[(ucs >> 12) & 0x000F];
7743                    *p++ = hexdigits[(ucs >> 8) & 0x000F];
7744                    *p++ = hexdigits[(ucs >> 4) & 0x000F];
7745                    *p++ = hexdigits[ucs & 0x000F];
7746                }
7747            }
7748            /* Copy characters as-is */
7749            else {
7750                *p++ = ch;
7751#ifndef Py_UNICODE_WIDE
7752                if (ucs >= 0x10000)
7753                    *p++ = ch2;
7754#endif
7755            }
7756        }
7757    }
7758    /* Add quote */
7759    *p++ = PyUnicode_AS_UNICODE(repr)[0];
7760
7761    *p = '\0';
7762    _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
7763    return repr;
7764}
7765
7766PyDoc_STRVAR(rfind__doc__,
7767"S.rfind(sub[, start[, end]]) -> int\n\
7768\n\
7769Return the highest index in S where substring sub is found,\n\
7770such that sub is contained within s[start:end].  Optional\n\
7771arguments start and end are interpreted as in slice notation.\n\
7772\n\
7773Return -1 on failure.");
7774
7775static PyObject *
7776unicode_rfind(PyUnicodeObject *self, PyObject *args)
7777{
7778    PyObject *substring;
7779    Py_ssize_t start;
7780    Py_ssize_t end;
7781    Py_ssize_t result;
7782
7783    if (!_ParseTupleFinds(args, &substring, &start, &end))
7784	    return NULL;
7785
7786    result = stringlib_rfind_slice(
7787        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7788        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7789        start, end
7790        );
7791
7792    Py_DECREF(substring);
7793
7794    return PyLong_FromSsize_t(result);
7795}
7796
7797PyDoc_STRVAR(rindex__doc__,
7798"S.rindex(sub[, start[, end]]) -> int\n\
7799\n\
7800Like S.rfind() but raise ValueError when the substring is not found.");
7801
7802static PyObject *
7803unicode_rindex(PyUnicodeObject *self, PyObject *args)
7804{
7805    PyObject *substring;
7806    Py_ssize_t start;
7807    Py_ssize_t end;
7808    Py_ssize_t result;
7809
7810    if (!_ParseTupleFinds(args, &substring, &start, &end))
7811	    return NULL;
7812
7813    result = stringlib_rfind_slice(
7814        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7815        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7816        start, end
7817        );
7818
7819    Py_DECREF(substring);
7820
7821    if (result < 0) {
7822        PyErr_SetString(PyExc_ValueError, "substring not found");
7823        return NULL;
7824    }
7825    return PyLong_FromSsize_t(result);
7826}
7827
7828PyDoc_STRVAR(rjust__doc__,
7829"S.rjust(width[, fillchar]) -> str\n\
7830\n\
7831Return S right-justified in a string of length width. Padding is\n\
7832done using the specified fill character (default is a space).");
7833
7834static PyObject *
7835unicode_rjust(PyUnicodeObject *self, PyObject *args)
7836{
7837    Py_ssize_t width;
7838    Py_UNICODE fillchar = ' ';
7839
7840    if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
7841        return NULL;
7842
7843    if (self->length >= width && PyUnicode_CheckExact(self)) {
7844        Py_INCREF(self);
7845        return (PyObject*) self;
7846    }
7847
7848    return (PyObject*) pad(self, width - self->length, 0, fillchar);
7849}
7850
7851PyObject *PyUnicode_Split(PyObject *s,
7852			  PyObject *sep,
7853			  Py_ssize_t maxsplit)
7854{
7855    PyObject *result;
7856
7857    s = PyUnicode_FromObject(s);
7858    if (s == NULL)
7859	return NULL;
7860    if (sep != NULL) {
7861	sep = PyUnicode_FromObject(sep);
7862	if (sep == NULL) {
7863	    Py_DECREF(s);
7864	    return NULL;
7865	}
7866    }
7867
7868    result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7869
7870    Py_DECREF(s);
7871    Py_XDECREF(sep);
7872    return result;
7873}
7874
7875PyDoc_STRVAR(split__doc__,
7876"S.split([sep[, maxsplit]]) -> list of strings\n\
7877\n\
7878Return a list of the words in S, using sep as the\n\
7879delimiter string.  If maxsplit is given, at most maxsplit\n\
7880splits are done. If sep is not specified or is None, any\n\
7881whitespace string is a separator and empty strings are\n\
7882removed from the result.");
7883
7884static PyObject*
7885unicode_split(PyUnicodeObject *self, PyObject *args)
7886{
7887    PyObject *substring = Py_None;
7888    Py_ssize_t maxcount = -1;
7889
7890    if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
7891        return NULL;
7892
7893    if (substring == Py_None)
7894	return split(self, NULL, maxcount);
7895    else if (PyUnicode_Check(substring))
7896	return split(self, (PyUnicodeObject *)substring, maxcount);
7897    else
7898	return PyUnicode_Split((PyObject *)self, substring, maxcount);
7899}
7900
7901PyObject *
7902PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7903{
7904    PyObject* str_obj;
7905    PyObject* sep_obj;
7906    PyObject* out;
7907
7908    str_obj = PyUnicode_FromObject(str_in);
7909    if (!str_obj)
7910	return NULL;
7911    sep_obj = PyUnicode_FromObject(sep_in);
7912    if (!sep_obj) {
7913        Py_DECREF(str_obj);
7914        return NULL;
7915    }
7916
7917    out = stringlib_partition(
7918        str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7919        sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7920        );
7921
7922    Py_DECREF(sep_obj);
7923    Py_DECREF(str_obj);
7924
7925    return out;
7926}
7927
7928
7929PyObject *
7930PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7931{
7932    PyObject* str_obj;
7933    PyObject* sep_obj;
7934    PyObject* out;
7935
7936    str_obj = PyUnicode_FromObject(str_in);
7937    if (!str_obj)
7938	return NULL;
7939    sep_obj = PyUnicode_FromObject(sep_in);
7940    if (!sep_obj) {
7941        Py_DECREF(str_obj);
7942        return NULL;
7943    }
7944
7945    out = stringlib_rpartition(
7946        str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7947        sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7948        );
7949
7950    Py_DECREF(sep_obj);
7951    Py_DECREF(str_obj);
7952
7953    return out;
7954}
7955
7956PyDoc_STRVAR(partition__doc__,
7957"S.partition(sep) -> (head, sep, tail)\n\
7958\n\
7959Search for the separator sep in S, and return the part before it,\n\
7960the separator itself, and the part after it.  If the separator is not\n\
7961found, return S and two empty strings.");
7962
7963static PyObject*
7964unicode_partition(PyUnicodeObject *self, PyObject *separator)
7965{
7966    return PyUnicode_Partition((PyObject *)self, separator);
7967}
7968
7969PyDoc_STRVAR(rpartition__doc__,
7970"S.rpartition(sep) -> (tail, sep, head)\n\
7971\n\
7972Search for the separator sep in S, starting at the end of S, and return\n\
7973the part before it, the separator itself, and the part after it.  If the\n\
7974separator is not found, return two empty strings and S.");
7975
7976static PyObject*
7977unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7978{
7979    return PyUnicode_RPartition((PyObject *)self, separator);
7980}
7981
7982PyObject *PyUnicode_RSplit(PyObject *s,
7983			   PyObject *sep,
7984			   Py_ssize_t maxsplit)
7985{
7986    PyObject *result;
7987
7988    s = PyUnicode_FromObject(s);
7989    if (s == NULL)
7990	return NULL;
7991    if (sep != NULL) {
7992	sep = PyUnicode_FromObject(sep);
7993	if (sep == NULL) {
7994	    Py_DECREF(s);
7995	    return NULL;
7996	}
7997    }
7998
7999    result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8000
8001    Py_DECREF(s);
8002    Py_XDECREF(sep);
8003    return result;
8004}
8005
8006PyDoc_STRVAR(rsplit__doc__,
8007"S.rsplit([sep[, maxsplit]]) -> list of strings\n\
8008\n\
8009Return a list of the words in S, using sep as the\n\
8010delimiter string, starting at the end of the string and\n\
8011working to the front.  If maxsplit is given, at most maxsplit\n\
8012splits are done. If sep is not specified, any whitespace string\n\
8013is a separator.");
8014
8015static PyObject*
8016unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8017{
8018    PyObject *substring = Py_None;
8019    Py_ssize_t maxcount = -1;
8020
8021    if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
8022        return NULL;
8023
8024    if (substring == Py_None)
8025	return rsplit(self, NULL, maxcount);
8026    else if (PyUnicode_Check(substring))
8027	return rsplit(self, (PyUnicodeObject *)substring, maxcount);
8028    else
8029	return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
8030}
8031
8032PyDoc_STRVAR(splitlines__doc__,
8033"S.splitlines([keepends]]) -> list of strings\n\
8034\n\
8035Return a list of the lines in S, breaking at line boundaries.\n\
8036Line breaks are not included in the resulting list unless keepends\n\
8037is given and true.");
8038
8039static PyObject*
8040unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8041{
8042    int keepends = 0;
8043
8044    if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
8045        return NULL;
8046
8047    return PyUnicode_Splitlines((PyObject *)self, keepends);
8048}
8049
8050static
8051PyObject *unicode_str(PyObject *self)
8052{
8053    if (PyUnicode_CheckExact(self)) {
8054        Py_INCREF(self);
8055        return self;
8056    } else
8057        /* Subtype -- return genuine unicode string with the same value. */
8058        return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8059                                     PyUnicode_GET_SIZE(self));
8060}
8061
8062PyDoc_STRVAR(swapcase__doc__,
8063"S.swapcase() -> str\n\
8064\n\
8065Return a copy of S with uppercase characters converted to lowercase\n\
8066and vice versa.");
8067
8068static PyObject*
8069unicode_swapcase(PyUnicodeObject *self)
8070{
8071    return fixup(self, fixswapcase);
8072}
8073
8074PyDoc_STRVAR(maketrans__doc__,
8075"str.maketrans(x[, y[, z]]) -> dict (static method)\n\
8076\n\
8077Return a translation table usable for str.translate().\n\
8078If there is only one argument, it must be a dictionary mapping Unicode\n\
8079ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
8080Character keys will be then converted to ordinals.\n\
8081If there are two arguments, they must be strings of equal length, and\n\
8082in the resulting dictionary, each character in x will be mapped to the\n\
8083character at the same position in y. If there is a third argument, it\n\
8084must be a string, whose characters will be mapped to None in the result.");
8085
8086static PyObject*
8087unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8088{
8089    PyObject *x, *y = NULL, *z = NULL;
8090    PyObject *new = NULL, *key, *value;
8091    Py_ssize_t i = 0;
8092    int res;
8093
8094    if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8095        return NULL;
8096    new = PyDict_New();
8097    if (!new)
8098        return NULL;
8099    if (y != NULL) {
8100        /* x must be a string too, of equal length */
8101        Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8102        if (!PyUnicode_Check(x)) {
8103            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8104                            "be a string if there is a second argument");
8105            goto err;
8106        }
8107        if (PyUnicode_GET_SIZE(x) != ylen) {
8108            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8109                            "arguments must have equal length");
8110            goto err;
8111        }
8112        /* create entries for translating chars in x to those in y */
8113        for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
8114            key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8115            value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
8116            if (!key || !value)
8117                goto err;
8118            res = PyDict_SetItem(new, key, value);
8119            Py_DECREF(key);
8120            Py_DECREF(value);
8121            if (res < 0)
8122                goto err;
8123        }
8124        /* create entries for deleting chars in z */
8125        if (z != NULL) {
8126            for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
8127                key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
8128                if (!key)
8129                    goto err;
8130                res = PyDict_SetItem(new, key, Py_None);
8131                Py_DECREF(key);
8132                if (res < 0)
8133                    goto err;
8134            }
8135        }
8136    } else {
8137        /* x must be a dict */
8138        if (!PyDict_Check(x)) {
8139            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8140                            "to maketrans it must be a dict");
8141            goto err;
8142        }
8143        /* copy entries into the new dict, converting string keys to int keys */
8144        while (PyDict_Next(x, &i, &key, &value)) {
8145            if (PyUnicode_Check(key)) {
8146                /* convert string keys to integer keys */
8147                PyObject *newkey;
8148                if (PyUnicode_GET_SIZE(key) != 1) {
8149                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
8150                                    "table must be of length 1");
8151                    goto err;
8152                }
8153                newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
8154                if (!newkey)
8155                    goto err;
8156                res = PyDict_SetItem(new, newkey, value);
8157                Py_DECREF(newkey);
8158                if (res < 0)
8159                    goto err;
8160            } else if (PyLong_Check(key)) {
8161                /* just keep integer keys */
8162                if (PyDict_SetItem(new, key, value) < 0)
8163                    goto err;
8164            } else {
8165                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8166                                "be strings or integers");
8167                goto err;
8168            }
8169        }
8170    }
8171    return new;
8172  err:
8173    Py_DECREF(new);
8174    return NULL;
8175}
8176
8177PyDoc_STRVAR(translate__doc__,
8178"S.translate(table) -> str\n\
8179\n\
8180Return a copy of the string S, where all characters have been mapped\n\
8181through the given translation table, which must be a mapping of\n\
8182Unicode ordinals to Unicode ordinals, strings, or None.\n\
8183Unmapped characters are left untouched. Characters mapped to None\n\
8184are deleted.");
8185
8186static PyObject*
8187unicode_translate(PyUnicodeObject *self, PyObject *table)
8188{
8189    return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
8190}
8191
8192PyDoc_STRVAR(upper__doc__,
8193"S.upper() -> str\n\
8194\n\
8195Return a copy of S converted to uppercase.");
8196
8197static PyObject*
8198unicode_upper(PyUnicodeObject *self)
8199{
8200    return fixup(self, fixupper);
8201}
8202
8203PyDoc_STRVAR(zfill__doc__,
8204"S.zfill(width) -> str\n\
8205\n\
8206Pad a numeric string S with zeros on the left, to fill a field\n\
8207of the specified width. The string S is never truncated.");
8208
8209static PyObject *
8210unicode_zfill(PyUnicodeObject *self, PyObject *args)
8211{
8212    Py_ssize_t fill;
8213    PyUnicodeObject *u;
8214
8215    Py_ssize_t width;
8216    if (!PyArg_ParseTuple(args, "n:zfill", &width))
8217        return NULL;
8218
8219    if (self->length >= width) {
8220        if (PyUnicode_CheckExact(self)) {
8221            Py_INCREF(self);
8222            return (PyObject*) self;
8223        }
8224        else
8225            return PyUnicode_FromUnicode(
8226                PyUnicode_AS_UNICODE(self),
8227                PyUnicode_GET_SIZE(self)
8228            );
8229    }
8230
8231    fill = width - self->length;
8232
8233    u = pad(self, fill, 0, '0');
8234
8235    if (u == NULL)
8236        return NULL;
8237
8238    if (u->str[fill] == '+' || u->str[fill] == '-') {
8239        /* move sign to beginning of string */
8240        u->str[0] = u->str[fill];
8241        u->str[fill] = '0';
8242    }
8243
8244    return (PyObject*) u;
8245}
8246
#if 0
/* Compiled out: would report the value of numfree as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *count = PyLong_FromLong(numfree);

    return count;
}
#endif
8254
8255PyDoc_STRVAR(startswith__doc__,
8256"S.startswith(prefix[, start[, end]]) -> bool\n\
8257\n\
8258Return True if S starts with the specified prefix, False otherwise.\n\
8259With optional start, test S beginning at that position.\n\
8260With optional end, stop comparing S at that position.\n\
8261prefix can also be a tuple of strings to try.");
8262
8263static PyObject *
8264unicode_startswith(PyUnicodeObject *self,
8265		   PyObject *args)
8266{
8267    PyObject *subobj;
8268    PyUnicodeObject *substring;
8269    Py_ssize_t start = 0;
8270    Py_ssize_t end = PY_SSIZE_T_MAX;
8271    int result;
8272
8273    if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
8274		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8275	return NULL;
8276    if (PyTuple_Check(subobj)) {
8277        Py_ssize_t i;
8278        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8279            substring = (PyUnicodeObject *)PyUnicode_FromObject(
8280                            PyTuple_GET_ITEM(subobj, i));
8281            if (substring == NULL)
8282                return NULL;
8283            result = tailmatch(self, substring, start, end, -1);
8284            Py_DECREF(substring);
8285            if (result) {
8286                Py_RETURN_TRUE;
8287            }
8288        }
8289        /* nothing matched */
8290        Py_RETURN_FALSE;
8291    }
8292    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
8293    if (substring == NULL)
8294         return NULL;
8295    result = tailmatch(self, substring, start, end, -1);
8296    Py_DECREF(substring);
8297    return PyBool_FromLong(result);
8298}
8299
8300
8301PyDoc_STRVAR(endswith__doc__,
8302"S.endswith(suffix[, start[, end]]) -> bool\n\
8303\n\
8304Return True if S ends with the specified suffix, False otherwise.\n\
8305With optional start, test S beginning at that position.\n\
8306With optional end, stop comparing S at that position.\n\
8307suffix can also be a tuple of strings to try.");
8308
8309static PyObject *
8310unicode_endswith(PyUnicodeObject *self,
8311		 PyObject *args)
8312{
8313    PyObject *subobj;
8314    PyUnicodeObject *substring;
8315    Py_ssize_t start = 0;
8316    Py_ssize_t end = PY_SSIZE_T_MAX;
8317    int result;
8318
8319    if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
8320        _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8321	return NULL;
8322    if (PyTuple_Check(subobj)) {
8323        Py_ssize_t i;
8324        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8325            substring = (PyUnicodeObject *)PyUnicode_FromObject(
8326                            PyTuple_GET_ITEM(subobj, i));
8327            if (substring == NULL)
8328            return NULL;
8329            result = tailmatch(self, substring, start, end, +1);
8330            Py_DECREF(substring);
8331            if (result) {
8332                Py_RETURN_TRUE;
8333            }
8334        }
8335        Py_RETURN_FALSE;
8336    }
8337    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
8338    if (substring == NULL)
8339    return NULL;
8340
8341    result = tailmatch(self, substring, start, end, +1);
8342    Py_DECREF(substring);
8343    return PyBool_FromLong(result);
8344}
8345
8346#include "stringlib/string_format.h"
8347
8348PyDoc_STRVAR(format__doc__,
8349"S.format(*args, **kwargs) -> str\n\
8350\n\
8351");
8352
8353static PyObject *
8354unicode__format__(PyObject* self, PyObject* args)
8355{
8356    PyObject *format_spec;
8357
8358    if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8359        return NULL;
8360
8361    return _PyUnicode_FormatAdvanced(self,
8362                                     PyUnicode_AS_UNICODE(format_spec),
8363                                     PyUnicode_GET_SIZE(format_spec));
8364}
8365
8366PyDoc_STRVAR(p_format__doc__,
8367"S.__format__(format_spec) -> str\n\
8368\n\
8369");
8370
8371static PyObject *
8372unicode__sizeof__(PyUnicodeObject *v)
8373{
8374    return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8375                              sizeof(Py_UNICODE) * (v->length + 1));
8376}
8377
8378PyDoc_STRVAR(sizeof__doc__,
8379"S.__sizeof__() -> size of S in memory, in bytes");
8380
8381static PyObject *
8382unicode_getnewargs(PyUnicodeObject *v)
8383{
8384	return Py_BuildValue("(u#)", v->str, v->length);
8385}
8386
8387
8388static PyMethodDef unicode_methods[] = {
8389
8390    /* Order is according to common usage: often used methods should
8391       appear first, since lookup is done sequentially. */
8392
8393    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
8394    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8395    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
8396    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
8397    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8398    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8399    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8400    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8401    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8402    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8403    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
8404    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
8405    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8406    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8407    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
8408    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
8409    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8410    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8411    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
8412    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
8413    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
8414    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
8415    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
8416    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8417    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8418    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8419    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8420    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8421    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8422    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8423    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8424    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8425    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8426    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8427    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8428    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8429    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
8430    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
8431    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
8432    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
8433    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
8434    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
8435    {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8436    {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
8437    {"maketrans", (PyCFunction) unicode_maketrans,
8438     METH_VARARGS | METH_STATIC, maketrans__doc__},
8439    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
8440#if 0
8441    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
8442#endif
8443
8444#if 0
8445    /* This one is just used for debugging the implementation. */
8446    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
8447#endif
8448
8449    {"__getnewargs__",	(PyCFunction)unicode_getnewargs, METH_NOARGS},
8450    {NULL, NULL}
8451};
8452
8453static PyObject *
8454unicode_mod(PyObject *v, PyObject *w)
8455{
8456       if (!PyUnicode_Check(v)) {
8457               Py_INCREF(Py_NotImplemented);
8458               return Py_NotImplemented;
8459       }
8460       return PyUnicode_Format(v, w);
8461}
8462
8463static PyNumberMethods unicode_as_number = {
8464	0,				/*nb_add*/
8465	0,				/*nb_subtract*/
8466	0,				/*nb_multiply*/
8467	unicode_mod,			/*nb_remainder*/
8468};
8469
8470static PySequenceMethods unicode_as_sequence = {
8471    (lenfunc) unicode_length, 		/* sq_length */
8472    PyUnicode_Concat,		 	/* sq_concat */
8473    (ssizeargfunc) unicode_repeat, 	/* sq_repeat */
8474    (ssizeargfunc) unicode_getitem, 	/* sq_item */
8475    0,				 	/* sq_slice */
8476    0, 					/* sq_ass_item */
8477    0, 					/* sq_ass_slice */
8478    PyUnicode_Contains, 		/* sq_contains */
8479};
8480
8481static PyObject*
8482unicode_subscript(PyUnicodeObject* self, PyObject* item)
8483{
8484    if (PyIndex_Check(item)) {
8485        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
8486        if (i == -1 && PyErr_Occurred())
8487            return NULL;
8488        if (i < 0)
8489            i += PyUnicode_GET_SIZE(self);
8490        return unicode_getitem(self, i);
8491    } else if (PySlice_Check(item)) {
8492        Py_ssize_t start, stop, step, slicelength, cur, i;
8493        Py_UNICODE* source_buf;
8494        Py_UNICODE* result_buf;
8495        PyObject* result;
8496
8497        if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
8498				 &start, &stop, &step, &slicelength) < 0) {
8499            return NULL;
8500        }
8501
8502        if (slicelength <= 0) {
8503            return PyUnicode_FromUnicode(NULL, 0);
8504        } else if (start == 0 && step == 1 && slicelength == self->length &&
8505                   PyUnicode_CheckExact(self)) {
8506            Py_INCREF(self);
8507            return (PyObject *)self;
8508        } else if (step == 1) {
8509            return PyUnicode_FromUnicode(self->str + start, slicelength);
8510        } else {
8511            source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
8512            result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8513                                                       sizeof(Py_UNICODE));
8514
8515	    if (result_buf == NULL)
8516		    return PyErr_NoMemory();
8517
8518            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8519                result_buf[i] = source_buf[cur];
8520            }
8521
8522            result = PyUnicode_FromUnicode(result_buf, slicelength);
8523            PyObject_FREE(result_buf);
8524            return result;
8525        }
8526    } else {
8527        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8528        return NULL;
8529    }
8530}
8531
8532static PyMappingMethods unicode_as_mapping = {
8533    (lenfunc)unicode_length,		/* mp_length */
8534    (binaryfunc)unicode_subscript,	/* mp_subscript */
8535    (objobjargproc)0,			/* mp_ass_subscript */
8536};
8537
8538
8539/* Helpers for PyUnicode_Format() */
8540
8541static PyObject *
8542getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
8543{
8544    Py_ssize_t argidx = *p_argidx;
8545    if (argidx < arglen) {
8546	(*p_argidx)++;
8547	if (arglen < 0)
8548	    return args;
8549	else
8550	    return PyTuple_GetItem(args, argidx);
8551    }
8552    PyErr_SetString(PyExc_TypeError,
8553		    "not enough arguments for format string");
8554    return NULL;
8555}
8556
8557static Py_ssize_t
8558strtounicode(Py_UNICODE *buffer, const char *charbuffer)
8559{
8560    register Py_ssize_t i;
8561    Py_ssize_t len = strlen(charbuffer);
8562    for (i = len - 1; i >= 0; i--)
8563	buffer[i] = (Py_UNICODE) charbuffer[i];
8564
8565    return len;
8566}
8567
8568static int
8569doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8570{
8571    Py_ssize_t result;
8572
8573    PyOS_ascii_formatd((char *)buffer, len, format, x);
8574    result = strtounicode(buffer, (char *)buffer);
8575    return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8576}
8577
#if 0
/* Disabled: long counterpart of doubletounicode().  Formats x with
   PyOS_snprintf() as chars into the start of buffer, then widens the
   text in place and returns its length. */
static int
longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
{
    char *ascii = (char *)buffer;
    Py_ssize_t nchars;

    PyOS_snprintf(ascii, len, format, x);
    nchars = strtounicode(buffer, ascii);
    return Py_SAFE_DOWNCAST(nchars, Py_ssize_t, int);
}
#endif
8589
8590/* XXX To save some code duplication, formatfloat/long/int could have been
8591   shared with stringobject.c, converting from 8-bit to Unicode after the
8592   formatting is done. */
8593
8594static int
8595formatfloat(Py_UNICODE *buf,
8596	    size_t buflen,
8597	    int flags,
8598	    int prec,
8599	    int type,
8600	    PyObject *v)
8601{
8602    /* fmt = '%#.' + `prec` + `type`
8603       worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
8604    char fmt[20];
8605    double x;
8606
8607    x = PyFloat_AsDouble(v);
8608    if (x == -1.0 && PyErr_Occurred())
8609	return -1;
8610    if (prec < 0)
8611	prec = 6;
8612    if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8613	type = 'g';
8614    /* Worst case length calc to ensure no buffer overrun:
8615
8616       'g' formats:
8617	 fmt = %#.<prec>g
8618	 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8619	    for any double rep.)
8620	 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8621
8622       'f' formats:
8623	 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8624	 len = 1 + 50 + 1 + prec = 52 + prec
8625
8626       If prec=0 the effective precision is 1 (the leading digit is
8627       always given), therefore increase the length by one.
8628
8629    */
8630    if (((type == 'g' || type == 'G') &&
8631          buflen <= (size_t)10 + (size_t)prec) ||
8632	(type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
8633	PyErr_SetString(PyExc_OverflowError,
8634			"formatted float is too long (precision too large?)");
8635	return -1;
8636    }
8637    PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8638		  (flags&F_ALT) ? "#" : "",
8639		  prec, type);
8640    return doubletounicode(buf, buflen, fmt, x);
8641}
8642
8643static PyObject*
8644formatlong(PyObject *val, int flags, int prec, int type)
8645{
8646	char *buf;
8647	int len;
8648	PyObject *str; /* temporary string object. */
8649	PyObject *result;
8650
8651	str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
8652	if (!str)
8653		return NULL;
8654	result = PyUnicode_FromStringAndSize(buf, len);
8655	Py_DECREF(str);
8656	return result;
8657}
8658
#if 0
/* Disabled: format the integer object v into buf via a C long and
   PyOS_snprintf().  Returns the number of characters written, or -1 with
   an exception set. */
static int
formatint(Py_UNICODE *buf,
          size_t buflen,
          int flags,
          int prec,
          int type,
          PyObject *v)
{
    /* fmt = '%#.' + `prec` + 'l' + `type`
     * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
     *                     + 1 + 1
     *                   = 24
     */
    char fmt[64]; /* plenty big enough! */
    char *sign = "";
    int based;          /* true for the 'x', 'X' and 'o' conversions */
    long x;

    x = PyLong_AsLong(v);
    if (x == -1 && PyErr_Occurred())
        return -1;
    if (x < 0 && type == 'u')
        type = 'd';
    based = (type == 'x' || type == 'X' || type == 'o');
    /* Negative values in a non-decimal base get an explicit '-' and are
       formatted from their magnitude. */
    if (x < 0 && based)
        sign = "-";
    if (prec < 0)
        prec = 1;

    /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
     * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
     */
    if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
        PyErr_SetString(PyExc_OverflowError,
                        "formatted integer is too long (precision too large?)");
        return -1;
    }

    if ((flags & F_ALT) && based) {
        /* When converting under %#o, %#x or %#X, there are a number
         * of issues that cause pain:
         * - for %#o, we want a different base marker than C
         * - when 0 is being converted, the C standard leaves off
         *   the '0x' or '0X', which is inconsistent with other
         *   %#x/%#X conversions and inconsistent with Python's
         *   hex() function
         * - there are platforms that violate the standard and
         *   convert 0 with the '0x' or '0X'
         *   (Metrowerks, Compaq Tru64)
         * - there are platforms that give '0x' when converting
         *   under %#X, but convert 0 in accordance with the
         *   standard (OS/2 EMX)
         *
         * We can achieve the desired consistency by inserting our
         * own '0x' or '0X' prefix, and substituting %x/%X in place
         * of %#x/%#X.
         *
         * Note that this is the same approach as used in
         * formatint() in stringobject.c
         */
        PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
                      sign, type, prec, type);
    }
    else {
        PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
                      sign, (flags&F_ALT) ? "#" : "",
                      prec, type);
    }
    return longtounicode(buf, buflen, fmt, sign[0] ? -x : x);
}
#endif
8736
8737static int
8738formatchar(Py_UNICODE *buf,
8739           size_t buflen,
8740           PyObject *v)
8741{
8742    /* presume that the buffer is at least 3 characters long */
8743    if (PyUnicode_Check(v)) {
8744	if (PyUnicode_GET_SIZE(v) == 1) {
8745	    buf[0] = PyUnicode_AS_UNICODE(v)[0];
8746	    buf[1] = '\0';
8747	    return 1;
8748	}
8749#ifndef Py_UNICODE_WIDE
8750	if (PyUnicode_GET_SIZE(v) == 2) {
8751	    /* Decode a valid surrogate pair */
8752	    int c0 = PyUnicode_AS_UNICODE(v)[0];
8753	    int c1 = PyUnicode_AS_UNICODE(v)[1];
8754	    if (0xD800 <= c0 && c0 <= 0xDBFF &&
8755		0xDC00 <= c1 && c1 <= 0xDFFF) {
8756		buf[0] = c0;
8757		buf[1] = c1;
8758		buf[2] = '\0';
8759		return 2;
8760	    }
8761	}
8762#endif
8763	goto onError;
8764    }
8765    else {
8766	/* Integer input truncated to a character */
8767        long x;
8768	x = PyLong_AsLong(v);
8769	if (x == -1 && PyErr_Occurred())
8770	    goto onError;
8771
8772	if (x < 0 || x > 0x10ffff) {
8773	    PyErr_SetString(PyExc_OverflowError,
8774			    "%c arg not in range(0x110000)");
8775	    return -1;
8776	}
8777
8778#ifndef Py_UNICODE_WIDE
8779	if (x > 0xffff) {
8780	    x -= 0x10000;
8781	    buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
8782	    buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
8783	    return 2;
8784	}
8785#endif
8786	buf[0] = (Py_UNICODE) x;
8787	buf[1] = '\0';
8788	return 1;
8789    }
8790
8791 onError:
8792    PyErr_SetString(PyExc_TypeError,
8793		    "%c requires int or char");
8794    return -1;
8795}
8796
8797/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8798
8799   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
8800   chars are formatted. XXX This is a magic number. Each formatting
8801   routine does bounds checking to ensure no overflow, but a better
8802   solution may be to malloc a buffer of appropriate size for each
8803   format. For now, the current solution is sufficient.
8804*/
8805#define FORMATBUFLEN (size_t)120
8806
8807PyObject *PyUnicode_Format(PyObject *format,
8808			   PyObject *args)
8809{
8810    Py_UNICODE *fmt, *res;
8811    Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
8812    int args_owned = 0;
8813    PyUnicodeObject *result = NULL;
8814    PyObject *dict = NULL;
8815    PyObject *uformat;
8816
8817    if (format == NULL || args == NULL) {
8818	PyErr_BadInternalCall();
8819	return NULL;
8820    }
8821    uformat = PyUnicode_FromObject(format);
8822    if (uformat == NULL)
8823	return NULL;
8824    fmt = PyUnicode_AS_UNICODE(uformat);
8825    fmtcnt = PyUnicode_GET_SIZE(uformat);
8826
8827    reslen = rescnt = fmtcnt + 100;
8828    result = _PyUnicode_New(reslen);
8829    if (result == NULL)
8830	goto onError;
8831    res = PyUnicode_AS_UNICODE(result);
8832
8833    if (PyTuple_Check(args)) {
8834	arglen = PyTuple_Size(args);
8835	argidx = 0;
8836    }
8837    else {
8838	arglen = -1;
8839	argidx = -2;
8840    }
8841    if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
8842        !PyUnicode_Check(args))
8843	dict = args;
8844
8845    while (--fmtcnt >= 0) {
8846	if (*fmt != '%') {
8847	    if (--rescnt < 0) {
8848		rescnt = fmtcnt + 100;
8849		reslen += rescnt;
8850		if (_PyUnicode_Resize(&result, reslen) < 0)
8851		    goto onError;
8852		res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8853		--rescnt;
8854	    }
8855	    *res++ = *fmt++;
8856	}
8857	else {
8858	    /* Got a format specifier */
8859	    int flags = 0;
8860	    Py_ssize_t width = -1;
8861	    int prec = -1;
8862	    Py_UNICODE c = '\0';
8863	    Py_UNICODE fill;
8864	    int isnumok;
8865	    PyObject *v = NULL;
8866	    PyObject *temp = NULL;
8867	    Py_UNICODE *pbuf;
8868	    Py_UNICODE sign;
8869	    Py_ssize_t len;
8870	    Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
8871
8872	    fmt++;
8873	    if (*fmt == '(') {
8874		Py_UNICODE *keystart;
8875		Py_ssize_t keylen;
8876		PyObject *key;
8877		int pcount = 1;
8878
8879		if (dict == NULL) {
8880		    PyErr_SetString(PyExc_TypeError,
8881				    "format requires a mapping");
8882		    goto onError;
8883		}
8884		++fmt;
8885		--fmtcnt;
8886		keystart = fmt;
8887		/* Skip over balanced parentheses */
8888		while (pcount > 0 && --fmtcnt >= 0) {
8889		    if (*fmt == ')')
8890			--pcount;
8891		    else if (*fmt == '(')
8892			++pcount;
8893		    fmt++;
8894		}
8895		keylen = fmt - keystart - 1;
8896		if (fmtcnt < 0 || pcount > 0) {
8897		    PyErr_SetString(PyExc_ValueError,
8898				    "incomplete format key");
8899		    goto onError;
8900		}
8901#if 0
8902		/* keys are converted to strings using UTF-8 and
8903		   then looked up since Python uses strings to hold
8904		   variables names etc. in its namespaces and we
8905		   wouldn't want to break common idioms. */
8906		key = PyUnicode_EncodeUTF8(keystart,
8907					   keylen,
8908					   NULL);
8909#else
8910		key = PyUnicode_FromUnicode(keystart, keylen);
8911#endif
8912		if (key == NULL)
8913		    goto onError;
8914		if (args_owned) {
8915		    Py_DECREF(args);
8916		    args_owned = 0;
8917		}
8918		args = PyObject_GetItem(dict, key);
8919		Py_DECREF(key);
8920		if (args == NULL) {
8921		    goto onError;
8922		}
8923		args_owned = 1;
8924		arglen = -1;
8925		argidx = -2;
8926	    }
8927	    while (--fmtcnt >= 0) {
8928		switch (c = *fmt++) {
8929		case '-': flags |= F_LJUST; continue;
8930		case '+': flags |= F_SIGN; continue;
8931		case ' ': flags |= F_BLANK; continue;
8932		case '#': flags |= F_ALT; continue;
8933		case '0': flags |= F_ZERO; continue;
8934		}
8935		break;
8936	    }
8937	    if (c == '*') {
8938		v = getnextarg(args, arglen, &argidx);
8939		if (v == NULL)
8940		    goto onError;
8941		if (!PyLong_Check(v)) {
8942		    PyErr_SetString(PyExc_TypeError,
8943				    "* wants int");
8944		    goto onError;
8945		}
8946		width = PyLong_AsLong(v);
8947		if (width == -1 && PyErr_Occurred())
8948			goto onError;
8949		if (width < 0) {
8950		    flags |= F_LJUST;
8951		    width = -width;
8952		}
8953		if (--fmtcnt >= 0)
8954		    c = *fmt++;
8955	    }
8956	    else if (c >= '0' && c <= '9') {
8957		width = c - '0';
8958		while (--fmtcnt >= 0) {
8959		    c = *fmt++;
8960		    if (c < '0' || c > '9')
8961			break;
8962		    if ((width*10) / 10 != width) {
8963			PyErr_SetString(PyExc_ValueError,
8964					"width too big");
8965			goto onError;
8966		    }
8967		    width = width*10 + (c - '0');
8968		}
8969	    }
8970	    if (c == '.') {
8971		prec = 0;
8972		if (--fmtcnt >= 0)
8973		    c = *fmt++;
8974		if (c == '*') {
8975		    v = getnextarg(args, arglen, &argidx);
8976		    if (v == NULL)
8977			goto onError;
8978		    if (!PyLong_Check(v)) {
8979			PyErr_SetString(PyExc_TypeError,
8980					"* wants int");
8981			goto onError;
8982		    }
8983		    prec = PyLong_AsLong(v);
8984		    if (prec == -1 && PyErr_Occurred())
8985			goto onError;
8986		    if (prec < 0)
8987			prec = 0;
8988		    if (--fmtcnt >= 0)
8989			c = *fmt++;
8990		}
8991		else if (c >= '0' && c <= '9') {
8992		    prec = c - '0';
8993		    while (--fmtcnt >= 0) {
8994			c = Py_CHARMASK(*fmt++);
8995			if (c < '0' || c > '9')
8996			    break;
8997			if ((prec*10) / 10 != prec) {
8998			    PyErr_SetString(PyExc_ValueError,
8999					    "prec too big");
9000			    goto onError;
9001			}
9002			prec = prec*10 + (c - '0');
9003		    }
9004		}
9005	    } /* prec */
9006	    if (fmtcnt >= 0) {
9007		if (c == 'h' || c == 'l' || c == 'L') {
9008		    if (--fmtcnt >= 0)
9009			c = *fmt++;
9010		}
9011	    }
9012	    if (fmtcnt < 0) {
9013		PyErr_SetString(PyExc_ValueError,
9014				"incomplete format");
9015		goto onError;
9016	    }
9017	    if (c != '%') {
9018		v = getnextarg(args, arglen, &argidx);
9019		if (v == NULL)
9020		    goto onError;
9021	    }
9022	    sign = 0;
9023	    fill = ' ';
9024	    switch (c) {
9025
9026	    case '%':
9027		pbuf = formatbuf;
9028		/* presume that buffer length is at least 1 */
9029		pbuf[0] = '%';
9030		len = 1;
9031		break;
9032
9033	    case 's':
9034	    case 'r':
9035	    case 'a':
9036		if (PyUnicode_Check(v) && c == 's') {
9037		    temp = v;
9038		    Py_INCREF(temp);
9039		}
9040		else {
9041		    if (c == 's')
9042			temp = PyObject_Str(v);
9043		    else if (c == 'r')
9044			temp = PyObject_Repr(v);
9045		    else
9046			temp = PyObject_ASCII(v);
9047		    if (temp == NULL)
9048			goto onError;
9049                    if (PyUnicode_Check(temp))
9050                        /* nothing to do */;
9051		    else {
9052			Py_DECREF(temp);
9053			PyErr_SetString(PyExc_TypeError,
9054					"%s argument has non-string str()");
9055			goto onError;
9056		    }
9057		}
9058		pbuf = PyUnicode_AS_UNICODE(temp);
9059		len = PyUnicode_GET_SIZE(temp);
9060		if (prec >= 0 && len > prec)
9061		    len = prec;
9062		break;
9063
9064	    case 'i':
9065	    case 'd':
9066	    case 'u':
9067	    case 'o':
9068	    case 'x':
9069	    case 'X':
9070		if (c == 'i')
9071		    c = 'd';
9072		isnumok = 0;
9073		if (PyNumber_Check(v)) {
9074			PyObject *iobj=NULL;
9075
9076			if (PyLong_Check(v)) {
9077				iobj = v;
9078				Py_INCREF(iobj);
9079			}
9080			else {
9081				iobj = PyNumber_Long(v);
9082			}
9083			if (iobj!=NULL) {
9084				if (PyLong_Check(iobj)) {
9085					isnumok = 1;
9086					temp = formatlong(iobj, flags, prec, c);
9087					Py_DECREF(iobj);
9088					if (!temp)
9089					    goto onError;
9090					pbuf = PyUnicode_AS_UNICODE(temp);
9091					len = PyUnicode_GET_SIZE(temp);
9092					sign = 1;
9093				}
9094				else {
9095					Py_DECREF(iobj);
9096				}
9097			}
9098		}
9099		if (!isnumok) {
9100			PyErr_Format(PyExc_TypeError,
9101			    "%%%c format: a number is required, "
9102                                     "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9103			goto onError;
9104		}
9105		if (flags & F_ZERO)
9106		    fill = '0';
9107		break;
9108
9109	    case 'e':
9110	    case 'E':
9111	    case 'f':
9112	    case 'F':
9113	    case 'g':
9114	    case 'G':
9115		if (c == 'F')
9116			c = 'f';
9117		pbuf = formatbuf;
9118		len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
9119			flags, prec, c, v);
9120		if (len < 0)
9121		    goto onError;
9122		sign = 1;
9123		if (flags & F_ZERO)
9124		    fill = '0';
9125		break;
9126
9127	    case 'c':
9128		pbuf = formatbuf;
9129		len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9130		if (len < 0)
9131		    goto onError;
9132		break;
9133
9134	    default:
9135		PyErr_Format(PyExc_ValueError,
9136			     "unsupported format character '%c' (0x%x) "
9137			     "at index %zd",
9138			     (31<=c && c<=126) ? (char)c : '?',
9139                             (int)c,
9140			     (Py_ssize_t)(fmt - 1 -
9141					  PyUnicode_AS_UNICODE(uformat)));
9142		goto onError;
9143	    }
9144	    if (sign) {
9145		if (*pbuf == '-' || *pbuf == '+') {
9146		    sign = *pbuf++;
9147		    len--;
9148		}
9149		else if (flags & F_SIGN)
9150		    sign = '+';
9151		else if (flags & F_BLANK)
9152		    sign = ' ';
9153		else
9154		    sign = 0;
9155	    }
9156	    if (width < len)
9157		width = len;
9158	    if (rescnt - (sign != 0) < width) {
9159		reslen -= rescnt;
9160		rescnt = width + fmtcnt + 100;
9161		reslen += rescnt;
9162		if (reslen < 0) {
9163		    Py_XDECREF(temp);
9164		    PyErr_NoMemory();
9165		    goto onError;
9166		}
9167		if (_PyUnicode_Resize(&result, reslen) < 0) {
9168		    Py_XDECREF(temp);
9169		    goto onError;
9170		}
9171		res = PyUnicode_AS_UNICODE(result)
9172		    + reslen - rescnt;
9173	    }
9174	    if (sign) {
9175		if (fill != ' ')
9176		    *res++ = sign;
9177		rescnt--;
9178		if (width > len)
9179		    width--;
9180	    }
9181	    if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9182		assert(pbuf[0] == '0');
9183		assert(pbuf[1] == c);
9184		if (fill != ' ') {
9185		    *res++ = *pbuf++;
9186		    *res++ = *pbuf++;
9187		}
9188		rescnt -= 2;
9189		width -= 2;
9190		if (width < 0)
9191		    width = 0;
9192		len -= 2;
9193	    }
9194	    if (width > len && !(flags & F_LJUST)) {
9195		do {
9196		    --rescnt;
9197		    *res++ = fill;
9198		} while (--width > len);
9199	    }
9200	    if (fill == ' ') {
9201		if (sign)
9202		    *res++ = sign;
9203		if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9204		    assert(pbuf[0] == '0');
9205		    assert(pbuf[1] == c);
9206		    *res++ = *pbuf++;
9207		    *res++ = *pbuf++;
9208		}
9209	    }
9210	    Py_UNICODE_COPY(res, pbuf, len);
9211	    res += len;
9212	    rescnt -= len;
9213	    while (--width >= len) {
9214		--rescnt;
9215		*res++ = ' ';
9216	    }
9217	    if (dict && (argidx < arglen) && c != '%') {
9218		PyErr_SetString(PyExc_TypeError,
9219				"not all arguments converted during string formatting");
9220                Py_XDECREF(temp);
9221		goto onError;
9222	    }
9223	    Py_XDECREF(temp);
9224	} /* '%' */
9225    } /* until end */
9226    if (argidx < arglen && !dict) {
9227	PyErr_SetString(PyExc_TypeError,
9228			"not all arguments converted during string formatting");
9229	goto onError;
9230    }
9231
9232    if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
9233	goto onError;
9234    if (args_owned) {
9235	Py_DECREF(args);
9236    }
9237    Py_DECREF(uformat);
9238    return (PyObject *)result;
9239
9240 onError:
9241    Py_XDECREF(result);
9242    Py_DECREF(uformat);
9243    if (args_owned) {
9244	Py_DECREF(args);
9245    }
9246    return NULL;
9247}
9248
9249static PyObject *
9250unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9251
9252static PyObject *
9253unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9254{
9255        PyObject *x = NULL;
9256	static char *kwlist[] = {"object", "encoding", "errors", 0};
9257	char *encoding = NULL;
9258	char *errors = NULL;
9259
9260	if (type != &PyUnicode_Type)
9261		return unicode_subtype_new(type, args, kwds);
9262	if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
9263					  kwlist, &x, &encoding, &errors))
9264	    return NULL;
9265	if (x == NULL)
9266		return (PyObject *)_PyUnicode_New(0);
9267	if (encoding == NULL && errors == NULL)
9268	    return PyObject_Str(x);
9269	else
9270	return PyUnicode_FromEncodedObject(x, encoding, errors);
9271}
9272
9273static PyObject *
9274unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9275{
9276	PyUnicodeObject *tmp, *pnew;
9277	Py_ssize_t n;
9278
9279	assert(PyType_IsSubtype(type, &PyUnicode_Type));
9280	tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9281	if (tmp == NULL)
9282		return NULL;
9283	assert(PyUnicode_Check(tmp));
9284	pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9285	if (pnew == NULL) {
9286		Py_DECREF(tmp);
9287		return NULL;
9288	}
9289	pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9290	if (pnew->str == NULL) {
9291		_Py_ForgetReference((PyObject *)pnew);
9292		PyObject_Del(pnew);
9293		Py_DECREF(tmp);
9294		return PyErr_NoMemory();
9295	}
9296	Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9297	pnew->length = n;
9298	pnew->hash = tmp->hash;
9299	Py_DECREF(tmp);
9300	return (PyObject *)pnew;
9301}
9302
9303PyDoc_STRVAR(unicode_doc,
9304"str(string[, encoding[, errors]]) -> str\n\
9305\n\
9306Create a new string object from the given encoded string.\n\
9307encoding defaults to the current default string encoding.\n\
9308errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
9309
9310static PyObject *unicode_iter(PyObject *seq);
9311
9312PyTypeObject PyUnicode_Type = {
9313    PyVarObject_HEAD_INIT(&PyType_Type, 0)
9314    "str", 				/* tp_name */
9315    sizeof(PyUnicodeObject), 		/* tp_size */
9316    0, 					/* tp_itemsize */
9317    /* Slots */
9318    (destructor)unicode_dealloc, 	/* tp_dealloc */
9319    0, 					/* tp_print */
9320    0,				 	/* tp_getattr */
9321    0, 					/* tp_setattr */
9322    0, 					/* tp_compare */
9323    unicode_repr, 			/* tp_repr */
9324    &unicode_as_number, 		/* tp_as_number */
9325    &unicode_as_sequence, 		/* tp_as_sequence */
9326    &unicode_as_mapping, 		/* tp_as_mapping */
9327    (hashfunc) unicode_hash, 		/* tp_hash*/
9328    0, 					/* tp_call*/
9329    (reprfunc) unicode_str,	 	/* tp_str */
9330    PyObject_GenericGetAttr, 		/* tp_getattro */
9331    0,			 		/* tp_setattro */
9332    0, 					/* tp_as_buffer */
9333    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
9334        Py_TPFLAGS_UNICODE_SUBCLASS,	/* tp_flags */
9335    unicode_doc,			/* tp_doc */
9336    0,					/* tp_traverse */
9337    0,					/* tp_clear */
9338    PyUnicode_RichCompare,		/* tp_richcompare */
9339    0,					/* tp_weaklistoffset */
9340    unicode_iter,			/* tp_iter */
9341    0,					/* tp_iternext */
9342    unicode_methods,			/* tp_methods */
9343    0,					/* tp_members */
9344    0,					/* tp_getset */
9345    &PyBaseObject_Type,			/* tp_base */
9346    0,					/* tp_dict */
9347    0,					/* tp_descr_get */
9348    0,					/* tp_descr_set */
9349    0,					/* tp_dictoffset */
9350    0,					/* tp_init */
9351    0,					/* tp_alloc */
9352    unicode_new,			/* tp_new */
9353    PyObject_Del,      		/* tp_free */
9354};
9355
9356/* Initialize the Unicode implementation */
9357
9358void _PyUnicode_Init(void)
9359{
9360    int i;
9361
9362    /* XXX - move this array to unicodectype.c ? */
9363    Py_UNICODE linebreak[] = {
9364        0x000A, /* LINE FEED */
9365        0x000D, /* CARRIAGE RETURN */
9366        0x001C, /* FILE SEPARATOR */
9367        0x001D, /* GROUP SEPARATOR */
9368        0x001E, /* RECORD SEPARATOR */
9369        0x0085, /* NEXT LINE */
9370        0x2028, /* LINE SEPARATOR */
9371        0x2029, /* PARAGRAPH SEPARATOR */
9372    };
9373
9374    /* Init the implementation */
9375    free_list = NULL;
9376    numfree = 0;
9377    unicode_empty = _PyUnicode_New(0);
9378    if (!unicode_empty)
9379	return;
9380
9381    for (i = 0; i < 256; i++)
9382	unicode_latin1[i] = NULL;
9383    if (PyType_Ready(&PyUnicode_Type) < 0)
9384	Py_FatalError("Can't initialize 'unicode'");
9385
9386    /* initialize the linebreak bloom filter */
9387    bloom_linebreak = make_bloom_mask(
9388        linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9389        );
9390
9391    PyType_Ready(&EncodingMapType);
9392}
9393
/* Free-list release and finalization of the Unicode implementation */
9395
9396int
9397PyUnicode_ClearFreeList(void)
9398{
9399    int freelist_size = numfree;
9400    PyUnicodeObject *u;
9401
9402    for (u = free_list; u != NULL;) {
9403	PyUnicodeObject *v = u;
9404	u = *(PyUnicodeObject **)u;
9405	if (v->str)
9406	    PyObject_DEL(v->str);
9407	Py_XDECREF(v->defenc);
9408	PyObject_Del(v);
9409	numfree--;
9410    }
9411    free_list = NULL;
9412    assert(numfree == 0);
9413    return freelist_size;
9414}
9415
9416void
9417_PyUnicode_Fini(void)
9418{
9419    int i;
9420
9421    Py_XDECREF(unicode_empty);
9422    unicode_empty = NULL;
9423
9424    for (i = 0; i < 256; i++) {
9425	if (unicode_latin1[i]) {
9426	    Py_DECREF(unicode_latin1[i]);
9427	    unicode_latin1[i] = NULL;
9428	}
9429    }
9430    (void)PyUnicode_ClearFreeList();
9431}
9432
9433void
9434PyUnicode_InternInPlace(PyObject **p)
9435{
9436	register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9437	PyObject *t;
9438	if (s == NULL || !PyUnicode_Check(s))
9439		Py_FatalError(
9440		    "PyUnicode_InternInPlace: unicode strings only please!");
9441	/* If it's a subclass, we don't really know what putting
9442	   it in the interned dict might do. */
9443	if (!PyUnicode_CheckExact(s))
9444		return;
9445	if (PyUnicode_CHECK_INTERNED(s))
9446		return;
9447	if (interned == NULL) {
9448		interned = PyDict_New();
9449		if (interned == NULL) {
9450			PyErr_Clear(); /* Don't leave an exception */
9451			return;
9452		}
9453	}
9454	/* It might be that the GetItem call fails even
9455	   though the key is present in the dictionary,
9456	   namely when this happens during a stack overflow. */
9457	Py_ALLOW_RECURSION
9458	t = PyDict_GetItem(interned, (PyObject *)s);
9459	Py_END_ALLOW_RECURSION
9460
9461	if (t) {
9462		Py_INCREF(t);
9463		Py_DECREF(*p);
9464		*p = t;
9465		return;
9466	}
9467
9468	PyThreadState_GET()->recursion_critical = 1;
9469	if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9470		PyErr_Clear();
9471		PyThreadState_GET()->recursion_critical = 0;
9472		return;
9473	}
9474	PyThreadState_GET()->recursion_critical = 0;
9475	/* The two references in interned are not counted by refcnt.
9476	   The deallocator will take care of this */
9477	Py_REFCNT(s) -= 2;
9478	PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
9479}
9480
9481void
9482PyUnicode_InternImmortal(PyObject **p)
9483{
9484	PyUnicode_InternInPlace(p);
9485	if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9486		PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9487		Py_INCREF(*p);
9488	}
9489}
9490
9491PyObject *
9492PyUnicode_InternFromString(const char *cp)
9493{
9494	PyObject *s = PyUnicode_FromString(cp);
9495	if (s == NULL)
9496		return NULL;
9497	PyUnicode_InternInPlace(&s);
9498	return s;
9499}
9500
9501void _Py_ReleaseInternedUnicodeStrings(void)
9502{
9503	PyObject *keys;
9504	PyUnicodeObject *s;
9505	Py_ssize_t i, n;
9506	Py_ssize_t immortal_size = 0, mortal_size = 0;
9507
9508	if (interned == NULL || !PyDict_Check(interned))
9509		return;
9510	keys = PyDict_Keys(interned);
9511	if (keys == NULL || !PyList_Check(keys)) {
9512		PyErr_Clear();
9513		return;
9514	}
9515
9516	/* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9517	   detector, interned unicode strings are not forcibly deallocated;
9518	   rather, we give them their stolen references back, and then clear
9519	   and DECREF the interned dict. */
9520
9521	n = PyList_GET_SIZE(keys);
9522	fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9523		n);
9524	for (i = 0; i < n; i++) {
9525		s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9526		switch (s->state) {
9527		case SSTATE_NOT_INTERNED:
9528			/* XXX Shouldn't happen */
9529			break;
9530		case SSTATE_INTERNED_IMMORTAL:
9531			Py_REFCNT(s) += 1;
9532			immortal_size += s->length;
9533			break;
9534		case SSTATE_INTERNED_MORTAL:
9535			Py_REFCNT(s) += 2;
9536			mortal_size += s->length;
9537			break;
9538		default:
9539			Py_FatalError("Inconsistent interned string state.");
9540		}
9541		s->state = SSTATE_NOT_INTERNED;
9542	}
9543	fprintf(stderr, "total size of all interned strings: "
9544			"%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9545			"mortal/immortal\n", mortal_size, immortal_size);
9546	Py_DECREF(keys);
9547	PyDict_Clear(interned);
9548	Py_DECREF(interned);
9549	interned = NULL;
9550}
9551
9552
9553/********************* Unicode Iterator **************************/
9554
9555typedef struct {
9556	PyObject_HEAD
9557	Py_ssize_t it_index;
9558	PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
9559} unicodeiterobject;
9560
9561static void
9562unicodeiter_dealloc(unicodeiterobject *it)
9563{
9564	_PyObject_GC_UNTRACK(it);
9565	Py_XDECREF(it->it_seq);
9566	PyObject_GC_Del(it);
9567}
9568
9569static int
9570unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9571{
9572	Py_VISIT(it->it_seq);
9573	return 0;
9574}
9575
9576static PyObject *
9577unicodeiter_next(unicodeiterobject *it)
9578{
9579	PyUnicodeObject *seq;
9580	PyObject *item;
9581
9582	assert(it != NULL);
9583	seq = it->it_seq;
9584	if (seq == NULL)
9585		return NULL;
9586	assert(PyUnicode_Check(seq));
9587
9588	if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9589		item = PyUnicode_FromUnicode(
9590                    PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
9591		if (item != NULL)
9592			++it->it_index;
9593		return item;
9594	}
9595
9596	Py_DECREF(seq);
9597	it->it_seq = NULL;
9598	return NULL;
9599}
9600
9601static PyObject *
9602unicodeiter_len(unicodeiterobject *it)
9603{
9604	Py_ssize_t len = 0;
9605	if (it->it_seq)
9606		len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9607	return PyLong_FromSsize_t(len);
9608}
9609
9610PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9611
9612static PyMethodDef unicodeiter_methods[] = {
9613	{"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
9614         length_hint_doc},
9615 	{NULL,		NULL}		/* sentinel */
9616};
9617
9618PyTypeObject PyUnicodeIter_Type = {
9619	PyVarObject_HEAD_INIT(&PyType_Type, 0)
9620	"str_iterator",			/* tp_name */
9621	sizeof(unicodeiterobject),		/* tp_basicsize */
9622	0,					/* tp_itemsize */
9623	/* methods */
9624	(destructor)unicodeiter_dealloc,	/* tp_dealloc */
9625	0,					/* tp_print */
9626	0,					/* tp_getattr */
9627	0,					/* tp_setattr */
9628	0,					/* tp_compare */
9629	0,					/* tp_repr */
9630	0,					/* tp_as_number */
9631	0,					/* tp_as_sequence */
9632	0,					/* tp_as_mapping */
9633	0,					/* tp_hash */
9634	0,					/* tp_call */
9635	0,					/* tp_str */
9636	PyObject_GenericGetAttr,		/* tp_getattro */
9637	0,					/* tp_setattro */
9638	0,					/* tp_as_buffer */
9639	Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9640	0,					/* tp_doc */
9641	(traverseproc)unicodeiter_traverse,	/* tp_traverse */
9642	0,					/* tp_clear */
9643	0,					/* tp_richcompare */
9644	0,					/* tp_weaklistoffset */
9645	PyObject_SelfIter,			/* tp_iter */
9646	(iternextfunc)unicodeiter_next,		/* tp_iternext */
9647	unicodeiter_methods,			/* tp_methods */
9648	0,
9649};
9650
9651static PyObject *
9652unicode_iter(PyObject *seq)
9653{
9654	unicodeiterobject *it;
9655
9656	if (!PyUnicode_Check(seq)) {
9657		PyErr_BadInternalCall();
9658		return NULL;
9659	}
9660	it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9661	if (it == NULL)
9662		return NULL;
9663	it->it_index = 0;
9664	Py_INCREF(seq);
9665	it->it_seq = (PyUnicodeObject *)seq;
9666	_PyObject_GC_TRACK(it);
9667	return (PyObject *)it;
9668}
9669
9670size_t
9671Py_UNICODE_strlen(const Py_UNICODE *u)
9672{
9673    int res = 0;
9674    while(*u++)
9675        res++;
9676    return res;
9677}
9678
9679Py_UNICODE*
9680Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9681{
9682    Py_UNICODE *u = s1;
9683    while ((*u++ = *s2++));
9684    return s1;
9685}
9686
9687Py_UNICODE*
9688Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9689{
9690    Py_UNICODE *u = s1;
9691    while ((*u++ = *s2++))
9692        if (n-- == 0)
9693            break;
9694    return s1;
9695}
9696
9697int
9698Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9699{
9700    while (*s1 && *s2 && *s1 == *s2)
9701        s1++, s2++;
9702    if (*s1 && *s2)
9703        return (*s1 < *s2) ? -1 : +1;
9704    if (*s1)
9705        return 1;
9706    if (*s2)
9707        return -1;
9708    return 0;
9709}
9710
9711Py_UNICODE*
9712Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9713{
9714    const Py_UNICODE *p;
9715    for (p = s; *p; p++)
9716        if (*p == c)
9717            return (Py_UNICODE*)p;
9718    return NULL;
9719}
9720
9721
9722#ifdef __cplusplus
9723}
9724#endif
9725
9726
9727/*
9728Local variables:
9729c-basic-offset: 4
9730indent-tabs-mode: nil
9731End:
9732*/
9733