unicodeobject.c revision c28e1fa71f61278256887d257e4e7e24b0e7e7ce
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5Unicode Integration Proposal (see file Misc/unicode.txt).
6
7Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
10Copyright (c) Corporation for National Research Initiatives.
11
12--------------------------------------------------------------------
13The original string type implementation is:
14
15    Copyright (c) 1999 by Secret Labs AB
16    Copyright (c) 1999 by Fredrik Lundh
17
18By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
41
42#define PY_SSIZE_T_CLEAN
43#include "Python.h"
44#include "bytes_methods.h"
45
46#include "unicodeobject.h"
47#include "ucnhash.h"
48
49#ifdef MS_WINDOWS
50#include <windows.h>
51#endif
52
/* Limit for the Unicode object free list: unicode_dealloc() stops
   recycling objects once this many are already parked on the list. */

#define PyUnicode_MAXFREELIST       1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a length less than this
   limit (objects whose length is greater than or equal to the limit
   have their character buffer released when they are put on the free
   list). This reduces malloc() overhead for small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian.  Exactly one of
   BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN is defined,
   selected by WORDS_BIGENDIAN. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
83
84/* --- Globals ------------------------------------------------------------
85
86   The globals are initialized by the _PyUnicode_Init() API and should
87   not be used before calling that API.
88
89*/
90
91
92#ifdef __cplusplus
93extern "C" {
94#endif
95
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned;

/* Free list for Unicode objects.  The list is singly linked through the
   first pointer-sized word of each parked object; numfree counts the
   objects currently on the list. */
static PyUnicodeObject *free_list;
static int numfree;

/* The empty Unicode object is shared to improve performance. */
static PyUnicodeObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  Entries are indexed by code point and start out
   NULL; they are filled in the first time the character is requested. */
static PyUnicodeObject *unicode_latin1[256];

/* Default encoding to use and assume when NULL is passed as encoding
   parameter; it is fixed to "utf-8".  Always use the
   PyUnicode_GetDefaultEncoding() API to access this global.

   Don't forget to alter Py_FileSystemDefaultEncoding if you change the
   hard coded default!
*/
static const char unicode_default_encoding[] = "utf-8";
125
126/* Fast detection of the most frequent whitespace characters */
/* Lookup table indexed by ASCII code (0..127): 1 if the character is
   whitespace, 0 otherwise.  One row per 16 code points. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 HORIZONTAL TABULATION, 0x0A LINE FEED,
                  0x0B VERTICAL TABULATION, 0x0C FORM FEED,
                  0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
156
157/* Same for linebreaks */
/* Lookup table indexed by ASCII code (0..127): 1 if the character is a
   line break, 0 otherwise.  One row per 16 code points. */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
182
183
184Py_UNICODE
185PyUnicode_GetMax(void)
186{
187#ifdef Py_UNICODE_WIDE
188	return 0x10FFFF;
189#else
190	/* This is actually an illegal character, so it should
191	   not be passed to unichr. */
192	return 0xFFFF;
193#endif
194}
195
196/* --- Bloom Filters ----------------------------------------------------- */
197
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the least
   significant 5 bits of each Unicode character as the bit index. */

/* The linebreak mask is set up by _PyUnicode_Init(). */
203
/* Type of a bloom mask: one bit per possible value of (ch & 0x1F). */
#define BLOOM_MASK unsigned long

/* Bloom mask of all line break characters (filled in at init time). */
static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by the low 5 bits of `ch` is set in `mask`.
   A zero result proves `ch` is not in the set the mask was built from; a
   non-zero result only means it might be.
   The shifted constant is unsigned long so that bit 31 can be tested
   without overflowing a signed int, and `mask` is parenthesized so that
   expression arguments bind as expected. */
#define BLOOM(mask, ch) (((mask) & (1UL << ((ch) & 0x1F))))

/* Line break test: table lookup for ASCII, bloom pre-filter followed by
   the exact Py_UNICODE_ISLINEBREAK() check for everything else. */
#define BLOOM_LINEBREAK(ch) \
    ((ch) < 128U ? ascii_linebreak[(ch)] : \
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
213
214Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
215{
216    /* calculate simple bloom-style bitmask for a given unicode string */
217
218    long mask;
219    Py_ssize_t i;
220
221    mask = 0;
222    for (i = 0; i < len; i++)
223        mask |= (1 << (ptr[i] & 0x1F));
224
225    return mask;
226}
227
228Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
229{
230    Py_ssize_t i;
231
232    for (i = 0; i < setlen; i++)
233        if (set[i] == chr)
234            return 1;
235
236    return 0;
237}
238
/* Exact membership test of `chr` in `set` (of length `setlen`), using the
   bloom `mask` built from `set` as a cheap pre-filter.  The expansion is
   fully parenthesized so it can be negated or combined with other
   operators safely. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
241
242/* --- Unicode Object ----------------------------------------------------- */
243
244static
245int unicode_resize(register PyUnicodeObject *unicode,
246                      Py_ssize_t length)
247{
248    void *oldstr;
249
250    /* Shortcut if there's nothing much to do. */
251    if (unicode->length == length)
252	goto reset;
253
254    /* Resizing shared object (unicode_empty or single character
255       objects) in-place is not allowed. Use PyUnicode_Resize()
256       instead ! */
257
258    if (unicode == unicode_empty ||
259	(unicode->length == 1 &&
260	 unicode->str[0] < 256U &&
261	 unicode_latin1[unicode->str[0]] == unicode)) {
262        PyErr_SetString(PyExc_SystemError,
263                        "can't resize shared unicode objects");
264        return -1;
265    }
266
267    /* We allocate one more byte to make sure the string is Ux0000 terminated.
268       The overallocation is also used by fastsearch, which assumes that it's
269       safe to look at str[length] (without making any assumptions about what
270       it contains). */
271
272    oldstr = unicode->str;
273    unicode->str = PyObject_REALLOC(unicode->str,
274				    sizeof(Py_UNICODE) * (length + 1));
275    if (!unicode->str) {
276	unicode->str = (Py_UNICODE *)oldstr;
277        PyErr_NoMemory();
278        return -1;
279    }
280    unicode->str[length] = 0;
281    unicode->length = length;
282
283 reset:
284    /* Reset the object caches */
285    if (unicode->defenc) {
286        Py_DECREF(unicode->defenc);
287        unicode->defenc = NULL;
288    }
289    unicode->hash = -1;
290
291    return 0;
292}
293
294/* We allocate one more byte to make sure the string is
295   Ux0000 terminated; some code (e.g. new_identifier)
296   relies on that.
297
298   XXX This allocator could further be enhanced by assuring that the
299       free list never reduces its size below 1.
300
301*/
302
303static
304PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
305{
306    register PyUnicodeObject *unicode;
307
308    /* Optimization for empty strings */
309    if (length == 0 && unicode_empty != NULL) {
310        Py_INCREF(unicode_empty);
311        return unicode_empty;
312    }
313
314    /* Unicode freelist & memory allocation */
315    if (free_list) {
316        unicode = free_list;
317        free_list = *(PyUnicodeObject **)unicode;
318        numfree--;
319	if (unicode->str) {
320	    /* Keep-Alive optimization: we only upsize the buffer,
321	       never downsize it. */
322	    if ((unicode->length < length) &&
323                unicode_resize(unicode, length) < 0) {
324		PyObject_DEL(unicode->str);
325		goto onError;
326	    }
327	}
328        else {
329	    size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
330	    unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
331        }
332        PyObject_INIT(unicode, &PyUnicode_Type);
333    }
334    else {
335	size_t new_size;
336        unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
337        if (unicode == NULL)
338            return NULL;
339	new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
340	unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
341    }
342
343    if (!unicode->str) {
344	PyErr_NoMemory();
345	goto onError;
346    }
347    /* Initialize the first element to guard against cases where
348     * the caller fails before initializing str -- unicode_resize()
349     * reads str[0], and the Keep-Alive optimization can keep memory
350     * allocated for str alive across a call to unicode_dealloc(unicode).
351     * We don't want unicode_resize to read uninitialized memory in
352     * that case.
353     */
354    unicode->str[0] = 0;
355    unicode->str[length] = 0;
356    unicode->length = length;
357    unicode->hash = -1;
358    unicode->state = 0;
359    unicode->defenc = NULL;
360    return unicode;
361
362 onError:
363    _Py_ForgetReference((PyObject *)unicode);
364    PyObject_Del(unicode);
365    return NULL;
366}
367
368static
369void unicode_dealloc(register PyUnicodeObject *unicode)
370{
371    switch (PyUnicode_CHECK_INTERNED(unicode)) {
372        case SSTATE_NOT_INTERNED:
373            break;
374
375        case SSTATE_INTERNED_MORTAL:
376            /* revive dead object temporarily for DelItem */
377            Py_REFCNT(unicode) = 3;
378            if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
379                Py_FatalError(
380                    "deletion of interned unicode string failed");
381            break;
382
383        case SSTATE_INTERNED_IMMORTAL:
384            Py_FatalError("Immortal interned unicode string died.");
385
386        default:
387            Py_FatalError("Inconsistent interned unicode string state.");
388    }
389
390    if (PyUnicode_CheckExact(unicode) &&
391	numfree < PyUnicode_MAXFREELIST) {
392        /* Keep-Alive optimization */
393	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
394	    PyObject_DEL(unicode->str);
395	    unicode->str = NULL;
396	    unicode->length = 0;
397	}
398	if (unicode->defenc) {
399	    Py_DECREF(unicode->defenc);
400	    unicode->defenc = NULL;
401	}
402	/* Add to free list */
403        *(PyUnicodeObject **)unicode = free_list;
404        free_list = unicode;
405        numfree++;
406    }
407    else {
408	PyObject_DEL(unicode->str);
409	Py_XDECREF(unicode->defenc);
410	Py_TYPE(unicode)->tp_free((PyObject *)unicode);
411    }
412}
413
414int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
415{
416    register PyUnicodeObject *v;
417
418    /* Argument checks */
419    if (unicode == NULL) {
420	PyErr_BadInternalCall();
421	return -1;
422    }
423    v = (PyUnicodeObject *)*unicode;
424    if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
425	PyErr_BadInternalCall();
426	return -1;
427    }
428
429    /* Resizing unicode_empty and single character objects is not
430       possible since these are being shared. We simply return a fresh
431       copy with the same Unicode content. */
432    if (v->length != length &&
433	(v == unicode_empty || v->length == 1)) {
434	PyUnicodeObject *w = _PyUnicode_New(length);
435	if (w == NULL)
436	    return -1;
437	Py_UNICODE_COPY(w->str, v->str,
438			length < v->length ? length : v->length);
439	Py_DECREF(*unicode);
440	*unicode = (PyObject *)w;
441	return 0;
442    }
443
444    /* Note that we don't have to modify *unicode for unshared Unicode
445       objects, since we can modify them in-place. */
446    return unicode_resize(v, length);
447}
448
449/* Internal API for use in unicodeobject.c only ! */
450#define _PyUnicode_Resize(unicodevar, length) \
451        PyUnicode_Resize(((PyObject **)(unicodevar)), length)
452
453PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
454				Py_ssize_t size)
455{
456    PyUnicodeObject *unicode;
457
458    /* If the Unicode data is known at construction time, we can apply
459       some optimizations which share commonly used objects. */
460    if (u != NULL) {
461
462	/* Optimization for empty strings */
463	if (size == 0 && unicode_empty != NULL) {
464	    Py_INCREF(unicode_empty);
465	    return (PyObject *)unicode_empty;
466	}
467
468	/* Single character Unicode objects in the Latin-1 range are
469	   shared when using this constructor */
470	if (size == 1 && *u < 256) {
471	    unicode = unicode_latin1[*u];
472	    if (!unicode) {
473		unicode = _PyUnicode_New(1);
474		if (!unicode)
475		    return NULL;
476		unicode->str[0] = *u;
477		unicode_latin1[*u] = unicode;
478	    }
479	    Py_INCREF(unicode);
480	    return (PyObject *)unicode;
481	}
482    }
483
484    unicode = _PyUnicode_New(size);
485    if (!unicode)
486        return NULL;
487
488    /* Copy the Unicode data into the new object */
489    if (u != NULL)
490	Py_UNICODE_COPY(unicode->str, u, size);
491
492    return (PyObject *)unicode;
493}
494
495PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
496{
497    PyUnicodeObject *unicode;
498
499	if (size < 0) {
500		PyErr_SetString(PyExc_SystemError,
501		    "Negative size passed to PyUnicode_FromStringAndSize");
502		return NULL;
503	}
504
505    /* If the Unicode data is known at construction time, we can apply
506       some optimizations which share commonly used objects.
507       Also, this means the input must be UTF-8, so fall back to the
508       UTF-8 decoder at the end. */
509    if (u != NULL) {
510
511	/* Optimization for empty strings */
512	if (size == 0 && unicode_empty != NULL) {
513	    Py_INCREF(unicode_empty);
514	    return (PyObject *)unicode_empty;
515	}
516
517	/* Single characters are shared when using this constructor.
518           Restrict to ASCII, since the input must be UTF-8. */
519	if (size == 1 && Py_CHARMASK(*u) < 128) {
520	    unicode = unicode_latin1[Py_CHARMASK(*u)];
521	    if (!unicode) {
522		unicode = _PyUnicode_New(1);
523		if (!unicode)
524		    return NULL;
525		unicode->str[0] = Py_CHARMASK(*u);
526		unicode_latin1[Py_CHARMASK(*u)] = unicode;
527	    }
528	    Py_INCREF(unicode);
529	    return (PyObject *)unicode;
530	}
531
532        return PyUnicode_DecodeUTF8(u, size, NULL);
533    }
534
535    unicode = _PyUnicode_New(size);
536    if (!unicode)
537        return NULL;
538
539    return (PyObject *)unicode;
540}
541
542PyObject *PyUnicode_FromString(const char *u)
543{
544    size_t size = strlen(u);
545    if (size > PY_SSIZE_T_MAX) {
546        PyErr_SetString(PyExc_OverflowError, "input too long");
547        return NULL;
548    }
549
550    return PyUnicode_FromStringAndSize(u, size);
551}
552
553#ifdef HAVE_WCHAR_H
554
555PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
556				 Py_ssize_t size)
557{
558    PyUnicodeObject *unicode;
559
560    if (w == NULL) {
561        if (size == 0)
562            return PyUnicode_FromStringAndSize(NULL, 0);
563	PyErr_BadInternalCall();
564	return NULL;
565    }
566
567    if (size == -1) {
568        size = wcslen(w);
569    }
570
571    unicode = _PyUnicode_New(size);
572    if (!unicode)
573        return NULL;
574
575    /* Copy the wchar_t data into the new object */
576#ifdef HAVE_USABLE_WCHAR_T
577    memcpy(unicode->str, w, size * sizeof(wchar_t));
578#else
579    {
580	register Py_UNICODE *u;
581	register Py_ssize_t i;
582	u = PyUnicode_AS_UNICODE(unicode);
583	for (i = size; i > 0; i--)
584	    *u++ = *w++;
585    }
586#endif
587
588    return (PyObject *)unicode;
589}
590
591static void
592makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
593{
594	*fmt++ = '%';
595	if (width) {
596		if (zeropad)
597			*fmt++ = '0';
598		fmt += sprintf(fmt, "%d", width);
599	}
600	if (precision)
601		fmt += sprintf(fmt, ".%d", precision);
602	if (longflag)
603		*fmt++ = 'l';
604	else if (size_tflag) {
605		char *f = PY_FORMAT_SIZE_T;
606		while (*f)
607			*fmt++ = *f++;
608	}
609	*fmt++ = c;
610	*fmt = '\0';
611}
612
613#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
614
615PyObject *
616PyUnicode_FromFormatV(const char *format, va_list vargs)
617{
618	va_list count;
619	Py_ssize_t callcount = 0;
620	PyObject **callresults = NULL;
621	PyObject **callresult = NULL;
622	Py_ssize_t n = 0;
623	int width = 0;
624	int precision = 0;
625	int zeropad;
626	const char* f;
627	Py_UNICODE *s;
628	PyObject *string;
629	/* used by sprintf */
630	char buffer[21];
631	/* use abuffer instead of buffer, if we need more space
632	 * (which can happen if there's a format specifier with width). */
633	char *abuffer = NULL;
634	char *realbuffer;
635	Py_ssize_t abuffersize = 0;
636	char fmt[60]; /* should be enough for %0width.precisionld */
637	const char *copy;
638
639#ifdef VA_LIST_IS_ARRAY
640	Py_MEMCPY(count, vargs, sizeof(va_list));
641#else
642#ifdef  __va_copy
643	__va_copy(count, vargs);
644#else
645	count = vargs;
646#endif
647#endif
648	/* step 1: count the number of %S/%R format specifications
649	 * (we call PyObject_Str()/PyObject_Repr() for these objects
650	 * once during step 3 and put the result in an array) */
651	for (f = format; *f; f++) {
652		if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
653			++callcount;
654	}
655	/* step 2: allocate memory for the results of
656	 * PyObject_Str()/PyObject_Repr() calls */
657	if (callcount) {
658		callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
659		if (!callresults) {
660			PyErr_NoMemory();
661			return NULL;
662		}
663		callresult = callresults;
664	}
665	/* step 3: figure out how large a buffer we need */
666	for (f = format; *f; f++) {
667		if (*f == '%') {
668			const char* p = f;
669			width = 0;
670			while (ISDIGIT((unsigned)*f))
671				width = (width*10) + *f++ - '0';
672			while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
673				;
674
675			/* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
676			 * they don't affect the amount of space we reserve.
677			 */
678			if ((*f == 'l' || *f == 'z') &&
679					(f[1] == 'd' || f[1] == 'u'))
680                                ++f;
681
682			switch (*f) {
683			case 'c':
684				(void)va_arg(count, int);
685				/* fall through... */
686			case '%':
687				n++;
688				break;
689			case 'd': case 'u': case 'i': case 'x':
690				(void) va_arg(count, int);
691				/* 20 bytes is enough to hold a 64-bit
692				   integer.  Decimal takes the most space.
693				   This isn't enough for octal.
694				   If a width is specified we need more
695				   (which we allocate later). */
696				if (width < 20)
697					width = 20;
698				n += width;
699				if (abuffersize < width)
700					abuffersize = width;
701				break;
702			case 's':
703			{
704				/* UTF-8 */
705				unsigned char*s;
706				s = va_arg(count, unsigned char*);
707				while (*s) {
708					if (*s < 128) {
709						n++; s++;
710					} else if (*s < 0xc0) {
711						/* invalid UTF-8 */
712						n++; s++;
713					} else if (*s < 0xc0) {
714						n++;
715						s++; if(!*s)break;
716						s++;
717					} else if (*s < 0xe0) {
718						n++;
719						s++; if(!*s)break;
720						s++; if(!*s)break;
721						s++;
722					} else {
723						#ifdef Py_UNICODE_WIDE
724						n++;
725						#else
726						n+=2;
727						#endif
728						s++; if(!*s)break;
729						s++; if(!*s)break;
730						s++; if(!*s)break;
731						s++;
732					}
733				}
734				break;
735			}
736			case 'U':
737			{
738				PyObject *obj = va_arg(count, PyObject *);
739				assert(obj && PyUnicode_Check(obj));
740				n += PyUnicode_GET_SIZE(obj);
741				break;
742			}
743			case 'V':
744			{
745				PyObject *obj = va_arg(count, PyObject *);
746				const char *str = va_arg(count, const char *);
747				assert(obj || str);
748				assert(!obj || PyUnicode_Check(obj));
749				if (obj)
750					n += PyUnicode_GET_SIZE(obj);
751				else
752					n += strlen(str);
753				break;
754			}
755			case 'S':
756			{
757				PyObject *obj = va_arg(count, PyObject *);
758				PyObject *str;
759				assert(obj);
760				str = PyObject_Str(obj);
761				if (!str)
762					goto fail;
763				n += PyUnicode_GET_SIZE(str);
764				/* Remember the str and switch to the next slot */
765				*callresult++ = str;
766				break;
767			}
768			case 'R':
769			{
770				PyObject *obj = va_arg(count, PyObject *);
771				PyObject *repr;
772				assert(obj);
773				repr = PyObject_Repr(obj);
774				if (!repr)
775					goto fail;
776				n += PyUnicode_GET_SIZE(repr);
777				/* Remember the repr and switch to the next slot */
778				*callresult++ = repr;
779				break;
780			}
781			case 'p':
782				(void) va_arg(count, int);
783				/* maximum 64-bit pointer representation:
784				 * 0xffffffffffffffff
785				 * so 19 characters is enough.
786				 * XXX I count 18 -- what's the extra for?
787				 */
788				n += 19;
789				break;
790			default:
791				/* if we stumble upon an unknown
792				   formatting code, copy the rest of
793				   the format string to the output
794				   string. (we cannot just skip the
795				   code, since there's no way to know
796				   what's in the argument list) */
797				n += strlen(p);
798				goto expand;
799			}
800		} else
801			n++;
802	}
803 expand:
804	if (abuffersize > 20) {
805		abuffer = PyObject_Malloc(abuffersize);
806		if (!abuffer) {
807			PyErr_NoMemory();
808			goto fail;
809		}
810		realbuffer = abuffer;
811	}
812	else
813		realbuffer = buffer;
814	/* step 4: fill the buffer */
815	/* Since we've analyzed how much space we need for the worst case,
816	   we don't have to resize the string.
817	   There can be no errors beyond this point. */
818	string = PyUnicode_FromUnicode(NULL, n);
819	if (!string)
820		goto fail;
821
822	s = PyUnicode_AS_UNICODE(string);
823	callresult = callresults;
824
825	for (f = format; *f; f++) {
826		if (*f == '%') {
827			const char* p = f++;
828			int longflag = 0;
829			int size_tflag = 0;
830			zeropad = (*f == '0');
831			/* parse the width.precision part */
832			width = 0;
833			while (ISDIGIT((unsigned)*f))
834				width = (width*10) + *f++ - '0';
835			precision = 0;
836			if (*f == '.') {
837				f++;
838				while (ISDIGIT((unsigned)*f))
839					precision = (precision*10) + *f++ - '0';
840			}
841			/* handle the long flag, but only for %ld and %lu.
842			   others can be added when necessary. */
843			if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
844				longflag = 1;
845				++f;
846			}
847			/* handle the size_t flag. */
848			if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
849				size_tflag = 1;
850				++f;
851			}
852
853			switch (*f) {
854			case 'c':
855				*s++ = va_arg(vargs, int);
856				break;
857			case 'd':
858				makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
859				if (longflag)
860					sprintf(realbuffer, fmt, va_arg(vargs, long));
861				else if (size_tflag)
862					sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
863				else
864					sprintf(realbuffer, fmt, va_arg(vargs, int));
865				appendstring(realbuffer);
866				break;
867			case 'u':
868				makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
869				if (longflag)
870					sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
871				else if (size_tflag)
872					sprintf(realbuffer, fmt, va_arg(vargs, size_t));
873				else
874					sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
875				appendstring(realbuffer);
876				break;
877			case 'i':
878				makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
879				sprintf(realbuffer, fmt, va_arg(vargs, int));
880				appendstring(realbuffer);
881				break;
882			case 'x':
883				makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
884				sprintf(realbuffer, fmt, va_arg(vargs, int));
885				appendstring(realbuffer);
886				break;
887			case 's':
888			{
889				/* Parameter must be UTF-8 encoded.
890				   In case of encoding errors, use
891				   the replacement character. */
892				PyObject *u;
893				p = va_arg(vargs, char*);
894				u = PyUnicode_DecodeUTF8(p, strlen(p),
895							 "replace");
896				if (!u)
897					goto fail;
898				Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
899						PyUnicode_GET_SIZE(u));
900				s += PyUnicode_GET_SIZE(u);
901				Py_DECREF(u);
902				break;
903			}
904			case 'U':
905			{
906				PyObject *obj = va_arg(vargs, PyObject *);
907				Py_ssize_t size = PyUnicode_GET_SIZE(obj);
908				Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
909				s += size;
910				break;
911			}
912			case 'V':
913			{
914				PyObject *obj = va_arg(vargs, PyObject *);
915				const char *str = va_arg(vargs, const char *);
916				if (obj) {
917					Py_ssize_t size = PyUnicode_GET_SIZE(obj);
918					Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
919					s += size;
920				} else {
921					appendstring(str);
922				}
923				break;
924			}
925			case 'S':
926			case 'R':
927			{
928				Py_UNICODE *ucopy;
929				Py_ssize_t usize;
930				Py_ssize_t upos;
931				/* unused, since we already have the result */
932				(void) va_arg(vargs, PyObject *);
933				ucopy = PyUnicode_AS_UNICODE(*callresult);
934				usize = PyUnicode_GET_SIZE(*callresult);
935				for (upos = 0; upos<usize;)
936					*s++ = ucopy[upos++];
937				/* We're done with the unicode()/repr() => forget it */
938				Py_DECREF(*callresult);
939				/* switch to next unicode()/repr() result */
940				++callresult;
941				break;
942			}
943			case 'p':
944				sprintf(buffer, "%p", va_arg(vargs, void*));
945				/* %p is ill-defined:  ensure leading 0x. */
946				if (buffer[1] == 'X')
947					buffer[1] = 'x';
948				else if (buffer[1] != 'x') {
949					memmove(buffer+2, buffer, strlen(buffer)+1);
950					buffer[0] = '0';
951					buffer[1] = 'x';
952				}
953				appendstring(buffer);
954				break;
955			case '%':
956				*s++ = '%';
957				break;
958			default:
959				appendstring(p);
960				goto end;
961			}
962		} else
963			*s++ = *f;
964	}
965
966 end:
967	if (callresults)
968		PyObject_Free(callresults);
969	if (abuffer)
970		PyObject_Free(abuffer);
971	_PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
972	return string;
973 fail:
974	if (callresults) {
975		PyObject **callresult2 = callresults;
976		while (callresult2 < callresult) {
977			Py_DECREF(*callresult2);
978			++callresult2;
979		}
980		PyObject_Free(callresults);
981	}
982	if (abuffer)
983		PyObject_Free(abuffer);
984	return NULL;
985}
986
987#undef appendstring
988
989PyObject *
990PyUnicode_FromFormat(const char *format, ...)
991{
992	PyObject* ret;
993	va_list vargs;
994
995#ifdef HAVE_STDARG_PROTOTYPES
996	va_start(vargs, format);
997#else
998	va_start(vargs);
999#endif
1000	ret = PyUnicode_FromFormatV(format, vargs);
1001	va_end(vargs);
1002	return ret;
1003}
1004
1005Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1006				wchar_t *w,
1007				Py_ssize_t size)
1008{
1009    if (unicode == NULL) {
1010	PyErr_BadInternalCall();
1011	return -1;
1012    }
1013
1014    /* If possible, try to copy the 0-termination as well */
1015    if (size > PyUnicode_GET_SIZE(unicode))
1016	size = PyUnicode_GET_SIZE(unicode) + 1;
1017
1018#ifdef HAVE_USABLE_WCHAR_T
1019    memcpy(w, unicode->str, size * sizeof(wchar_t));
1020#else
1021    {
1022	register Py_UNICODE *u;
1023	register Py_ssize_t i;
1024	u = PyUnicode_AS_UNICODE(unicode);
1025	for (i = size; i > 0; i--)
1026	    *w++ = *u++;
1027    }
1028#endif
1029
1030    if (size > PyUnicode_GET_SIZE(unicode))
1031        return PyUnicode_GET_SIZE(unicode);
1032    else
1033    return size;
1034}
1035
1036#endif
1037
1038PyObject *PyUnicode_FromOrdinal(int ordinal)
1039{
1040    Py_UNICODE s[2];
1041
1042    if (ordinal < 0 || ordinal > 0x10ffff) {
1043	PyErr_SetString(PyExc_ValueError,
1044			"chr() arg not in range(0x110000)");
1045	return NULL;
1046    }
1047
1048#ifndef Py_UNICODE_WIDE
1049    if (ordinal > 0xffff) {
1050        ordinal -= 0x10000;
1051        s[0] = 0xD800 | (ordinal >> 10);
1052        s[1] = 0xDC00 | (ordinal & 0x3FF);
1053        return PyUnicode_FromUnicode(s, 2);
1054    }
1055#endif
1056
1057    s[0] = (Py_UNICODE)ordinal;
1058    return PyUnicode_FromUnicode(s, 1);
1059}
1060
1061PyObject *PyUnicode_FromObject(register PyObject *obj)
1062{
1063    /* XXX Perhaps we should make this API an alias of
1064           PyObject_Str() instead ?! */
1065    if (PyUnicode_CheckExact(obj)) {
1066	Py_INCREF(obj);
1067	return obj;
1068    }
1069    if (PyUnicode_Check(obj)) {
1070	/* For a Unicode subtype that's not a Unicode object,
1071	   return a true Unicode object with the same data. */
1072	return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1073				     PyUnicode_GET_SIZE(obj));
1074    }
1075    PyErr_Format(PyExc_TypeError,
1076                 "Can't convert '%.100s' object to str implicitly",
1077                 Py_TYPE(obj)->tp_name);
1078    return NULL;
1079}
1080
1081PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1082				      const char *encoding,
1083				      const char *errors)
1084{
1085    const char *s = NULL;
1086    Py_ssize_t len;
1087    PyObject *v;
1088
1089    if (obj == NULL) {
1090	PyErr_BadInternalCall();
1091	return NULL;
1092    }
1093
1094    if (PyUnicode_Check(obj)) {
1095	PyErr_SetString(PyExc_TypeError,
1096			"decoding Unicode is not supported");
1097	return NULL;
1098	}
1099
1100    /* Coerce object */
1101    if (PyBytes_Check(obj)) {
1102        s = PyBytes_AS_STRING(obj);
1103        len = PyBytes_GET_SIZE(obj);
1104    }
1105    else if (PyByteArray_Check(obj)) {
1106        s = PyByteArray_AS_STRING(obj);
1107        len = PyByteArray_GET_SIZE(obj);
1108    }
1109    else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1110	/* Overwrite the error message with something more useful in
1111	   case of a TypeError. */
1112	if (PyErr_ExceptionMatches(PyExc_TypeError))
1113            PyErr_Format(PyExc_TypeError,
1114			 "coercing to Unicode: need string or buffer, "
1115			 "%.80s found",
1116		     Py_TYPE(obj)->tp_name);
1117	goto onError;
1118    }
1119
1120    /* Convert to Unicode */
1121    if (len == 0) {
1122	Py_INCREF(unicode_empty);
1123	v = (PyObject *)unicode_empty;
1124    }
1125    else
1126	v = PyUnicode_Decode(s, len, encoding, errors);
1127
1128    return v;
1129
1130 onError:
1131    return NULL;
1132}
1133
1134PyObject *PyUnicode_Decode(const char *s,
1135			   Py_ssize_t size,
1136			   const char *encoding,
1137			   const char *errors)
1138{
1139    PyObject *buffer = NULL, *unicode;
1140    Py_buffer info;
1141    char lower[20];  /* Enough for any encoding name we recognize */
1142    char *l;
1143    const char *e;
1144
1145    if (encoding == NULL)
1146        encoding = PyUnicode_GetDefaultEncoding();
1147
1148    /* Convert encoding to lower case and replace '_' with '-' in order to
1149       catch e.g. UTF_8 */
1150    e = encoding;
1151    l = lower;
1152    while (*e && l < &lower[(sizeof lower) - 2]) {
1153        if (ISUPPER(*e)) {
1154            *l++ = TOLOWER(*e++);
1155        }
1156        else if (*e == '_') {
1157            *l++ = '-';
1158            e++;
1159        }
1160        else {
1161            *l++ = *e++;
1162        }
1163    }
1164    *l = '\0';
1165
1166    /* Shortcuts for common default encodings */
1167    if (strcmp(lower, "utf-8") == 0)
1168        return PyUnicode_DecodeUTF8(s, size, errors);
1169    else if ((strcmp(lower, "latin-1") == 0) ||
1170             (strcmp(lower, "iso-8859-1") == 0))
1171        return PyUnicode_DecodeLatin1(s, size, errors);
1172#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1173    else if (strcmp(lower, "mbcs") == 0)
1174        return PyUnicode_DecodeMBCS(s, size, errors);
1175#endif
1176    else if (strcmp(lower, "ascii") == 0)
1177        return PyUnicode_DecodeASCII(s, size, errors);
1178    else if (strcmp(lower, "utf-16") == 0)
1179        return PyUnicode_DecodeUTF16(s, size, errors, 0);
1180    else if (strcmp(lower, "utf-32") == 0)
1181        return PyUnicode_DecodeUTF32(s, size, errors, 0);
1182
1183    /* Decode via the codec registry */
1184    buffer = NULL;
1185    if (PyBuffer_FillInfo(&info, (void *)s, size, 1, PyBUF_SIMPLE) < 0)
1186        goto onError;
1187    buffer = PyMemoryView_FromMemory(&info);
1188    if (buffer == NULL)
1189        goto onError;
1190    unicode = PyCodec_Decode(buffer, encoding, errors);
1191    if (unicode == NULL)
1192        goto onError;
1193    if (!PyUnicode_Check(unicode)) {
1194        PyErr_Format(PyExc_TypeError,
1195                     "decoder did not return a unicode object (type=%.400s)",
1196                     Py_TYPE(unicode)->tp_name);
1197        Py_DECREF(unicode);
1198        goto onError;
1199    }
1200    Py_DECREF(buffer);
1201    return unicode;
1202
1203 onError:
1204    Py_XDECREF(buffer);
1205    return NULL;
1206}
1207
1208PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1209                                    const char *encoding,
1210                                    const char *errors)
1211{
1212    PyObject *v;
1213
1214    if (!PyUnicode_Check(unicode)) {
1215        PyErr_BadArgument();
1216        goto onError;
1217    }
1218
1219    if (encoding == NULL)
1220	encoding = PyUnicode_GetDefaultEncoding();
1221
1222    /* Decode via the codec registry */
1223    v = PyCodec_Decode(unicode, encoding, errors);
1224    if (v == NULL)
1225        goto onError;
1226    return v;
1227
1228 onError:
1229    return NULL;
1230}
1231
1232PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1233                                     const char *encoding,
1234                                     const char *errors)
1235{
1236    PyObject *v;
1237
1238    if (!PyUnicode_Check(unicode)) {
1239        PyErr_BadArgument();
1240        goto onError;
1241    }
1242
1243    if (encoding == NULL)
1244	encoding = PyUnicode_GetDefaultEncoding();
1245
1246    /* Decode via the codec registry */
1247    v = PyCodec_Decode(unicode, encoding, errors);
1248    if (v == NULL)
1249        goto onError;
1250    if (!PyUnicode_Check(v)) {
1251        PyErr_Format(PyExc_TypeError,
1252                     "decoder did not return a unicode object (type=%.400s)",
1253                     Py_TYPE(v)->tp_name);
1254        Py_DECREF(v);
1255        goto onError;
1256    }
1257    return v;
1258
1259 onError:
1260    return NULL;
1261}
1262
1263PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1264			   Py_ssize_t size,
1265			   const char *encoding,
1266			   const char *errors)
1267{
1268    PyObject *v, *unicode;
1269
1270    unicode = PyUnicode_FromUnicode(s, size);
1271    if (unicode == NULL)
1272	return NULL;
1273    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1274    Py_DECREF(unicode);
1275    return v;
1276}
1277
1278PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1279                                    const char *encoding,
1280                                    const char *errors)
1281{
1282    PyObject *v;
1283
1284    if (!PyUnicode_Check(unicode)) {
1285        PyErr_BadArgument();
1286        goto onError;
1287    }
1288
1289    if (encoding == NULL)
1290	encoding = PyUnicode_GetDefaultEncoding();
1291
1292    /* Encode via the codec registry */
1293    v = PyCodec_Encode(unicode, encoding, errors);
1294    if (v == NULL)
1295        goto onError;
1296    return v;
1297
1298 onError:
1299    return NULL;
1300}
1301
1302PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1303                                    const char *encoding,
1304                                    const char *errors)
1305{
1306    PyObject *v;
1307
1308    if (!PyUnicode_Check(unicode)) {
1309        PyErr_BadArgument();
1310        goto onError;
1311    }
1312
1313    if (encoding == NULL)
1314	encoding = PyUnicode_GetDefaultEncoding();
1315
1316    /* Shortcuts for common default encodings */
1317    if (errors == NULL) {
1318	if (strcmp(encoding, "utf-8") == 0)
1319	    return PyUnicode_AsUTF8String(unicode);
1320	else if (strcmp(encoding, "latin-1") == 0)
1321	    return PyUnicode_AsLatin1String(unicode);
1322#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1323	else if (strcmp(encoding, "mbcs") == 0)
1324	    return PyUnicode_AsMBCSString(unicode);
1325#endif
1326	else if (strcmp(encoding, "ascii") == 0)
1327	    return PyUnicode_AsASCIIString(unicode);
1328    }
1329
1330    /* Encode via the codec registry */
1331    v = PyCodec_Encode(unicode, encoding, errors);
1332    if (v == NULL)
1333        goto onError;
1334    if (PyByteArray_Check(v)) {
1335        char msg[100];
1336        PyOS_snprintf(msg, sizeof(msg),
1337                      "encoder %s returned buffer instead of bytes",
1338                      encoding);
1339        if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
1340            v = NULL;
1341            goto onError;
1342        }
1343        v = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1344    }
1345    else if (!PyBytes_Check(v)) {
1346        PyErr_Format(PyExc_TypeError,
1347                     "encoder did not return a bytes object (type=%.400s)",
1348                     Py_TYPE(v)->tp_name);
1349        v = NULL;
1350    }
1351    return v;
1352
1353 onError:
1354    return NULL;
1355}
1356
1357PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1358                                     const char *encoding,
1359                                     const char *errors)
1360{
1361    PyObject *v;
1362
1363    if (!PyUnicode_Check(unicode)) {
1364        PyErr_BadArgument();
1365        goto onError;
1366    }
1367
1368    if (encoding == NULL)
1369	encoding = PyUnicode_GetDefaultEncoding();
1370
1371    /* Encode via the codec registry */
1372    v = PyCodec_Encode(unicode, encoding, errors);
1373    if (v == NULL)
1374        goto onError;
1375    if (!PyUnicode_Check(v)) {
1376        PyErr_Format(PyExc_TypeError,
1377                     "encoder did not return an unicode object (type=%.400s)",
1378                     Py_TYPE(v)->tp_name);
1379        Py_DECREF(v);
1380        goto onError;
1381    }
1382    return v;
1383
1384 onError:
1385    return NULL;
1386}
1387
1388PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1389					    const char *errors)
1390{
1391    PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1392    if (v)
1393        return v;
1394    if (errors != NULL)
1395        Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
1396    v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1397                             PyUnicode_GET_SIZE(unicode),
1398                             NULL);
1399    if (!v)
1400        return NULL;
1401    ((PyUnicodeObject *)unicode)->defenc = v;
1402    return v;
1403}
1404
1405PyObject*
1406PyUnicode_DecodeFSDefault(const char *s) {
1407    Py_ssize_t size = (Py_ssize_t)strlen(s);
1408    return PyUnicode_DecodeFSDefaultAndSize(s, size);
1409}
1410
1411PyObject*
1412PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1413{
1414    /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1415       can be undefined. If it is case, decode using UTF-8. The following assumes
1416       that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1417       bootstrapping process where the codecs aren't ready yet.
1418    */
1419    if (Py_FileSystemDefaultEncoding) {
1420#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1421        if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
1422            return PyUnicode_DecodeMBCS(s, size, "replace");
1423        }
1424#elif defined(__APPLE__)
1425        if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
1426            return PyUnicode_DecodeUTF8(s, size, "replace");
1427        }
1428#endif
1429        return PyUnicode_Decode(s, size,
1430                                Py_FileSystemDefaultEncoding,
1431                                "replace");
1432    }
1433    else {
1434        return PyUnicode_DecodeUTF8(s, size, "replace");
1435    }
1436}
1437
1438char*
1439PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
1440{
1441    PyObject *bytes;
1442    if (!PyUnicode_Check(unicode)) {
1443        PyErr_BadArgument();
1444        return NULL;
1445    }
1446    bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1447    if (bytes == NULL)
1448        return NULL;
1449    if (psize != NULL)
1450        *psize = PyBytes_GET_SIZE(bytes);
1451    return PyBytes_AS_STRING(bytes);
1452}
1453
1454char*
1455PyUnicode_AsString(PyObject *unicode)
1456{
1457    return PyUnicode_AsStringAndSize(unicode, NULL);
1458}
1459
1460Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1461{
1462    if (!PyUnicode_Check(unicode)) {
1463        PyErr_BadArgument();
1464        goto onError;
1465    }
1466    return PyUnicode_AS_UNICODE(unicode);
1467
1468 onError:
1469    return NULL;
1470}
1471
1472Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1473{
1474    if (!PyUnicode_Check(unicode)) {
1475        PyErr_BadArgument();
1476        goto onError;
1477    }
1478    return PyUnicode_GET_SIZE(unicode);
1479
1480 onError:
1481    return -1;
1482}
1483
1484const char *PyUnicode_GetDefaultEncoding(void)
1485{
1486    return unicode_default_encoding;
1487}
1488
1489int PyUnicode_SetDefaultEncoding(const char *encoding)
1490{
1491    if (strcmp(encoding, unicode_default_encoding) != 0) {
1492        PyErr_Format(PyExc_ValueError,
1493                     "Can only set default encoding to %s",
1494                     unicode_default_encoding);
1495        return -1;
1496    }
1497    return 0;
1498}
1499
1500/* error handling callback helper:
1501   build arguments, call the callback and check the arguments,
1502   if no exception occurred, copy the replacement to the output
1503   and adjust various state variables.
1504   return 0 on success, -1 on error
1505*/
1506
1507static
1508int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1509                 const char *encoding, const char *reason,
1510                 const char **input, const char **inend, Py_ssize_t *startinpos,
1511                 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1512                 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
1513{
1514    static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
1515
1516    PyObject *restuple = NULL;
1517    PyObject *repunicode = NULL;
1518    Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1519    Py_ssize_t insize;
1520    Py_ssize_t requiredsize;
1521    Py_ssize_t newpos;
1522    Py_UNICODE *repptr;
1523    PyObject *inputobj = NULL;
1524    Py_ssize_t repsize;
1525    int res = -1;
1526
1527    if (*errorHandler == NULL) {
1528	*errorHandler = PyCodec_LookupError(errors);
1529	if (*errorHandler == NULL)
1530	   goto onError;
1531    }
1532
1533    if (*exceptionObject == NULL) {
1534    	*exceptionObject = PyUnicodeDecodeError_Create(
1535	    encoding, *input, *inend-*input, *startinpos, *endinpos, reason);
1536	if (*exceptionObject == NULL)
1537	   goto onError;
1538    }
1539    else {
1540	if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1541	    goto onError;
1542	if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1543	    goto onError;
1544	if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1545	    goto onError;
1546    }
1547
1548    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1549    if (restuple == NULL)
1550	goto onError;
1551    if (!PyTuple_Check(restuple)) {
1552	PyErr_Format(PyExc_TypeError, &argparse[4]);
1553	goto onError;
1554    }
1555    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1556	goto onError;
1557
1558    /* Copy back the bytes variables, which might have been modified by the
1559       callback */
1560    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1561    if (!inputobj)
1562        goto onError;
1563    if (!PyBytes_Check(inputobj)) {
1564	PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1565    }
1566    *input = PyBytes_AS_STRING(inputobj);
1567    insize = PyBytes_GET_SIZE(inputobj);
1568    *inend = *input + insize;
1569    /* we can DECREF safely, as the exception has another reference,
1570       so the object won't go away. */
1571    Py_DECREF(inputobj);
1572
1573    if (newpos<0)
1574	newpos = insize+newpos;
1575    if (newpos<0 || newpos>insize) {
1576	PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1577	goto onError;
1578    }
1579
1580    /* need more space? (at least enough for what we
1581       have+the replacement+the rest of the string (starting
1582       at the new input position), so we won't have to check space
1583       when there are no errors in the rest of the string) */
1584    repptr = PyUnicode_AS_UNICODE(repunicode);
1585    repsize = PyUnicode_GET_SIZE(repunicode);
1586    requiredsize = *outpos + repsize + insize-newpos;
1587    if (requiredsize > outsize) {
1588	if (requiredsize<2*outsize)
1589	    requiredsize = 2*outsize;
1590	if (PyUnicode_Resize(output, requiredsize) < 0)
1591	    goto onError;
1592	*outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1593    }
1594    *endinpos = newpos;
1595    *inptr = *input + newpos;
1596    Py_UNICODE_COPY(*outptr, repptr, repsize);
1597    *outptr += repsize;
1598    *outpos += repsize;
1599
1600    /* we made it! */
1601    res = 0;
1602
1603    onError:
1604    Py_XDECREF(restuple);
1605    return res;
1606}
1607
1608/* --- UTF-7 Codec -------------------------------------------------------- */
1609
1610/* see RFC2152 for details */
1611
static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
	   0 - not special (RFC2152 Set D: letters, digits and '(),-./:? )
	   1 - special
	   2 - whitespace (optional)
	   3 - RFC2152 Set O (optional)

       One row per 16 code points.  '+' is marked special because it
       introduces a shift sequence; '\\' and '~' are in neither Set D nor
       Set O.  '/' belongs to Set D and is therefore 0. */
    /* 0x00 - 0x0F: control characters; tab, LF and CR are whitespace */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1F: control characters */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F: sp ! " # $ % & ' ( ) * + , - . / */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 0,
    /* 0x30 - 0x3F: 0-9 : ; < = > ? */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4F: @ A-O */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F: P-Z [ \ ] ^ _ */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6F: ` a-o */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F: p-z { | } ~ DEL */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
1630
/* SPECIAL(c, encodeO, encodeWS) is true when `c` may not appear
   literally in UTF-7 text: anything above 127, NUL or below, any
   character marked 1 in utf7_special, whitespace (marked 2) when
   encodeWS is set, and Set O characters (marked 3) when encodeO is set.
   `c` is evaluated several times, so it must be free of side effects.

   Note: The comparison (c) <= 0 is a trick to work around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true  */

#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): the base64 alphabet character for the low six bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* B64CHAR(c): true when c is alphanumeric (per ISALNUM, which is
   defined outside this block), '+' or '/'. */
#define B64CHAR(c) \
    (ISALNUM(c) || (c) == '+' || (c) == '/')
/* UB64(c): inverse of B64 for ASCII input: '+' -> 62, '/' -> 63,
   'a'..'z' -> 26..51, 'A'..'Z' -> 0..25, everything else is treated as
   a digit ('0'..'9' -> 52..61).  No validation is done here; callers
   are expected to have checked B64CHAR(c) first. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least six bits are pending in `ch`,
   emit the top six as one base64 character through `out` and reduce
   `bits` accordingly.  Updates `out` and `bits` in place.  Expands to a
   bare `while` statement (not wrapped in do/while). */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* DECODE(out, ch, bits, surrogate): while at least sixteen bits are
   pending in `ch`, take the top sixteen as one Py_UNICODE code unit and
   reduce `bits` by 16.  If `surrogate` is set, the unit is dropped and
   the flag cleared.  Otherwise a unit in 0xDC00..0xDFFF sets
   `surrogate`, assigns `errmsg` and jumps to `utf7Error`; any other
   unit is appended through `out`.

   The expansion site must provide a variable named `errmsg` and a label
   named `utf7Error`.  Expands to a bare `while` statement. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
1673
1674PyObject *PyUnicode_DecodeUTF7(const char *s,
1675			       Py_ssize_t size,
1676			       const char *errors)
1677{
1678    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1679}
1680
/* Decode `size` bytes of UTF-7 data at `s` into a new unicode object.

   s        - input bytes
   size     - number of input bytes
   errors   - error handler name, passed to
              unicode_decode_call_errorhandler()
   consumed - if not NULL, receives the number of input bytes that were
              fully processed; an unterminated shift sequence at the end
              is then not an error and *consumed is set to the position
              of the '+' that opened it.  If NULL, an unterminated shift
              sequence is reported through the error handler.

   Returns a new reference, or NULL with an exception set. */
PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
			       Py_ssize_t size,
			       const char *errors,
			       Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const char *errmsg = "";
    int inShift = 0;            /* non-zero while inside a +...- sequence */
    unsigned int bitsleft = 0;  /* number of pending bits in charsleft */
    unsigned long charsleft = 0; /* accumulator for base64-decoded bits */
    int surrogate = 0;          /* state flag maintained by DECODE() */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* The output starts out with one code unit per input byte.
       NOTE(review): presumably this is always enough for error-free
       input; the error handler helper grows it when a replacement is
       inserted. */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    p = unicode->str;
    e = s + size;

    while (s < e) {
        Py_UNICODE ch;
        /* Re-entry point used after the "unterminated shift sequence"
           handler below has moved `s` back inside the input. */
        restart:
        ch = *s;

        if (inShift) {
            if ((ch == '-') || !B64CHAR(ch)) {
                /* End of the shift sequence: explicit ('-') or implicit
                   (any character outside the base64 alphabet). */
                inShift = 0;
                s++;

                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
                if (bitsleft >= 6) {
                    /* The shift sequence has a partial character in it. If
                       bitsleft < 6 then we could just classify it as padding
                       but that is not the case here */

                    errmsg = "partial character in shift sequence";
                    goto utf7Error;
                }
                /* According to RFC2152 the remaining bits should be zero. We
                   choose to signal an error/insert a replacement character
                   here so indicate the potential of a misencoded character. */

                /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
                if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
                    errmsg = "non-zero padding bits in shift sequence";
                    goto utf7Error;
                }

                if (ch == '-') {
                    /* NOTE(review): a second '-' right after the
                       terminator emits '-' and re-enters shift mode
                       without consuming it -- confirm this is intended. */
                    if ((s < e) && (*(s) == '-')) {
                        *p++ = '-';
                        inShift = 1;
                    }
                } else if (SPECIAL(ch,0,0)) {
                    errmsg = "unexpected special character";
	                goto utf7Error;
                } else  {
                    /* The implicit terminator is itself literal output. */
                    *p++ = ch;
                }
            } else {
                /* Base64 character: add six bits and emit any complete
                   16-bit units. */
                charsleft = (charsleft << 6) | UB64(ch);
                bitsleft += 6;
                s++;
                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
            }
        }
        else if ( ch == '+' ) {
            /* Remember where the shift sequence starts; errors raised
               while in shift mode are reported from this position. */
            startinpos = s-starts;
            s++;
            if (s < e && *s == '-') {
                /* "+-" is the escape for a literal '+'. */
                s++;
                *p++ = '+';
            } else
            {
                inShift = 1;
                bitsleft = 0;
            }
        }
        else if (SPECIAL(ch,0,0)) {
            startinpos = s-starts;
            errmsg = "unexpected special character";
            s++;
            goto utf7Error;
        }
        else {
            /* Directly encoded character. */
            *p++ = ch;
            s++;
        }
        continue;
    utf7Error:
        /* Hand the error to the handler; it may replace the input,
           move `s`, and grow/relocate the output buffer. */
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler(
             errors, &errorHandler,
             "utf7", errmsg,
             &starts, &e, &startinpos, &endinpos, &exc, &s,
             (PyObject **)&unicode, &outpos, &p))
        goto onError;
    }

    /* Input exhausted while still in shift mode: an error only for
       non-incremental decoding. */
    if (inShift && !consumed) {
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = size;
        if (unicode_decode_call_errorhandler(
             errors, &errorHandler,
             "utf7", "unterminated shift sequence",
             &starts, &e, &startinpos, &endinpos, &exc, &s,
             (PyObject **)&unicode, &outpos, &p))
            goto onError;
        /* The handler may have pointed `s` back into the input. */
        if (s < e)
           goto restart;
    }
    if (consumed) {
        if(inShift)
            *consumed = startinpos;
        else
            *consumed = s-starts;
    }

    /* Trim the output to the number of code units actually written. */
    if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
1826
1827
1828PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1829                   Py_ssize_t size,
1830                   int encodeSetO,
1831                   int encodeWhiteSpace,
1832                   const char *errors)
1833{
1834    PyObject *v, *result;
1835    /* It might be possible to tighten this worst case */
1836    Py_ssize_t cbAllocated = 5 * size;
1837    int inShift = 0;
1838    Py_ssize_t i = 0;
1839    unsigned int bitsleft = 0;
1840    unsigned long charsleft = 0;
1841    char * out;
1842    char * start;
1843
1844    if (size == 0)
1845       return PyBytes_FromStringAndSize(NULL, 0);
1846
1847    v = PyByteArray_FromStringAndSize(NULL, cbAllocated);
1848    if (v == NULL)
1849        return NULL;
1850
1851    start = out = PyByteArray_AS_STRING(v);
1852    for (;i < size; ++i) {
1853        Py_UNICODE ch = s[i];
1854
1855        if (!inShift) {
1856            if (ch == '+') {
1857                *out++ = '+';
1858                *out++ = '-';
1859            } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1860                charsleft = ch;
1861                bitsleft = 16;
1862                *out++ = '+';
1863                /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1864                inShift = bitsleft > 0;
1865            } else {
1866                *out++ = (char) ch;
1867            }
1868        } else {
1869            if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1870                *out++ = B64(charsleft << (6-bitsleft));
1871                charsleft = 0;
1872                bitsleft = 0;
1873                /* Characters not in the BASE64 set implicitly unshift the sequence
1874                   so no '-' is required, except if the character is itself a '-' */
1875                if (B64CHAR(ch) || ch == '-') {
1876                    *out++ = '-';
1877                }
1878                inShift = 0;
1879                *out++ = (char) ch;
1880            } else {
1881                bitsleft += 16;
1882                charsleft = (charsleft << 16) | ch;
1883                /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1884
1885                /* If the next character is special then we dont' need to terminate
1886                   the shift sequence. If the next character is not a BASE64 character
1887                   or '-' then the shift sequence will be terminated implicitly and we
1888                   don't have to insert a '-'. */
1889
1890                if (bitsleft == 0) {
1891                    if (i + 1 < size) {
1892                        Py_UNICODE ch2 = s[i+1];
1893
1894                        if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
1895
1896                        } else if (B64CHAR(ch2) || ch2 == '-') {
1897                            *out++ = '-';
1898                            inShift = 0;
1899                        } else {
1900                            inShift = 0;
1901                        }
1902
1903                    }
1904                    else {
1905                        *out++ = '-';
1906                        inShift = 0;
1907                    }
1908                }
1909            }
1910        }
1911    }
1912    if (bitsleft) {
1913        *out++= B64(charsleft << (6-bitsleft) );
1914        *out++ = '-';
1915    }
1916
1917    result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), out - start);
1918    Py_DECREF(v);
1919    return result;
1920}
1921
1922#undef SPECIAL
1923#undef B64
1924#undef B64CHAR
1925#undef UB64
1926#undef ENCODE
1927#undef DECODE
1928
1929/* --- UTF-8 Codec -------------------------------------------------------- */
1930
static
char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details

       One row per 16 byte values:
         0x00-0x7F -> 1
         0x80-0xBF -> 0
         0xC0-0xDF -> 2
         0xE0-0xEF -> 3
         0xF0-0xF7 -> 4
         0xF8-0xFB -> 5
         0xFC-0xFD -> 6
         0xFE-0xFF -> 0 */
    /* 0x00 - 0x7F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1952
1953PyObject *PyUnicode_DecodeUTF8(const char *s,
1954			       Py_ssize_t size,
1955			       const char *errors)
1956{
1957    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1958}
1959
1960PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1961			                Py_ssize_t size,
1962			                const char *errors,
1963			                Py_ssize_t *consumed)
1964{
1965    const char *starts = s;
1966    int n;
1967    Py_ssize_t startinpos;
1968    Py_ssize_t endinpos;
1969    Py_ssize_t outpos;
1970    const char *e;
1971    PyUnicodeObject *unicode;
1972    Py_UNICODE *p;
1973    const char *errmsg = "";
1974    PyObject *errorHandler = NULL;
1975    PyObject *exc = NULL;
1976
1977    /* Note: size will always be longer than the resulting Unicode
1978       character count */
1979    unicode = _PyUnicode_New(size);
1980    if (!unicode)
1981        return NULL;
1982    if (size == 0) {
1983        if (consumed)
1984            *consumed = 0;
1985        return (PyObject *)unicode;
1986    }
1987
1988    /* Unpack UTF-8 encoded data */
1989    p = unicode->str;
1990    e = s + size;
1991
1992    while (s < e) {
1993        Py_UCS4 ch = (unsigned char)*s;
1994
1995        if (ch < 0x80) {
1996            *p++ = (Py_UNICODE)ch;
1997            s++;
1998            continue;
1999        }
2000
2001        n = utf8_code_length[ch];
2002
2003        if (s + n > e) {
2004	    if (consumed)
2005		break;
2006	    else {
2007		errmsg = "unexpected end of data";
2008		startinpos = s-starts;
2009		endinpos = size;
2010		goto utf8Error;
2011	    }
2012	}
2013
2014        switch (n) {
2015
2016        case 0:
2017            errmsg = "unexpected code byte";
2018	    startinpos = s-starts;
2019	    endinpos = startinpos+1;
2020	    goto utf8Error;
2021
2022        case 1:
2023            errmsg = "internal error";
2024	    startinpos = s-starts;
2025	    endinpos = startinpos+1;
2026	    goto utf8Error;
2027
2028        case 2:
2029            if ((s[1] & 0xc0) != 0x80) {
2030                errmsg = "invalid data";
2031		startinpos = s-starts;
2032		endinpos = startinpos+2;
2033		goto utf8Error;
2034	    }
2035            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
2036            if (ch < 0x80) {
2037		startinpos = s-starts;
2038		endinpos = startinpos+2;
2039                errmsg = "illegal encoding";
2040		goto utf8Error;
2041	    }
2042	    else
2043		*p++ = (Py_UNICODE)ch;
2044            break;
2045
2046        case 3:
2047            if ((s[1] & 0xc0) != 0x80 ||
2048                (s[2] & 0xc0) != 0x80) {
2049                errmsg = "invalid data";
2050		startinpos = s-starts;
2051		endinpos = startinpos+3;
2052		goto utf8Error;
2053	    }
2054            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
2055            if (ch < 0x0800) {
2056		/* Note: UTF-8 encodings of surrogates are considered
2057		   legal UTF-8 sequences;
2058
2059		   XXX For wide builds (UCS-4) we should probably try
2060		       to recombine the surrogates into a single code
2061		       unit.
2062		*/
2063                errmsg = "illegal encoding";
2064		startinpos = s-starts;
2065		endinpos = startinpos+3;
2066		goto utf8Error;
2067	    }
2068	    else
2069		*p++ = (Py_UNICODE)ch;
2070            break;
2071
2072        case 4:
2073            if ((s[1] & 0xc0) != 0x80 ||
2074                (s[2] & 0xc0) != 0x80 ||
2075                (s[3] & 0xc0) != 0x80) {
2076                errmsg = "invalid data";
2077		startinpos = s-starts;
2078		endinpos = startinpos+4;
2079		goto utf8Error;
2080	    }
2081            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2082                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2083            /* validate and convert to UTF-16 */
2084            if ((ch < 0x10000)        /* minimum value allowed for 4
2085					 byte encoding */
2086                || (ch > 0x10ffff))   /* maximum value allowed for
2087					 UTF-16 */
2088	    {
2089                errmsg = "illegal encoding";
2090		startinpos = s-starts;
2091		endinpos = startinpos+4;
2092		goto utf8Error;
2093	    }
2094#ifdef Py_UNICODE_WIDE
2095	    *p++ = (Py_UNICODE)ch;
2096#else
2097            /*  compute and append the two surrogates: */
2098
2099            /*  translate from 10000..10FFFF to 0..FFFF */
2100            ch -= 0x10000;
2101
2102            /*  high surrogate = top 10 bits added to D800 */
2103            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
2104
2105            /*  low surrogate = bottom 10 bits added to DC00 */
2106            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
2107#endif
2108            break;
2109
2110        default:
2111            /* Other sizes are only needed for UCS-4 */
2112            errmsg = "unsupported Unicode code range";
2113	    startinpos = s-starts;
2114	    endinpos = startinpos+n;
2115	    goto utf8Error;
2116        }
2117        s += n;
2118	continue;
2119
2120    utf8Error:
2121    outpos = p-PyUnicode_AS_UNICODE(unicode);
2122    if (unicode_decode_call_errorhandler(
2123	     errors, &errorHandler,
2124	     "utf8", errmsg,
2125	     &starts, &e, &startinpos, &endinpos, &exc, &s,
2126	     (PyObject **)&unicode, &outpos, &p))
2127	goto onError;
2128    }
2129    if (consumed)
2130	*consumed = s-starts;
2131
2132    /* Adjust length */
2133    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2134        goto onError;
2135
2136    Py_XDECREF(errorHandler);
2137    Py_XDECREF(exc);
2138    return (PyObject *)unicode;
2139
2140onError:
2141    Py_XDECREF(errorHandler);
2142    Py_XDECREF(exc);
2143    Py_DECREF(unicode);
2144    return NULL;
2145}
2146
2147/* Allocation strategy:  if the string is short, convert into a stack buffer
2148   and allocate exactly as much space needed at the end.  Else allocate the
2149   maximum possible needed (4 result bytes per Unicode character), and return
2150   the excess memory at the end.
2151*/
2152PyObject *
2153PyUnicode_EncodeUTF8(const Py_UNICODE *s,
2154		     Py_ssize_t size,
2155		     const char *errors)
2156{
2157#define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
2158
2159    Py_ssize_t i;                /* index into s of next input byte */
2160    PyObject *result;            /* result string object */
2161    char *p;                     /* next free byte in output buffer */
2162    Py_ssize_t nallocated;      /* number of result bytes allocated */
2163    Py_ssize_t nneeded;            /* number of result bytes needed */
2164    char stackbuf[MAX_SHORT_UNICHARS * 4];
2165
2166    assert(s != NULL);
2167    assert(size >= 0);
2168
2169    if (size <= MAX_SHORT_UNICHARS) {
2170        /* Write into the stack buffer; nallocated can't overflow.
2171         * At the end, we'll allocate exactly as much heap space as it
2172         * turns out we need.
2173         */
2174        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2175        result = NULL;   /* will allocate after we're done */
2176        p = stackbuf;
2177    }
2178    else {
2179        /* Overallocate on the heap, and give the excess back at the end. */
2180        nallocated = size * 4;
2181        if (nallocated / 4 != size)  /* overflow! */
2182            return PyErr_NoMemory();
2183        result = PyBytes_FromStringAndSize(NULL, nallocated);
2184        if (result == NULL)
2185            return NULL;
2186        p = PyBytes_AS_STRING(result);
2187    }
2188
2189    for (i = 0; i < size;) {
2190        Py_UCS4 ch = s[i++];
2191
2192        if (ch < 0x80)
2193            /* Encode ASCII */
2194            *p++ = (char) ch;
2195
2196        else if (ch < 0x0800) {
2197            /* Encode Latin-1 */
2198            *p++ = (char)(0xc0 | (ch >> 6));
2199            *p++ = (char)(0x80 | (ch & 0x3f));
2200        }
2201        else {
2202            /* Encode UCS2 Unicode ordinals */
2203            if (ch < 0x10000) {
2204                /* Special case: check for high surrogate */
2205                if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2206                    Py_UCS4 ch2 = s[i];
2207                    /* Check for low surrogate and combine the two to
2208                       form a UCS4 value */
2209                    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2210                        ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2211                        i++;
2212                        goto encodeUCS4;
2213                    }
2214                    /* Fall through: handles isolated high surrogates */
2215                }
2216                *p++ = (char)(0xe0 | (ch >> 12));
2217                *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2218                *p++ = (char)(0x80 | (ch & 0x3f));
2219                continue;
2220    	    }
2221encodeUCS4:
2222            /* Encode UCS4 Unicode ordinals */
2223            *p++ = (char)(0xf0 | (ch >> 18));
2224            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2225            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2226            *p++ = (char)(0x80 | (ch & 0x3f));
2227        }
2228    }
2229
2230    if (result == NULL) {
2231        /* This was stack allocated. */
2232        nneeded = p - stackbuf;
2233        assert(nneeded <= nallocated);
2234        result = PyBytes_FromStringAndSize(stackbuf, nneeded);
2235    }
2236    else {
2237        /* Cut back to size actually needed. */
2238        nneeded = p - PyBytes_AS_STRING(result);
2239        assert(nneeded <= nallocated);
2240        _PyBytes_Resize(&result, nneeded);
2241    }
2242    return result;
2243
2244#undef MAX_SHORT_UNICHARS
2245}
2246
2247PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2248{
2249    if (!PyUnicode_Check(unicode)) {
2250        PyErr_BadArgument();
2251        return NULL;
2252    }
2253    return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2254				PyUnicode_GET_SIZE(unicode),
2255				NULL);
2256}
2257
2258/* --- UTF-32 Codec ------------------------------------------------------- */
2259
2260PyObject *
2261PyUnicode_DecodeUTF32(const char *s,
2262		      Py_ssize_t size,
2263		      const char *errors,
2264		      int *byteorder)
2265{
2266    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2267}
2268
2269PyObject *
2270PyUnicode_DecodeUTF32Stateful(const char *s,
2271			      Py_ssize_t size,
2272			      const char *errors,
2273			      int *byteorder,
2274			      Py_ssize_t *consumed)
2275{
2276    const char *starts = s;
2277    Py_ssize_t startinpos;
2278    Py_ssize_t endinpos;
2279    Py_ssize_t outpos;
2280    PyUnicodeObject *unicode;
2281    Py_UNICODE *p;
2282#ifndef Py_UNICODE_WIDE
2283    int i, pairs;
2284#else
2285    const int pairs = 0;
2286#endif
2287    const unsigned char *q, *e;
2288    int bo = 0;       /* assume native ordering by default */
2289    const char *errmsg = "";
2290    /* Offsets from q for retrieving bytes in the right order. */
2291#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2292    int iorder[] = {0, 1, 2, 3};
2293#else
2294    int iorder[] = {3, 2, 1, 0};
2295#endif
2296    PyObject *errorHandler = NULL;
2297    PyObject *exc = NULL;
2298    /* On narrow builds we split characters outside the BMP into two
2299       codepoints => count how much extra space we need. */
2300#ifndef Py_UNICODE_WIDE
2301    for (i = pairs = 0; i < size/4; i++)
2302	if (((Py_UCS4 *)s)[i] >= 0x10000)
2303	    pairs++;
2304#endif
2305
2306    /* This might be one to much, because of a BOM */
2307    unicode = _PyUnicode_New((size+3)/4+pairs);
2308    if (!unicode)
2309        return NULL;
2310    if (size == 0)
2311        return (PyObject *)unicode;
2312
2313    /* Unpack UTF-32 encoded data */
2314    p = unicode->str;
2315    q = (unsigned char *)s;
2316    e = q + size;
2317
2318    if (byteorder)
2319        bo = *byteorder;
2320
2321    /* Check for BOM marks (U+FEFF) in the input and adjust current
2322       byte order setting accordingly. In native mode, the leading BOM
2323       mark is skipped, in all other modes, it is copied to the output
2324       stream as-is (giving a ZWNBSP character). */
2325    if (bo == 0) {
2326        if (size >= 4) {
2327            const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2328                                (q[iorder[1]] << 8) | q[iorder[0]];
2329#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2330	    if (bom == 0x0000FEFF) {
2331		q += 4;
2332		bo = -1;
2333	    }
2334	    else if (bom == 0xFFFE0000) {
2335		q += 4;
2336		bo = 1;
2337	    }
2338#else
2339	    if (bom == 0x0000FEFF) {
2340		q += 4;
2341		bo = 1;
2342	    }
2343	    else if (bom == 0xFFFE0000) {
2344		q += 4;
2345		bo = -1;
2346	    }
2347#endif
2348	}
2349    }
2350
2351    if (bo == -1) {
2352        /* force LE */
2353        iorder[0] = 0;
2354        iorder[1] = 1;
2355        iorder[2] = 2;
2356        iorder[3] = 3;
2357    }
2358    else if (bo == 1) {
2359        /* force BE */
2360        iorder[0] = 3;
2361        iorder[1] = 2;
2362        iorder[2] = 1;
2363        iorder[3] = 0;
2364    }
2365
2366    while (q < e) {
2367	Py_UCS4 ch;
2368	/* remaining bytes at the end? (size should be divisible by 4) */
2369	if (e-q<4) {
2370	    if (consumed)
2371		break;
2372	    errmsg = "truncated data";
2373	    startinpos = ((const char *)q)-starts;
2374	    endinpos = ((const char *)e)-starts;
2375	    goto utf32Error;
2376	    /* The remaining input chars are ignored if the callback
2377	       chooses to skip the input */
2378	}
2379	ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2380	     (q[iorder[1]] << 8) | q[iorder[0]];
2381
2382	if (ch >= 0x110000)
2383	{
2384	    errmsg = "codepoint not in range(0x110000)";
2385	    startinpos = ((const char *)q)-starts;
2386	    endinpos = startinpos+4;
2387	    goto utf32Error;
2388	}
2389#ifndef Py_UNICODE_WIDE
2390	if (ch >= 0x10000)
2391	{
2392	    *p++ = 0xD800 | ((ch-0x10000) >> 10);
2393	    *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2394	}
2395	else
2396#endif
2397	    *p++ = ch;
2398	q += 4;
2399	continue;
2400    utf32Error:
2401	outpos = p-PyUnicode_AS_UNICODE(unicode);
2402	if (unicode_decode_call_errorhandler(
2403	         errors, &errorHandler,
2404	         "utf32", errmsg,
2405	         &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2406	         (PyObject **)&unicode, &outpos, &p))
2407	    goto onError;
2408    }
2409
2410    if (byteorder)
2411        *byteorder = bo;
2412
2413    if (consumed)
2414	*consumed = (const char *)q-starts;
2415
2416    /* Adjust length */
2417    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2418        goto onError;
2419
2420    Py_XDECREF(errorHandler);
2421    Py_XDECREF(exc);
2422    return (PyObject *)unicode;
2423
2424onError:
2425    Py_DECREF(unicode);
2426    Py_XDECREF(errorHandler);
2427    Py_XDECREF(exc);
2428    return NULL;
2429}
2430
2431PyObject *
2432PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2433		      Py_ssize_t size,
2434		      const char *errors,
2435		      int byteorder)
2436{
2437    PyObject *v, *result;
2438    unsigned char *p;
2439#ifndef Py_UNICODE_WIDE
2440    int i, pairs;
2441#else
2442    const int pairs = 0;
2443#endif
2444    /* Offsets from p for storing byte pairs in the right order. */
2445#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2446    int iorder[] = {0, 1, 2, 3};
2447#else
2448    int iorder[] = {3, 2, 1, 0};
2449#endif
2450
2451#define STORECHAR(CH)                       \
2452    do {                                    \
2453        p[iorder[3]] = ((CH) >> 24) & 0xff; \
2454        p[iorder[2]] = ((CH) >> 16) & 0xff; \
2455        p[iorder[1]] = ((CH) >> 8) & 0xff;  \
2456        p[iorder[0]] = (CH) & 0xff;         \
2457        p += 4;                             \
2458    } while(0)
2459
2460    /* In narrow builds we can output surrogate pairs as one codepoint,
2461       so we need less space. */
2462#ifndef Py_UNICODE_WIDE
2463    for (i = pairs = 0; i < size-1; i++)
2464	if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2465	    0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2466	    pairs++;
2467#endif
2468    v = PyByteArray_FromStringAndSize(NULL,
2469		  4 * (size - pairs + (byteorder == 0)));
2470    if (v == NULL)
2471        return NULL;
2472
2473    p = (unsigned char *)PyByteArray_AS_STRING(v);
2474    if (byteorder == 0)
2475	STORECHAR(0xFEFF);
2476    if (size == 0)
2477        goto done;
2478
2479    if (byteorder == -1) {
2480        /* force LE */
2481        iorder[0] = 0;
2482        iorder[1] = 1;
2483        iorder[2] = 2;
2484        iorder[3] = 3;
2485    }
2486    else if (byteorder == 1) {
2487        /* force BE */
2488        iorder[0] = 3;
2489        iorder[1] = 2;
2490        iorder[2] = 1;
2491        iorder[3] = 0;
2492    }
2493
2494    while (size-- > 0) {
2495	Py_UCS4 ch = *s++;
2496#ifndef Py_UNICODE_WIDE
2497	if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2498	    Py_UCS4 ch2 = *s;
2499	    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2500		ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2501		s++;
2502		size--;
2503	    }
2504	}
2505#endif
2506        STORECHAR(ch);
2507    }
2508
2509  done:
2510    result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2511    Py_DECREF(v);
2512    return result;
2513#undef STORECHAR
2514}
2515
2516PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2517{
2518    if (!PyUnicode_Check(unicode)) {
2519        PyErr_BadArgument();
2520        return NULL;
2521    }
2522    return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2523				 PyUnicode_GET_SIZE(unicode),
2524				 NULL,
2525				 0);
2526}
2527
2528/* --- UTF-16 Codec ------------------------------------------------------- */
2529
2530PyObject *
2531PyUnicode_DecodeUTF16(const char *s,
2532		      Py_ssize_t size,
2533		      const char *errors,
2534		      int *byteorder)
2535{
2536    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2537}
2538
2539PyObject *
2540PyUnicode_DecodeUTF16Stateful(const char *s,
2541			      Py_ssize_t size,
2542			      const char *errors,
2543			      int *byteorder,
2544			      Py_ssize_t *consumed)
2545{
2546    const char *starts = s;
2547    Py_ssize_t startinpos;
2548    Py_ssize_t endinpos;
2549    Py_ssize_t outpos;
2550    PyUnicodeObject *unicode;
2551    Py_UNICODE *p;
2552    const unsigned char *q, *e;
2553    int bo = 0;       /* assume native ordering by default */
2554    const char *errmsg = "";
2555    /* Offsets from q for retrieving byte pairs in the right order. */
2556#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2557    int ihi = 1, ilo = 0;
2558#else
2559    int ihi = 0, ilo = 1;
2560#endif
2561    PyObject *errorHandler = NULL;
2562    PyObject *exc = NULL;
2563
2564    /* Note: size will always be longer than the resulting Unicode
2565       character count */
2566    unicode = _PyUnicode_New(size);
2567    if (!unicode)
2568        return NULL;
2569    if (size == 0)
2570        return (PyObject *)unicode;
2571
2572    /* Unpack UTF-16 encoded data */
2573    p = unicode->str;
2574    q = (unsigned char *)s;
2575    e = q + size;
2576
2577    if (byteorder)
2578        bo = *byteorder;
2579
2580    /* Check for BOM marks (U+FEFF) in the input and adjust current
2581       byte order setting accordingly. In native mode, the leading BOM
2582       mark is skipped, in all other modes, it is copied to the output
2583       stream as-is (giving a ZWNBSP character). */
2584    if (bo == 0) {
2585        if (size >= 2) {
2586            const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
2587#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2588	    if (bom == 0xFEFF) {
2589		q += 2;
2590		bo = -1;
2591	    }
2592	    else if (bom == 0xFFFE) {
2593		q += 2;
2594		bo = 1;
2595	    }
2596#else
2597	    if (bom == 0xFEFF) {
2598		q += 2;
2599		bo = 1;
2600	    }
2601	    else if (bom == 0xFFFE) {
2602		q += 2;
2603		bo = -1;
2604	    }
2605#endif
2606	}
2607    }
2608
2609    if (bo == -1) {
2610        /* force LE */
2611        ihi = 1;
2612        ilo = 0;
2613    }
2614    else if (bo == 1) {
2615        /* force BE */
2616        ihi = 0;
2617        ilo = 1;
2618    }
2619
2620    while (q < e) {
2621	Py_UNICODE ch;
2622	/* remaining bytes at the end? (size should be even) */
2623	if (e-q<2) {
2624	    if (consumed)
2625		break;
2626	    errmsg = "truncated data";
2627	    startinpos = ((const char *)q)-starts;
2628	    endinpos = ((const char *)e)-starts;
2629	    goto utf16Error;
2630	    /* The remaining input chars are ignored if the callback
2631	       chooses to skip the input */
2632	}
2633	ch = (q[ihi] << 8) | q[ilo];
2634
2635	q += 2;
2636
2637	if (ch < 0xD800 || ch > 0xDFFF) {
2638	    *p++ = ch;
2639	    continue;
2640	}
2641
2642	/* UTF-16 code pair: */
2643	if (q >= e) {
2644	    errmsg = "unexpected end of data";
2645	    startinpos = (((const char *)q)-2)-starts;
2646	    endinpos = ((const char *)e)-starts;
2647	    goto utf16Error;
2648	}
2649	if (0xD800 <= ch && ch <= 0xDBFF) {
2650	    Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2651	    q += 2;
2652	    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2653#ifndef Py_UNICODE_WIDE
2654		*p++ = ch;
2655		*p++ = ch2;
2656#else
2657		*p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2658#endif
2659		continue;
2660	    }
2661	    else {
2662                errmsg = "illegal UTF-16 surrogate";
2663		startinpos = (((const char *)q)-4)-starts;
2664		endinpos = startinpos+2;
2665		goto utf16Error;
2666	    }
2667
2668	}
2669	errmsg = "illegal encoding";
2670	startinpos = (((const char *)q)-2)-starts;
2671	endinpos = startinpos+2;
2672	/* Fall through to report the error */
2673
2674    utf16Error:
2675	outpos = p-PyUnicode_AS_UNICODE(unicode);
2676	if (unicode_decode_call_errorhandler(
2677	         errors, &errorHandler,
2678	         "utf16", errmsg,
2679	         &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2680	         (PyObject **)&unicode, &outpos, &p))
2681	    goto onError;
2682    }
2683
2684    if (byteorder)
2685        *byteorder = bo;
2686
2687    if (consumed)
2688	*consumed = (const char *)q-starts;
2689
2690    /* Adjust length */
2691    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2692        goto onError;
2693
2694    Py_XDECREF(errorHandler);
2695    Py_XDECREF(exc);
2696    return (PyObject *)unicode;
2697
2698onError:
2699    Py_DECREF(unicode);
2700    Py_XDECREF(errorHandler);
2701    Py_XDECREF(exc);
2702    return NULL;
2703}
2704
2705PyObject *
2706PyUnicode_EncodeUTF16(const Py_UNICODE *s,
2707		      Py_ssize_t size,
2708		      const char *errors,
2709		      int byteorder)
2710{
2711    PyObject *v, *result;
2712    unsigned char *p;
2713#ifdef Py_UNICODE_WIDE
2714    int i, pairs;
2715#else
2716    const int pairs = 0;
2717#endif
2718    /* Offsets from p for storing byte pairs in the right order. */
2719#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2720    int ihi = 1, ilo = 0;
2721#else
2722    int ihi = 0, ilo = 1;
2723#endif
2724
2725#define STORECHAR(CH)                   \
2726    do {                                \
2727        p[ihi] = ((CH) >> 8) & 0xff;    \
2728        p[ilo] = (CH) & 0xff;           \
2729        p += 2;                         \
2730    } while(0)
2731
2732#ifdef Py_UNICODE_WIDE
2733    for (i = pairs = 0; i < size; i++)
2734	if (s[i] >= 0x10000)
2735	    pairs++;
2736#endif
2737    v = PyByteArray_FromStringAndSize(NULL,
2738		  2 * (size + pairs + (byteorder == 0)));
2739    if (v == NULL)
2740        return NULL;
2741
2742    p = (unsigned char *)PyByteArray_AS_STRING(v);
2743    if (byteorder == 0)
2744	STORECHAR(0xFEFF);
2745    if (size == 0)
2746        goto done;
2747
2748    if (byteorder == -1) {
2749        /* force LE */
2750        ihi = 1;
2751        ilo = 0;
2752    }
2753    else if (byteorder == 1) {
2754        /* force BE */
2755        ihi = 0;
2756        ilo = 1;
2757    }
2758
2759    while (size-- > 0) {
2760	Py_UNICODE ch = *s++;
2761	Py_UNICODE ch2 = 0;
2762#ifdef Py_UNICODE_WIDE
2763	if (ch >= 0x10000) {
2764	    ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2765	    ch  = 0xD800 | ((ch-0x10000) >> 10);
2766	}
2767#endif
2768        STORECHAR(ch);
2769        if (ch2)
2770            STORECHAR(ch2);
2771    }
2772
2773  done:
2774    result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2775    Py_DECREF(v);
2776    return result;
2777#undef STORECHAR
2778}
2779
2780PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2781{
2782    if (!PyUnicode_Check(unicode)) {
2783        PyErr_BadArgument();
2784        return NULL;
2785    }
2786    return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2787				 PyUnicode_GET_SIZE(unicode),
2788				 NULL,
2789				 0);
2790}
2791
2792/* --- Unicode Escape Codec ----------------------------------------------- */
2793
/* Cached pointer to the character-name lookup API exported by the
   unicodedata module as "ucnhash_CAPI".  It starts out NULL and is filled
   in by PyUnicode_DecodeUnicodeEscape() the first time a \N escape is
   decoded. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
2795
2796PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
2797					Py_ssize_t size,
2798					const char *errors)
2799{
2800    const char *starts = s;
2801    Py_ssize_t startinpos;
2802    Py_ssize_t endinpos;
2803    Py_ssize_t outpos;
2804    int i;
2805    PyUnicodeObject *v;
2806    Py_UNICODE *p;
2807    const char *end;
2808    char* message;
2809    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
2810    PyObject *errorHandler = NULL;
2811    PyObject *exc = NULL;
2812
2813    /* Escaped strings will always be longer than the resulting
2814       Unicode string, so we start with size here and then reduce the
2815       length after conversion to the true value.
2816       (but if the error callback returns a long replacement string
2817       we'll have to allocate more space) */
2818    v = _PyUnicode_New(size);
2819    if (v == NULL)
2820        goto onError;
2821    if (size == 0)
2822        return (PyObject *)v;
2823
2824    p = PyUnicode_AS_UNICODE(v);
2825    end = s + size;
2826
2827    while (s < end) {
2828        unsigned char c;
2829        Py_UNICODE x;
2830        int digits;
2831
2832        /* Non-escape characters are interpreted as Unicode ordinals */
2833        if (*s != '\\') {
2834            *p++ = (unsigned char) *s++;
2835            continue;
2836        }
2837
2838        startinpos = s-starts;
2839        /* \ - Escapes */
2840        s++;
2841        c = *s++;
2842        if (s > end)
2843            c = '\0'; /* Invalid after \ */
2844        switch (c) {
2845
2846        /* \x escapes */
2847        case '\n': break;
2848        case '\\': *p++ = '\\'; break;
2849        case '\'': *p++ = '\''; break;
2850        case '\"': *p++ = '\"'; break;
2851        case 'b': *p++ = '\b'; break;
2852        case 'f': *p++ = '\014'; break; /* FF */
2853        case 't': *p++ = '\t'; break;
2854        case 'n': *p++ = '\n'; break;
2855        case 'r': *p++ = '\r'; break;
2856        case 'v': *p++ = '\013'; break; /* VT */
2857        case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2858
2859        /* \OOO (octal) escapes */
2860        case '0': case '1': case '2': case '3':
2861        case '4': case '5': case '6': case '7':
2862            x = s[-1] - '0';
2863            if (s < end && '0' <= *s && *s <= '7') {
2864                x = (x<<3) + *s++ - '0';
2865                if (s < end && '0' <= *s && *s <= '7')
2866                    x = (x<<3) + *s++ - '0';
2867            }
2868            *p++ = x;
2869            break;
2870
2871        /* hex escapes */
2872        /* \xXX */
2873        case 'x':
2874            digits = 2;
2875            message = "truncated \\xXX escape";
2876            goto hexescape;
2877
2878        /* \uXXXX */
2879        case 'u':
2880            digits = 4;
2881            message = "truncated \\uXXXX escape";
2882            goto hexescape;
2883
2884        /* \UXXXXXXXX */
2885        case 'U':
2886            digits = 8;
2887            message = "truncated \\UXXXXXXXX escape";
2888        hexescape:
2889            chr = 0;
2890            outpos = p-PyUnicode_AS_UNICODE(v);
2891            if (s+digits>end) {
2892                endinpos = size;
2893                if (unicode_decode_call_errorhandler(
2894                    errors, &errorHandler,
2895                    "unicodeescape", "end of string in escape sequence",
2896                    &starts, &end, &startinpos, &endinpos, &exc, &s,
2897                    (PyObject **)&v, &outpos, &p))
2898                    goto onError;
2899                goto nextByte;
2900            }
2901            for (i = 0; i < digits; ++i) {
2902                c = (unsigned char) s[i];
2903                if (!ISXDIGIT(c)) {
2904                    endinpos = (s+i+1)-starts;
2905                    if (unicode_decode_call_errorhandler(
2906                        errors, &errorHandler,
2907                        "unicodeescape", message,
2908                        &starts, &end, &startinpos, &endinpos, &exc, &s,
2909                        (PyObject **)&v, &outpos, &p))
2910                        goto onError;
2911                    goto nextByte;
2912                }
2913                chr = (chr<<4) & ~0xF;
2914                if (c >= '0' && c <= '9')
2915                    chr += c - '0';
2916                else if (c >= 'a' && c <= 'f')
2917                    chr += 10 + c - 'a';
2918                else
2919                    chr += 10 + c - 'A';
2920            }
2921            s += i;
2922            if (chr == 0xffffffff && PyErr_Occurred())
2923                /* _decoding_error will have already written into the
2924                   target buffer. */
2925                break;
2926        store:
2927            /* when we get here, chr is a 32-bit unicode character */
2928            if (chr <= 0xffff)
2929                /* UCS-2 character */
2930                *p++ = (Py_UNICODE) chr;
2931            else if (chr <= 0x10ffff) {
2932                /* UCS-4 character. Either store directly, or as
2933                   surrogate pair. */
2934#ifdef Py_UNICODE_WIDE
2935                *p++ = chr;
2936#else
2937                chr -= 0x10000L;
2938                *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
2939                *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
2940#endif
2941            } else {
2942                endinpos = s-starts;
2943                outpos = p-PyUnicode_AS_UNICODE(v);
2944                if (unicode_decode_call_errorhandler(
2945                    errors, &errorHandler,
2946                    "unicodeescape", "illegal Unicode character",
2947                    &starts, &end, &startinpos, &endinpos, &exc, &s,
2948                    (PyObject **)&v, &outpos, &p))
2949                    goto onError;
2950            }
2951            break;
2952
2953        /* \N{name} */
2954        case 'N':
2955            message = "malformed \\N character escape";
2956            if (ucnhash_CAPI == NULL) {
2957                /* load the unicode data module */
2958                PyObject *m, *api;
2959                m = PyImport_ImportModuleNoBlock("unicodedata");
2960                if (m == NULL)
2961                    goto ucnhashError;
2962                api = PyObject_GetAttrString(m, "ucnhash_CAPI");
2963                Py_DECREF(m);
2964                if (api == NULL)
2965                    goto ucnhashError;
2966                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
2967                Py_DECREF(api);
2968                if (ucnhash_CAPI == NULL)
2969                    goto ucnhashError;
2970            }
2971            if (*s == '{') {
2972                const char *start = s+1;
2973                /* look for the closing brace */
2974                while (*s != '}' && s < end)
2975                    s++;
2976                if (s > start && s < end && *s == '}') {
2977                    /* found a name.  look it up in the unicode database */
2978                    message = "unknown Unicode character name";
2979                    s++;
2980                    if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
2981                        goto store;
2982                }
2983            }
2984            endinpos = s-starts;
2985            outpos = p-PyUnicode_AS_UNICODE(v);
2986            if (unicode_decode_call_errorhandler(
2987                errors, &errorHandler,
2988                "unicodeescape", message,
2989                &starts, &end, &startinpos, &endinpos, &exc, &s,
2990                (PyObject **)&v, &outpos, &p))
2991                goto onError;
2992            break;
2993
2994        default:
2995            if (s > end) {
2996                message = "\\ at end of string";
2997                s--;
2998                endinpos = s-starts;
2999                outpos = p-PyUnicode_AS_UNICODE(v);
3000                if (unicode_decode_call_errorhandler(
3001                    errors, &errorHandler,
3002                    "unicodeescape", message,
3003                    &starts, &end, &startinpos, &endinpos, &exc, &s,
3004                    (PyObject **)&v, &outpos, &p))
3005                    goto onError;
3006            }
3007            else {
3008                *p++ = '\\';
3009                *p++ = (unsigned char)s[-1];
3010            }
3011            break;
3012        }
3013        nextByte:
3014        ;
3015    }
3016    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3017        goto onError;
3018    Py_XDECREF(errorHandler);
3019    Py_XDECREF(exc);
3020    return (PyObject *)v;
3021
3022ucnhashError:
3023    PyErr_SetString(
3024        PyExc_UnicodeError,
3025        "\\N escapes not supported (can't load unicodedata module)"
3026        );
3027    Py_XDECREF(v);
3028    Py_XDECREF(errorHandler);
3029    Py_XDECREF(exc);
3030    return NULL;
3031
3032onError:
3033    Py_XDECREF(v);
3034    Py_XDECREF(errorHandler);
3035    Py_XDECREF(exc);
3036    return NULL;
3037}
3038
3039/* Return a Unicode-Escape string version of the Unicode object.
3040
3041   If quotes is true, the string is enclosed in u"" or u'' quotes as
3042   appropriate.
3043
3044*/
3045
3046Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
3047                                      Py_ssize_t size,
3048                                      Py_UNICODE ch)
3049{
3050    /* like wcschr, but doesn't stop at NULL characters */
3051
3052    while (size-- > 0) {
3053        if (*s == ch)
3054            return s;
3055        s++;
3056    }
3057
3058    return NULL;
3059}
3060
/* Lower-case hexadecimal digits; indexed with a 4-bit value (0..15) to
   obtain the corresponding digit character. */
static const char *hexdigits = "0123456789abcdef";
3062
3063PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3064					Py_ssize_t size)
3065{
3066    PyObject *repr, *result;
3067    char *p;
3068
3069    /* XXX(nnorwitz): rather than over-allocating, it would be
3070       better to choose a different scheme.  Perhaps scan the
3071       first N-chars of the string and allocate based on that size.
3072    */
3073    /* Initial allocation is based on the longest-possible unichr
3074       escape.
3075
3076       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3077       unichr, so in this case it's the longest unichr escape. In
3078       narrow (UTF-16) builds this is five chars per source unichr
3079       since there are two unichrs in the surrogate pair, so in narrow
3080       (UTF-16) builds it's not the longest unichr escape.
3081
3082       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3083       so in the narrow (UTF-16) build case it's the longest unichr
3084       escape.
3085    */
3086
3087    repr = PyByteArray_FromStringAndSize(NULL,
3088#ifdef Py_UNICODE_WIDE
3089        + 10*size
3090#else
3091        + 6*size
3092#endif
3093        + 1);
3094    if (repr == NULL)
3095        return NULL;
3096
3097    p = PyByteArray_AS_STRING(repr);
3098
3099    while (size-- > 0) {
3100        Py_UNICODE ch = *s++;
3101
3102        /* Escape backslashes */
3103        if (ch == '\\') {
3104            *p++ = '\\';
3105            *p++ = (char) ch;
3106            continue;
3107        }
3108
3109#ifdef Py_UNICODE_WIDE
3110        /* Map 21-bit characters to '\U00xxxxxx' */
3111        else if (ch >= 0x10000) {
3112            *p++ = '\\';
3113            *p++ = 'U';
3114            *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3115            *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3116            *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3117            *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3118            *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3119            *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3120            *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3121            *p++ = hexdigits[ch & 0x0000000F];
3122	    continue;
3123        }
3124#else
3125	/* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3126	else if (ch >= 0xD800 && ch < 0xDC00) {
3127	    Py_UNICODE ch2;
3128	    Py_UCS4 ucs;
3129
3130	    ch2 = *s++;
3131	    size--;
3132	    if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3133		ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3134		*p++ = '\\';
3135		*p++ = 'U';
3136		*p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3137		*p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3138		*p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3139		*p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3140		*p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3141		*p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3142		*p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3143		*p++ = hexdigits[ucs & 0x0000000F];
3144		continue;
3145	    }
3146	    /* Fall through: isolated surrogates are copied as-is */
3147	    s--;
3148	    size++;
3149	}
3150#endif
3151
3152        /* Map 16-bit characters to '\uxxxx' */
3153        if (ch >= 256) {
3154            *p++ = '\\';
3155            *p++ = 'u';
3156            *p++ = hexdigits[(ch >> 12) & 0x000F];
3157            *p++ = hexdigits[(ch >> 8) & 0x000F];
3158            *p++ = hexdigits[(ch >> 4) & 0x000F];
3159            *p++ = hexdigits[ch & 0x000F];
3160        }
3161
3162        /* Map special whitespace to '\t', \n', '\r' */
3163        else if (ch == '\t') {
3164            *p++ = '\\';
3165            *p++ = 't';
3166        }
3167        else if (ch == '\n') {
3168            *p++ = '\\';
3169            *p++ = 'n';
3170        }
3171        else if (ch == '\r') {
3172            *p++ = '\\';
3173            *p++ = 'r';
3174        }
3175
3176        /* Map non-printable US ASCII to '\xhh' */
3177        else if (ch < ' ' || ch >= 0x7F) {
3178            *p++ = '\\';
3179            *p++ = 'x';
3180            *p++ = hexdigits[(ch >> 4) & 0x000F];
3181            *p++ = hexdigits[ch & 0x000F];
3182        }
3183
3184        /* Copy everything else as-is */
3185        else
3186            *p++ = (char) ch;
3187    }
3188
3189    result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr),
3190                                        p - PyByteArray_AS_STRING(repr));
3191    Py_DECREF(repr);
3192    return result;
3193}
3194
3195PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3196{
3197    PyObject *s, *result;
3198    if (!PyUnicode_Check(unicode)) {
3199        PyErr_BadArgument();
3200        return NULL;
3201    }
3202    s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3203                                      PyUnicode_GET_SIZE(unicode));
3204
3205    if (!s)
3206        return NULL;
3207    result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(s),
3208                                        PyByteArray_GET_SIZE(s));
3209    Py_DECREF(s);
3210    return result;
3211}
3212
3213/* --- Raw Unicode Escape Codec ------------------------------------------- */
3214
3215PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
3216					   Py_ssize_t size,
3217					   const char *errors)
3218{
3219    const char *starts = s;
3220    Py_ssize_t startinpos;
3221    Py_ssize_t endinpos;
3222    Py_ssize_t outpos;
3223    PyUnicodeObject *v;
3224    Py_UNICODE *p;
3225    const char *end;
3226    const char *bs;
3227    PyObject *errorHandler = NULL;
3228    PyObject *exc = NULL;
3229
3230    /* Escaped strings will always be longer than the resulting
3231       Unicode string, so we start with size here and then reduce the
3232       length after conversion to the true value. (But decoding error
3233       handler might have to resize the string) */
3234    v = _PyUnicode_New(size);
3235    if (v == NULL)
3236	goto onError;
3237    if (size == 0)
3238	return (PyObject *)v;
3239    p = PyUnicode_AS_UNICODE(v);
3240    end = s + size;
3241    while (s < end) {
3242	unsigned char c;
3243	Py_UCS4 x;
3244	int i;
3245        int count;
3246
3247	/* Non-escape characters are interpreted as Unicode ordinals */
3248	if (*s != '\\') {
3249	    *p++ = (unsigned char)*s++;
3250	    continue;
3251	}
3252	startinpos = s-starts;
3253
3254	/* \u-escapes are only interpreted iff the number of leading
3255	   backslashes if odd */
3256	bs = s;
3257	for (;s < end;) {
3258	    if (*s != '\\')
3259		break;
3260	    *p++ = (unsigned char)*s++;
3261	}
3262	if (((s - bs) & 1) == 0 ||
3263	    s >= end ||
3264	    (*s != 'u' && *s != 'U')) {
3265	    continue;
3266	}
3267	p--;
3268        count = *s=='u' ? 4 : 8;
3269	s++;
3270
3271	/* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3272	outpos = p-PyUnicode_AS_UNICODE(v);
3273	for (x = 0, i = 0; i < count; ++i, ++s) {
3274	    c = (unsigned char)*s;
3275	    if (!ISXDIGIT(c)) {
3276		endinpos = s-starts;
3277		if (unicode_decode_call_errorhandler(
3278		    errors, &errorHandler,
3279		    "rawunicodeescape", "truncated \\uXXXX",
3280		    &starts, &end, &startinpos, &endinpos, &exc, &s,
3281		    (PyObject **)&v, &outpos, &p))
3282		    goto onError;
3283		goto nextByte;
3284	    }
3285	    x = (x<<4) & ~0xF;
3286	    if (c >= '0' && c <= '9')
3287		x += c - '0';
3288	    else if (c >= 'a' && c <= 'f')
3289		x += 10 + c - 'a';
3290	    else
3291		x += 10 + c - 'A';
3292	}
3293        if (x <= 0xffff)
3294                /* UCS-2 character */
3295                *p++ = (Py_UNICODE) x;
3296        else if (x <= 0x10ffff) {
3297                /* UCS-4 character. Either store directly, or as
3298                   surrogate pair. */
3299#ifdef Py_UNICODE_WIDE
3300                *p++ = (Py_UNICODE) x;
3301#else
3302                x -= 0x10000L;
3303                *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3304                *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3305#endif
3306        } else {
3307            endinpos = s-starts;
3308            outpos = p-PyUnicode_AS_UNICODE(v);
3309            if (unicode_decode_call_errorhandler(
3310                    errors, &errorHandler,
3311                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
3312		    &starts, &end, &startinpos, &endinpos, &exc, &s,
3313		    (PyObject **)&v, &outpos, &p))
3314		    goto onError;
3315        }
3316	nextByte:
3317	;
3318    }
3319    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3320	goto onError;
3321    Py_XDECREF(errorHandler);
3322    Py_XDECREF(exc);
3323    return (PyObject *)v;
3324
3325 onError:
3326    Py_XDECREF(v);
3327    Py_XDECREF(errorHandler);
3328    Py_XDECREF(exc);
3329    return NULL;
3330}
3331
3332PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
3333					   Py_ssize_t size)
3334{
3335    PyObject *repr, *result;
3336    char *p;
3337    char *q;
3338
3339#ifdef Py_UNICODE_WIDE
3340    repr = PyByteArray_FromStringAndSize(NULL, 10 * size);
3341#else
3342    repr = PyByteArray_FromStringAndSize(NULL, 6 * size);
3343#endif
3344    if (repr == NULL)
3345        return NULL;
3346    if (size == 0)
3347        goto done;
3348
3349    p = q = PyByteArray_AS_STRING(repr);
3350    while (size-- > 0) {
3351        Py_UNICODE ch = *s++;
3352#ifdef Py_UNICODE_WIDE
3353	/* Map 32-bit characters to '\Uxxxxxxxx' */
3354	if (ch >= 0x10000) {
3355            *p++ = '\\';
3356            *p++ = 'U';
3357            *p++ = hexdigits[(ch >> 28) & 0xf];
3358            *p++ = hexdigits[(ch >> 24) & 0xf];
3359            *p++ = hexdigits[(ch >> 20) & 0xf];
3360            *p++ = hexdigits[(ch >> 16) & 0xf];
3361            *p++ = hexdigits[(ch >> 12) & 0xf];
3362            *p++ = hexdigits[(ch >> 8) & 0xf];
3363            *p++ = hexdigits[(ch >> 4) & 0xf];
3364            *p++ = hexdigits[ch & 15];
3365        }
3366        else
3367#else
3368	/* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3369	if (ch >= 0xD800 && ch < 0xDC00) {
3370	    Py_UNICODE ch2;
3371	    Py_UCS4 ucs;
3372
3373	    ch2 = *s++;
3374	    size--;
3375	    if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3376		ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3377		*p++ = '\\';
3378		*p++ = 'U';
3379		*p++ = hexdigits[(ucs >> 28) & 0xf];
3380		*p++ = hexdigits[(ucs >> 24) & 0xf];
3381		*p++ = hexdigits[(ucs >> 20) & 0xf];
3382		*p++ = hexdigits[(ucs >> 16) & 0xf];
3383		*p++ = hexdigits[(ucs >> 12) & 0xf];
3384		*p++ = hexdigits[(ucs >> 8) & 0xf];
3385		*p++ = hexdigits[(ucs >> 4) & 0xf];
3386		*p++ = hexdigits[ucs & 0xf];
3387		continue;
3388	    }
3389	    /* Fall through: isolated surrogates are copied as-is */
3390	    s--;
3391	    size++;
3392	}
3393#endif
3394	/* Map 16-bit characters to '\uxxxx' */
3395	if (ch >= 256) {
3396            *p++ = '\\';
3397            *p++ = 'u';
3398            *p++ = hexdigits[(ch >> 12) & 0xf];
3399            *p++ = hexdigits[(ch >> 8) & 0xf];
3400            *p++ = hexdigits[(ch >> 4) & 0xf];
3401            *p++ = hexdigits[ch & 15];
3402        }
3403	/* Copy everything else as-is */
3404	else
3405            *p++ = (char) ch;
3406    }
3407    size = p - q;
3408
3409  done:
3410    result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr), size);
3411    Py_DECREF(repr);
3412    return result;
3413}
3414
3415PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3416{
3417    PyObject *s, *result;
3418    if (!PyUnicode_Check(unicode)) {
3419        PyErr_BadArgument();
3420        return NULL;
3421    }
3422    s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3423                                         PyUnicode_GET_SIZE(unicode));
3424
3425    if (!s)
3426        return NULL;
3427    result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(s),
3428                                        PyByteArray_GET_SIZE(s));
3429    Py_DECREF(s);
3430    return result;
3431}
3432
3433/* --- Unicode Internal Codec ------------------------------------------- */
3434
3435PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
3436					   Py_ssize_t size,
3437					   const char *errors)
3438{
3439    const char *starts = s;
3440    Py_ssize_t startinpos;
3441    Py_ssize_t endinpos;
3442    Py_ssize_t outpos;
3443    PyUnicodeObject *v;
3444    Py_UNICODE *p;
3445    const char *end;
3446    const char *reason;
3447    PyObject *errorHandler = NULL;
3448    PyObject *exc = NULL;
3449
3450#ifdef Py_UNICODE_WIDE
3451    Py_UNICODE unimax = PyUnicode_GetMax();
3452#endif
3453
3454    /* XXX overflow detection missing */
3455    v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3456    if (v == NULL)
3457	goto onError;
3458    if (PyUnicode_GetSize((PyObject *)v) == 0)
3459	return (PyObject *)v;
3460    p = PyUnicode_AS_UNICODE(v);
3461    end = s + size;
3462
3463    while (s < end) {
3464        memcpy(p, s, sizeof(Py_UNICODE));
3465        /* We have to sanity check the raw data, otherwise doom looms for
3466           some malformed UCS-4 data. */
3467        if (
3468            #ifdef Py_UNICODE_WIDE
3469            *p > unimax || *p < 0 ||
3470            #endif
3471            end-s < Py_UNICODE_SIZE
3472            )
3473            {
3474            startinpos = s - starts;
3475            if (end-s < Py_UNICODE_SIZE) {
3476                endinpos = end-starts;
3477                reason = "truncated input";
3478            }
3479            else {
3480                endinpos = s - starts + Py_UNICODE_SIZE;
3481                reason = "illegal code point (> 0x10FFFF)";
3482            }
3483            outpos = p - PyUnicode_AS_UNICODE(v);
3484            if (unicode_decode_call_errorhandler(
3485                    errors, &errorHandler,
3486                    "unicode_internal", reason,
3487                    &starts, &end, &startinpos, &endinpos, &exc, &s,
3488                    (PyObject **)&v, &outpos, &p)) {
3489                goto onError;
3490            }
3491        }
3492        else {
3493            p++;
3494            s += Py_UNICODE_SIZE;
3495        }
3496    }
3497
3498    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3499        goto onError;
3500    Py_XDECREF(errorHandler);
3501    Py_XDECREF(exc);
3502    return (PyObject *)v;
3503
3504 onError:
3505    Py_XDECREF(v);
3506    Py_XDECREF(errorHandler);
3507    Py_XDECREF(exc);
3508    return NULL;
3509}
3510
3511/* --- Latin-1 Codec ------------------------------------------------------ */
3512
3513PyObject *PyUnicode_DecodeLatin1(const char *s,
3514				 Py_ssize_t size,
3515				 const char *errors)
3516{
3517    PyUnicodeObject *v;
3518    Py_UNICODE *p;
3519
3520    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3521    if (size == 1) {
3522	Py_UNICODE r = *(unsigned char*)s;
3523	return PyUnicode_FromUnicode(&r, 1);
3524    }
3525
3526    v = _PyUnicode_New(size);
3527    if (v == NULL)
3528	goto onError;
3529    if (size == 0)
3530	return (PyObject *)v;
3531    p = PyUnicode_AS_UNICODE(v);
3532    while (size-- > 0)
3533	*p++ = (unsigned char)*s++;
3534    return (PyObject *)v;
3535
3536 onError:
3537    Py_XDECREF(v);
3538    return NULL;
3539}
3540
3541/* create or adjust a UnicodeEncodeError */
3542static void make_encode_exception(PyObject **exceptionObject,
3543    const char *encoding,
3544    const Py_UNICODE *unicode, Py_ssize_t size,
3545    Py_ssize_t startpos, Py_ssize_t endpos,
3546    const char *reason)
3547{
3548    if (*exceptionObject == NULL) {
3549	*exceptionObject = PyUnicodeEncodeError_Create(
3550	    encoding, unicode, size, startpos, endpos, reason);
3551    }
3552    else {
3553	if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3554	    goto onError;
3555	if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3556	    goto onError;
3557	if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3558	    goto onError;
3559	return;
3560	onError:
3561	Py_DECREF(*exceptionObject);
3562	*exceptionObject = NULL;
3563    }
3564}
3565
3566/* raises a UnicodeEncodeError */
3567static void raise_encode_exception(PyObject **exceptionObject,
3568    const char *encoding,
3569    const Py_UNICODE *unicode, Py_ssize_t size,
3570    Py_ssize_t startpos, Py_ssize_t endpos,
3571    const char *reason)
3572{
3573    make_encode_exception(exceptionObject,
3574	encoding, unicode, size, startpos, endpos, reason);
3575    if (*exceptionObject != NULL)
3576	PyCodec_StrictErrors(*exceptionObject);
3577}
3578
3579/* error handling callback helper:
3580   build arguments, call the callback and check the arguments,
3581   put the result into newpos and return the replacement string, which
3582   has to be freed by the caller */
3583static PyObject *unicode_encode_call_errorhandler(const char *errors,
3584    PyObject **errorHandler,
3585    const char *encoding, const char *reason,
3586    const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3587    Py_ssize_t startpos, Py_ssize_t endpos,
3588    Py_ssize_t *newpos)
3589{
3590    static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
3591
3592    PyObject *restuple;
3593    PyObject *resunicode;
3594
3595    if (*errorHandler == NULL) {
3596	*errorHandler = PyCodec_LookupError(errors);
3597        if (*errorHandler == NULL)
3598	    return NULL;
3599    }
3600
3601    make_encode_exception(exceptionObject,
3602	encoding, unicode, size, startpos, endpos, reason);
3603    if (*exceptionObject == NULL)
3604	return NULL;
3605
3606    restuple = PyObject_CallFunctionObjArgs(
3607	*errorHandler, *exceptionObject, NULL);
3608    if (restuple == NULL)
3609	return NULL;
3610    if (!PyTuple_Check(restuple)) {
3611	PyErr_Format(PyExc_TypeError, &argparse[4]);
3612	Py_DECREF(restuple);
3613	return NULL;
3614    }
3615    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3616	&resunicode, newpos)) {
3617	Py_DECREF(restuple);
3618	return NULL;
3619    }
3620    if (*newpos<0)
3621	*newpos = size+*newpos;
3622    if (*newpos<0 || *newpos>size) {
3623	PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3624	Py_DECREF(restuple);
3625	return NULL;
3626    }
3627    Py_INCREF(resunicode);
3628    Py_DECREF(restuple);
3629    return resunicode;
3630}
3631
3632static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
3633				 Py_ssize_t size,
3634				 const char *errors,
3635				 int limit)
3636{
3637    /* output object */
3638    PyObject *res;
3639    /* pointers to the beginning and end+1 of input */
3640    const Py_UNICODE *startp = p;
3641    const Py_UNICODE *endp = p + size;
3642    /* pointer to the beginning of the unencodable characters */
3643    /* const Py_UNICODE *badp = NULL; */
3644    /* pointer into the output */
3645    char *str;
3646    /* current output position */
3647    Py_ssize_t ressize;
3648    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3649    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3650    PyObject *errorHandler = NULL;
3651    PyObject *exc = NULL;
3652    PyObject *result = NULL;
3653    /* the following variable is used for caching string comparisons
3654     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3655    int known_errorHandler = -1;
3656
3657    /* allocate enough for a simple encoding without
3658       replacements, if we need more, we'll resize */
3659    if (size == 0)
3660        return PyBytes_FromStringAndSize(NULL, 0);
3661    res = PyByteArray_FromStringAndSize(NULL, size);
3662    if (res == NULL)
3663        return NULL;
3664    str = PyByteArray_AS_STRING(res);
3665    ressize = size;
3666
3667    while (p<endp) {
3668	Py_UNICODE c = *p;
3669
3670	/* can we encode this? */
3671	if (c<limit) {
3672	    /* no overflow check, because we know that the space is enough */
3673	    *str++ = (char)c;
3674	    ++p;
3675	}
3676	else {
3677	    Py_ssize_t unicodepos = p-startp;
3678	    Py_ssize_t requiredsize;
3679	    PyObject *repunicode;
3680	    Py_ssize_t repsize;
3681	    Py_ssize_t newpos;
3682	    Py_ssize_t respos;
3683	    Py_UNICODE *uni2;
3684	    /* startpos for collecting unencodable chars */
3685	    const Py_UNICODE *collstart = p;
3686	    const Py_UNICODE *collend = p;
3687	    /* find all unecodable characters */
3688	    while ((collend < endp) && ((*collend)>=limit))
3689		++collend;
3690	    /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3691	    if (known_errorHandler==-1) {
3692		if ((errors==NULL) || (!strcmp(errors, "strict")))
3693		    known_errorHandler = 1;
3694		else if (!strcmp(errors, "replace"))
3695		    known_errorHandler = 2;
3696		else if (!strcmp(errors, "ignore"))
3697		    known_errorHandler = 3;
3698		else if (!strcmp(errors, "xmlcharrefreplace"))
3699		    known_errorHandler = 4;
3700		else
3701		    known_errorHandler = 0;
3702	    }
3703	    switch (known_errorHandler) {
3704		case 1: /* strict */
3705		    raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3706		    goto onError;
3707		case 2: /* replace */
3708		    while (collstart++<collend)
3709			*str++ = '?'; /* fall through */
3710		case 3: /* ignore */
3711		    p = collend;
3712		    break;
3713		case 4: /* xmlcharrefreplace */
3714		    respos = str - PyByteArray_AS_STRING(res);
3715		    /* determine replacement size (temporarily (mis)uses p) */
3716		    for (p = collstart, repsize = 0; p < collend; ++p) {
3717			if (*p<10)
3718			    repsize += 2+1+1;
3719			else if (*p<100)
3720			    repsize += 2+2+1;
3721			else if (*p<1000)
3722			    repsize += 2+3+1;
3723			else if (*p<10000)
3724			    repsize += 2+4+1;
3725#ifndef Py_UNICODE_WIDE
3726			else
3727			    repsize += 2+5+1;
3728#else
3729			else if (*p<100000)
3730			    repsize += 2+5+1;
3731			else if (*p<1000000)
3732			    repsize += 2+6+1;
3733			else
3734			    repsize += 2+7+1;
3735#endif
3736		    }
3737		    requiredsize = respos+repsize+(endp-collend);
3738		    if (requiredsize > ressize) {
3739			if (requiredsize<2*ressize)
3740			    requiredsize = 2*ressize;
3741			if (PyByteArray_Resize(res, requiredsize))
3742			    goto onError;
3743			str = PyByteArray_AS_STRING(res) + respos;
3744			ressize = requiredsize;
3745		    }
3746		    /* generate replacement (temporarily (mis)uses p) */
3747		    for (p = collstart; p < collend; ++p) {
3748			str += sprintf(str, "&#%d;", (int)*p);
3749		    }
3750		    p = collend;
3751		    break;
3752		default:
3753		    repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3754			encoding, reason, startp, size, &exc,
3755			collstart-startp, collend-startp, &newpos);
3756		    if (repunicode == NULL)
3757			goto onError;
3758		    /* need more space? (at least enough for what we
3759		       have+the replacement+the rest of the string, so
3760		       we won't have to check space for encodable characters) */
3761		    respos = str - PyByteArray_AS_STRING(res);
3762		    repsize = PyUnicode_GET_SIZE(repunicode);
3763		    requiredsize = respos+repsize+(endp-collend);
3764		    if (requiredsize > ressize) {
3765			if (requiredsize<2*ressize)
3766			    requiredsize = 2*ressize;
3767			if (PyByteArray_Resize(res, requiredsize)) {
3768			    Py_DECREF(repunicode);
3769			    goto onError;
3770			}
3771			str = PyByteArray_AS_STRING(res) + respos;
3772			ressize = requiredsize;
3773		    }
3774		    /* check if there is anything unencodable in the replacement
3775		       and copy it to the output */
3776		    for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3777			c = *uni2;
3778			if (c >= limit) {
3779			    raise_encode_exception(&exc, encoding, startp, size,
3780				unicodepos, unicodepos+1, reason);
3781			    Py_DECREF(repunicode);
3782			    goto onError;
3783			}
3784			*str = (char)c;
3785		    }
3786		    p = startp + newpos;
3787		    Py_DECREF(repunicode);
3788	    }
3789	}
3790    }
3791    result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(res),
3792                                        str - PyByteArray_AS_STRING(res));
3793  onError:
3794    Py_DECREF(res);
3795    Py_XDECREF(errorHandler);
3796    Py_XDECREF(exc);
3797    return result;
3798}
3799
3800PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
3801				 Py_ssize_t size,
3802				 const char *errors)
3803{
3804    return unicode_encode_ucs1(p, size, errors, 256);
3805}
3806
3807PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3808{
3809    if (!PyUnicode_Check(unicode)) {
3810	PyErr_BadArgument();
3811	return NULL;
3812    }
3813    return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3814				  PyUnicode_GET_SIZE(unicode),
3815				  NULL);
3816}
3817
3818/* --- 7-bit ASCII Codec -------------------------------------------------- */
3819
3820PyObject *PyUnicode_DecodeASCII(const char *s,
3821				Py_ssize_t size,
3822				const char *errors)
3823{
3824    const char *starts = s;
3825    PyUnicodeObject *v;
3826    Py_UNICODE *p;
3827    Py_ssize_t startinpos;
3828    Py_ssize_t endinpos;
3829    Py_ssize_t outpos;
3830    const char *e;
3831    PyObject *errorHandler = NULL;
3832    PyObject *exc = NULL;
3833
3834    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3835    if (size == 1 && *(unsigned char*)s < 128) {
3836	Py_UNICODE r = *(unsigned char*)s;
3837	return PyUnicode_FromUnicode(&r, 1);
3838    }
3839
3840    v = _PyUnicode_New(size);
3841    if (v == NULL)
3842	goto onError;
3843    if (size == 0)
3844	return (PyObject *)v;
3845    p = PyUnicode_AS_UNICODE(v);
3846    e = s + size;
3847    while (s < e) {
3848	register unsigned char c = (unsigned char)*s;
3849	if (c < 128) {
3850	    *p++ = c;
3851	    ++s;
3852	}
3853	else {
3854	    startinpos = s-starts;
3855	    endinpos = startinpos + 1;
3856	    outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3857	    if (unicode_decode_call_errorhandler(
3858		 errors, &errorHandler,
3859		 "ascii", "ordinal not in range(128)",
3860		 &starts, &e, &startinpos, &endinpos, &exc, &s,
3861		 (PyObject **)&v, &outpos, &p))
3862		goto onError;
3863	}
3864    }
3865    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
3866	if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3867	    goto onError;
3868    Py_XDECREF(errorHandler);
3869    Py_XDECREF(exc);
3870    return (PyObject *)v;
3871
3872 onError:
3873    Py_XDECREF(v);
3874    Py_XDECREF(errorHandler);
3875    Py_XDECREF(exc);
3876    return NULL;
3877}
3878
3879PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
3880				Py_ssize_t size,
3881				const char *errors)
3882{
3883    return unicode_encode_ucs1(p, size, errors, 128);
3884}
3885
3886PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3887{
3888    if (!PyUnicode_Check(unicode)) {
3889	PyErr_BadArgument();
3890	return NULL;
3891    }
3892    return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3893				 PyUnicode_GET_SIZE(unicode),
3894				 NULL);
3895}
3896
3897#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3898
3899/* --- MBCS codecs for Windows -------------------------------------------- */
3900
3901#if SIZEOF_INT < SIZEOF_SSIZE_T
3902#define NEED_RETRY
3903#endif
3904
3905/* XXX This code is limited to "true" double-byte encodings, as
3906   a) it assumes an incomplete character consists of a single byte, and
3907   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3908      encodings, see IsDBCSLeadByteEx documentation. */
3909
/* Return 1 if the byte at s[offset] is treated as a DBCS lead byte, else 0.

   The byte must satisfy IsDBCSLeadByte().  It is then accepted when
   CharPrev() reports no preceding character, when the preceding character
   does not start with a lead byte, or when the preceding character is two
   bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3920
3921/*
3922 * Decode MBCS string into unicode object. If 'final' is set, converts
3923 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3924 */
3925static int decode_mbcs(PyUnicodeObject **v,
3926			const char *s, /* MBCS string */
3927			int size, /* sizeof MBCS string */
3928			int final)
3929{
3930    Py_UNICODE *p;
3931    Py_ssize_t n = 0;
3932    int usize = 0;
3933
3934    assert(size >= 0);
3935
3936    /* Skip trailing lead-byte unless 'final' is set */
3937    if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3938	--size;
3939
3940    /* First get the size of the result */
3941    if (size > 0) {
3942	usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3943	if (usize == 0) {
3944	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
3945	    return -1;
3946	}
3947    }
3948
3949    if (*v == NULL) {
3950	/* Create unicode object */
3951	*v = _PyUnicode_New(usize);
3952	if (*v == NULL)
3953	    return -1;
3954    }
3955    else {
3956	/* Extend unicode object */
3957	n = PyUnicode_GET_SIZE(*v);
3958	if (_PyUnicode_Resize(v, n + usize) < 0)
3959	    return -1;
3960    }
3961
3962    /* Do the conversion */
3963    if (size > 0) {
3964	p = PyUnicode_AS_UNICODE(*v) + n;
3965	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3966	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
3967	    return -1;
3968	}
3969    }
3970
3971    return size;
3972}
3973
3974PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3975					Py_ssize_t size,
3976					const char *errors,
3977					Py_ssize_t *consumed)
3978{
3979    PyUnicodeObject *v = NULL;
3980    int done;
3981
3982    if (consumed)
3983	*consumed = 0;
3984
3985#ifdef NEED_RETRY
3986  retry:
3987    if (size > INT_MAX)
3988	done = decode_mbcs(&v, s, INT_MAX, 0);
3989    else
3990#endif
3991	done = decode_mbcs(&v, s, (int)size, !consumed);
3992
3993    if (done < 0) {
3994        Py_XDECREF(v);
3995	return NULL;
3996    }
3997
3998    if (consumed)
3999	*consumed += done;
4000
4001#ifdef NEED_RETRY
4002    if (size > INT_MAX) {
4003	s += done;
4004	size -= done;
4005	goto retry;
4006    }
4007#endif
4008
4009    return (PyObject *)v;
4010}
4011
4012PyObject *PyUnicode_DecodeMBCS(const char *s,
4013				Py_ssize_t size,
4014				const char *errors)
4015{
4016    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4017}
4018
4019/*
4020 * Convert unicode into string object (MBCS).
4021 * Returns 0 if succeed, -1 otherwise.
4022 */
4023static int encode_mbcs(PyObject **repr,
4024			const Py_UNICODE *p, /* unicode */
4025			int size) /* size of unicode */
4026{
4027    int mbcssize = 0;
4028    Py_ssize_t n = 0;
4029
4030    assert(size >= 0);
4031
4032    /* First get the size of the result */
4033    if (size > 0) {
4034	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4035	if (mbcssize == 0) {
4036	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
4037	    return -1;
4038	}
4039    }
4040
4041    if (*repr == NULL) {
4042	/* Create string object */
4043	*repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4044	if (*repr == NULL)
4045	    return -1;
4046    }
4047    else {
4048	/* Extend string object */
4049	n = PyBytes_Size(*repr);
4050	if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4051	    return -1;
4052    }
4053
4054    /* Do the conversion */
4055    if (size > 0) {
4056	char *s = PyBytes_AS_STRING(*repr) + n;
4057	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4058	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
4059	    return -1;
4060	}
4061    }
4062
4063    return 0;
4064}
4065
4066PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
4067				Py_ssize_t size,
4068				const char *errors)
4069{
4070    PyObject *repr = NULL;
4071    int ret;
4072
4073#ifdef NEED_RETRY
4074 retry:
4075    if (size > INT_MAX)
4076	ret = encode_mbcs(&repr, p, INT_MAX);
4077    else
4078#endif
4079	ret = encode_mbcs(&repr, p, (int)size);
4080
4081    if (ret < 0) {
4082	Py_XDECREF(repr);
4083	return NULL;
4084    }
4085
4086#ifdef NEED_RETRY
4087    if (size > INT_MAX) {
4088	p += INT_MAX;
4089	size -= INT_MAX;
4090	goto retry;
4091    }
4092#endif
4093
4094    return repr;
4095}
4096
4097PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4098{
4099    if (!PyUnicode_Check(unicode)) {
4100        PyErr_BadArgument();
4101        return NULL;
4102    }
4103    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
4104				PyUnicode_GET_SIZE(unicode),
4105				NULL);
4106}
4107
4108#undef NEED_RETRY
4109
4110#endif /* MS_WINDOWS */
4111
4112/* --- Character Mapping Codec -------------------------------------------- */
4113
4114PyObject *PyUnicode_DecodeCharmap(const char *s,
4115				  Py_ssize_t size,
4116				  PyObject *mapping,
4117				  const char *errors)
4118{
4119    const char *starts = s;
4120    Py_ssize_t startinpos;
4121    Py_ssize_t endinpos;
4122    Py_ssize_t outpos;
4123    const char *e;
4124    PyUnicodeObject *v;
4125    Py_UNICODE *p;
4126    Py_ssize_t extrachars = 0;
4127    PyObject *errorHandler = NULL;
4128    PyObject *exc = NULL;
4129    Py_UNICODE *mapstring = NULL;
4130    Py_ssize_t maplen = 0;
4131
4132    /* Default to Latin-1 */
4133    if (mapping == NULL)
4134	return PyUnicode_DecodeLatin1(s, size, errors);
4135
4136    v = _PyUnicode_New(size);
4137    if (v == NULL)
4138	goto onError;
4139    if (size == 0)
4140	return (PyObject *)v;
4141    p = PyUnicode_AS_UNICODE(v);
4142    e = s + size;
4143    if (PyUnicode_CheckExact(mapping)) {
4144	mapstring = PyUnicode_AS_UNICODE(mapping);
4145	maplen = PyUnicode_GET_SIZE(mapping);
4146	while (s < e) {
4147	    unsigned char ch = *s;
4148	    Py_UNICODE x = 0xfffe; /* illegal value */
4149
4150	    if (ch < maplen)
4151		x = mapstring[ch];
4152
4153	    if (x == 0xfffe) {
4154		/* undefined mapping */
4155		outpos = p-PyUnicode_AS_UNICODE(v);
4156		startinpos = s-starts;
4157		endinpos = startinpos+1;
4158		if (unicode_decode_call_errorhandler(
4159		     errors, &errorHandler,
4160		     "charmap", "character maps to <undefined>",
4161		     &starts, &e, &startinpos, &endinpos, &exc, &s,
4162		     (PyObject **)&v, &outpos, &p)) {
4163		    goto onError;
4164		}
4165		continue;
4166	    }
4167	    *p++ = x;
4168	    ++s;
4169	}
4170    }
4171    else {
4172	while (s < e) {
4173	    unsigned char ch = *s;
4174	    PyObject *w, *x;
4175
4176	    /* Get mapping (char ordinal -> integer, Unicode char or None) */
4177	    w = PyLong_FromLong((long)ch);
4178	    if (w == NULL)
4179		goto onError;
4180	    x = PyObject_GetItem(mapping, w);
4181	    Py_DECREF(w);
4182	    if (x == NULL) {
4183		if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4184		    /* No mapping found means: mapping is undefined. */
4185		    PyErr_Clear();
4186		    x = Py_None;
4187		    Py_INCREF(x);
4188		} else
4189		    goto onError;
4190	    }
4191
4192	    /* Apply mapping */
4193	    if (PyLong_Check(x)) {
4194		long value = PyLong_AS_LONG(x);
4195		if (value < 0 || value > 65535) {
4196		    PyErr_SetString(PyExc_TypeError,
4197				    "character mapping must be in range(65536)");
4198		    Py_DECREF(x);
4199		    goto onError;
4200		}
4201		*p++ = (Py_UNICODE)value;
4202	    }
4203	    else if (x == Py_None) {
4204		/* undefined mapping */
4205		outpos = p-PyUnicode_AS_UNICODE(v);
4206		startinpos = s-starts;
4207		endinpos = startinpos+1;
4208		if (unicode_decode_call_errorhandler(
4209		     errors, &errorHandler,
4210		     "charmap", "character maps to <undefined>",
4211		     &starts, &e, &startinpos, &endinpos, &exc, &s,
4212		     (PyObject **)&v, &outpos, &p)) {
4213		    Py_DECREF(x);
4214		    goto onError;
4215		}
4216		Py_DECREF(x);
4217		continue;
4218	    }
4219	    else if (PyUnicode_Check(x)) {
4220		Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
4221
4222		if (targetsize == 1)
4223		    /* 1-1 mapping */
4224		    *p++ = *PyUnicode_AS_UNICODE(x);
4225
4226		else if (targetsize > 1) {
4227		    /* 1-n mapping */
4228		    if (targetsize > extrachars) {
4229			/* resize first */
4230			Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4231			Py_ssize_t needed = (targetsize - extrachars) + \
4232				     (targetsize << 2);
4233			extrachars += needed;
4234			/* XXX overflow detection missing */
4235			if (_PyUnicode_Resize(&v,
4236					     PyUnicode_GET_SIZE(v) + needed) < 0) {
4237			    Py_DECREF(x);
4238			    goto onError;
4239			}
4240			p = PyUnicode_AS_UNICODE(v) + oldpos;
4241		    }
4242		    Py_UNICODE_COPY(p,
4243				    PyUnicode_AS_UNICODE(x),
4244				    targetsize);
4245		    p += targetsize;
4246		    extrachars -= targetsize;
4247		}
4248		/* 1-0 mapping: skip the character */
4249	    }
4250	    else {
4251		/* wrong return value */
4252		PyErr_SetString(PyExc_TypeError,
4253		      "character mapping must return integer, None or unicode");
4254		Py_DECREF(x);
4255		goto onError;
4256	    }
4257	    Py_DECREF(x);
4258	    ++s;
4259	}
4260    }
4261    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
4262	if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4263	    goto onError;
4264    Py_XDECREF(errorHandler);
4265    Py_XDECREF(exc);
4266    return (PyObject *)v;
4267
4268 onError:
4269    Py_XDECREF(errorHandler);
4270    Py_XDECREF(exc);
4271    Py_XDECREF(v);
4272    return NULL;
4273}
4274
4275/* Charmap encoding: the lookup table */
4276
4277struct encoding_map{
4278  PyObject_HEAD
4279  unsigned char level1[32];
4280  int count2, count3;
4281  unsigned char level23[1];
4282};
4283
4284static PyObject*
4285encoding_map_size(PyObject *obj, PyObject* args)
4286{
4287    struct encoding_map *map = (struct encoding_map*)obj;
4288    return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4289                          128*map->count3);
4290}
4291
4292static PyMethodDef encoding_map_methods[] = {
4293	{"size", encoding_map_size, METH_NOARGS,
4294         PyDoc_STR("Return the size (in bytes) of this object") },
4295        { 0 }
4296};
4297
4298static void
4299encoding_map_dealloc(PyObject* o)
4300{
4301	PyObject_FREE(o);
4302}
4303
4304static PyTypeObject EncodingMapType = {
4305	PyVarObject_HEAD_INIT(NULL, 0)
4306        "EncodingMap",          /*tp_name*/
4307        sizeof(struct encoding_map),   /*tp_basicsize*/
4308        0,                      /*tp_itemsize*/
4309        /* methods */
4310        encoding_map_dealloc,   /*tp_dealloc*/
4311        0,                      /*tp_print*/
4312        0,                      /*tp_getattr*/
4313        0,                      /*tp_setattr*/
4314        0,                      /*tp_compare*/
4315        0,                      /*tp_repr*/
4316        0,                      /*tp_as_number*/
4317        0,                      /*tp_as_sequence*/
4318        0,                      /*tp_as_mapping*/
4319        0,                      /*tp_hash*/
4320        0,                      /*tp_call*/
4321        0,                      /*tp_str*/
4322        0,                      /*tp_getattro*/
4323        0,                      /*tp_setattro*/
4324        0,                      /*tp_as_buffer*/
4325        Py_TPFLAGS_DEFAULT,     /*tp_flags*/
4326        0,                      /*tp_doc*/
4327        0,                      /*tp_traverse*/
4328        0,                      /*tp_clear*/
4329        0,                      /*tp_richcompare*/
4330        0,                      /*tp_weaklistoffset*/
4331        0,                      /*tp_iter*/
4332        0,                      /*tp_iternext*/
4333        encoding_map_methods,   /*tp_methods*/
4334        0,                      /*tp_members*/
4335        0,                      /*tp_getset*/
4336        0,                      /*tp_base*/
4337        0,                      /*tp_dict*/
4338        0,                      /*tp_descr_get*/
4339        0,                      /*tp_descr_set*/
4340        0,                      /*tp_dictoffset*/
4341        0,                      /*tp_init*/
4342        0,                      /*tp_alloc*/
4343        0,                      /*tp_new*/
4344        0,                      /*tp_free*/
4345        0,                      /*tp_is_gc*/
4346};
4347
4348PyObject*
4349PyUnicode_BuildEncodingMap(PyObject* string)
4350{
4351    Py_UNICODE *decode;
4352    PyObject *result;
4353    struct encoding_map *mresult;
4354    int i;
4355    int need_dict = 0;
4356    unsigned char level1[32];
4357    unsigned char level2[512];
4358    unsigned char *mlevel1, *mlevel2, *mlevel3;
4359    int count2 = 0, count3 = 0;
4360
4361    if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4362        PyErr_BadArgument();
4363        return NULL;
4364    }
4365    decode = PyUnicode_AS_UNICODE(string);
4366    memset(level1, 0xFF, sizeof level1);
4367    memset(level2, 0xFF, sizeof level2);
4368
4369    /* If there isn't a one-to-one mapping of NULL to \0,
4370       or if there are non-BMP characters, we need to use
4371       a mapping dictionary. */
4372    if (decode[0] != 0)
4373        need_dict = 1;
4374    for (i = 1; i < 256; i++) {
4375        int l1, l2;
4376        if (decode[i] == 0
4377            #ifdef Py_UNICODE_WIDE
4378            || decode[i] > 0xFFFF
4379            #endif
4380        ) {
4381            need_dict = 1;
4382            break;
4383        }
4384        if (decode[i] == 0xFFFE)
4385            /* unmapped character */
4386            continue;
4387        l1 = decode[i] >> 11;
4388        l2 = decode[i] >> 7;
4389        if (level1[l1] == 0xFF)
4390            level1[l1] = count2++;
4391        if (level2[l2] == 0xFF)
4392            level2[l2] = count3++;
4393    }
4394
4395    if (count2 >= 0xFF || count3 >= 0xFF)
4396        need_dict = 1;
4397
4398    if (need_dict) {
4399        PyObject *result = PyDict_New();
4400        PyObject *key, *value;
4401        if (!result)
4402            return NULL;
4403        for (i = 0; i < 256; i++) {
4404            key = value = NULL;
4405            key = PyLong_FromLong(decode[i]);
4406            value = PyLong_FromLong(i);
4407            if (!key || !value)
4408                goto failed1;
4409            if (PyDict_SetItem(result, key, value) == -1)
4410                goto failed1;
4411            Py_DECREF(key);
4412            Py_DECREF(value);
4413        }
4414        return result;
4415      failed1:
4416        Py_XDECREF(key);
4417        Py_XDECREF(value);
4418        Py_DECREF(result);
4419        return NULL;
4420    }
4421
4422    /* Create a three-level trie */
4423    result = PyObject_MALLOC(sizeof(struct encoding_map) +
4424                             16*count2 + 128*count3 - 1);
4425    if (!result)
4426        return PyErr_NoMemory();
4427    PyObject_Init(result, &EncodingMapType);
4428    mresult = (struct encoding_map*)result;
4429    mresult->count2 = count2;
4430    mresult->count3 = count3;
4431    mlevel1 = mresult->level1;
4432    mlevel2 = mresult->level23;
4433    mlevel3 = mresult->level23 + 16*count2;
4434    memcpy(mlevel1, level1, 32);
4435    memset(mlevel2, 0xFF, 16*count2);
4436    memset(mlevel3, 0, 128*count3);
4437    count3 = 0;
4438    for (i = 1; i < 256; i++) {
4439        int o1, o2, o3, i2, i3;
4440        if (decode[i] == 0xFFFE)
4441            /* unmapped character */
4442            continue;
4443        o1 = decode[i]>>11;
4444        o2 = (decode[i]>>7) & 0xF;
4445        i2 = 16*mlevel1[o1] + o2;
4446        if (mlevel2[i2] == 0xFF)
4447            mlevel2[i2] = count3++;
4448        o3 = decode[i] & 0x7F;
4449        i3 = 128*mlevel2[i2] + o3;
4450        mlevel3[i3] = i;
4451    }
4452    return result;
4453}
4454
4455static int
4456encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4457{
4458    struct encoding_map *map = (struct encoding_map*)mapping;
4459    int l1 = c>>11;
4460    int l2 = (c>>7) & 0xF;
4461    int l3 = c & 0x7F;
4462    int i;
4463
4464#ifdef Py_UNICODE_WIDE
4465    if (c > 0xFFFF) {
4466	return -1;
4467    }
4468#endif
4469    if (c == 0)
4470        return 0;
4471    /* level 1*/
4472    i = map->level1[l1];
4473    if (i == 0xFF) {
4474        return -1;
4475    }
4476    /* level 2*/
4477    i = map->level23[16*i+l2];
4478    if (i == 0xFF) {
4479        return -1;
4480    }
4481    /* level 3 */
4482    i = map->level23[16*map->count2 + 128*i + l3];
4483    if (i == 0) {
4484        return -1;
4485    }
4486    return i;
4487}
4488
4489/* Lookup the character ch in the mapping. If the character
4490   can't be found, Py_None is returned (or NULL, if another
4491   error occurred). */
4492static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
4493{
4494    PyObject *w = PyLong_FromLong((long)c);
4495    PyObject *x;
4496
4497    if (w == NULL)
4498	 return NULL;
4499    x = PyObject_GetItem(mapping, w);
4500    Py_DECREF(w);
4501    if (x == NULL) {
4502	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4503	    /* No mapping found means: mapping is undefined. */
4504	    PyErr_Clear();
4505	    x = Py_None;
4506	    Py_INCREF(x);
4507	    return x;
4508	} else
4509	    return NULL;
4510    }
4511    else if (x == Py_None)
4512	return x;
4513    else if (PyLong_Check(x)) {
4514	long value = PyLong_AS_LONG(x);
4515	if (value < 0 || value > 255) {
4516	    PyErr_SetString(PyExc_TypeError,
4517			     "character mapping must be in range(256)");
4518	    Py_DECREF(x);
4519	    return NULL;
4520	}
4521	return x;
4522    }
4523    else if (PyBytes_Check(x))
4524	return x;
4525    else {
4526	/* wrong return value */
4527	PyErr_Format(PyExc_TypeError,
4528                "character mapping must return integer, bytes or None, not %.400s",
4529                x->ob_type->tp_name);
4530	Py_DECREF(x);
4531	return NULL;
4532    }
4533}
4534
4535static int
4536charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4537{
4538	Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
4539	/* exponentially overallocate to minimize reallocations */
4540	if (requiredsize < 2*outsize)
4541	    requiredsize = 2*outsize;
4542	if (_PyBytes_Resize(outobj, requiredsize))
4543	    return -1;
4544	return 0;
4545}
4546
/* Outcome of trying to encode one character through a character map. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred; an exception is set */
} charmapencode_result;
4550/* lookup the character, put the result in the output string and adjust
4551   various state variables. Resize the output bytes object if not enough
4552   space is available. Return a new reference to the object that
4553   was put in the output buffer, or Py_None, if the mapping was undefined
4554   (in which case no character was written) or NULL, if a
4555   reallocation error occurred. The caller must decref the result */
4556static
4557charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
4558    PyObject **outobj, Py_ssize_t *outpos)
4559{
4560    PyObject *rep;
4561    char *outstart;
4562    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
4563
4564    if (Py_TYPE(mapping) == &EncodingMapType) {
4565        int res = encoding_map_lookup(c, mapping);
4566	Py_ssize_t requiredsize = *outpos+1;
4567        if (res == -1)
4568            return enc_FAILED;
4569	if (outsize<requiredsize)
4570	    if (charmapencode_resize(outobj, outpos, requiredsize))
4571		return enc_EXCEPTION;
4572        outstart = PyBytes_AS_STRING(*outobj);
4573	outstart[(*outpos)++] = (char)res;
4574	return enc_SUCCESS;
4575    }
4576
4577    rep = charmapencode_lookup(c, mapping);
4578    if (rep==NULL)
4579	return enc_EXCEPTION;
4580    else if (rep==Py_None) {
4581	Py_DECREF(rep);
4582	return enc_FAILED;
4583    } else {
4584	if (PyLong_Check(rep)) {
4585	    Py_ssize_t requiredsize = *outpos+1;
4586	    if (outsize<requiredsize)
4587		if (charmapencode_resize(outobj, outpos, requiredsize)) {
4588		    Py_DECREF(rep);
4589		    return enc_EXCEPTION;
4590		}
4591            outstart = PyBytes_AS_STRING(*outobj);
4592	    outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
4593	}
4594	else {
4595	    const char *repchars = PyBytes_AS_STRING(rep);
4596	    Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
4597	    Py_ssize_t requiredsize = *outpos+repsize;
4598	    if (outsize<requiredsize)
4599		if (charmapencode_resize(outobj, outpos, requiredsize)) {
4600		    Py_DECREF(rep);
4601		    return enc_EXCEPTION;
4602		}
4603            outstart = PyBytes_AS_STRING(*outobj);
4604	    memcpy(outstart + *outpos, repchars, repsize);
4605	    *outpos += repsize;
4606	}
4607    }
4608    Py_DECREF(rep);
4609    return enc_SUCCESS;
4610}
4611
4612/* handle an error in PyUnicode_EncodeCharmap
4613   Return 0 on success, -1 on error */
4614static
4615int charmap_encoding_error(
4616    const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
4617    PyObject **exceptionObject,
4618    int *known_errorHandler, PyObject **errorHandler, const char *errors,
4619    PyObject **res, Py_ssize_t *respos)
4620{
4621    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4622    Py_ssize_t repsize;
4623    Py_ssize_t newpos;
4624    Py_UNICODE *uni2;
4625    /* startpos for collecting unencodable chars */
4626    Py_ssize_t collstartpos = *inpos;
4627    Py_ssize_t collendpos = *inpos+1;
4628    Py_ssize_t collpos;
4629    char *encoding = "charmap";
4630    char *reason = "character maps to <undefined>";
4631    charmapencode_result x;
4632
4633    /* find all unencodable characters */
4634    while (collendpos < size) {
4635        PyObject *rep;
4636        if (Py_TYPE(mapping) == &EncodingMapType) {
4637	    int res = encoding_map_lookup(p[collendpos], mapping);
4638	    if (res != -1)
4639		break;
4640	    ++collendpos;
4641	    continue;
4642	}
4643
4644	rep = charmapencode_lookup(p[collendpos], mapping);
4645	if (rep==NULL)
4646	    return -1;
4647	else if (rep!=Py_None) {
4648	    Py_DECREF(rep);
4649	    break;
4650	}
4651	Py_DECREF(rep);
4652	++collendpos;
4653    }
4654    /* cache callback name lookup
4655     * (if not done yet, i.e. it's the first error) */
4656    if (*known_errorHandler==-1) {
4657	if ((errors==NULL) || (!strcmp(errors, "strict")))
4658	    *known_errorHandler = 1;
4659	else if (!strcmp(errors, "replace"))
4660	    *known_errorHandler = 2;
4661	else if (!strcmp(errors, "ignore"))
4662	    *known_errorHandler = 3;
4663	else if (!strcmp(errors, "xmlcharrefreplace"))
4664	    *known_errorHandler = 4;
4665	else
4666	    *known_errorHandler = 0;
4667    }
4668    switch (*known_errorHandler) {
4669	case 1: /* strict */
4670	    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4671	    return -1;
4672	case 2: /* replace */
4673	    for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4674		x = charmapencode_output('?', mapping, res, respos);
4675		if (x==enc_EXCEPTION) {
4676		    return -1;
4677		}
4678		else if (x==enc_FAILED) {
4679		    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4680		    return -1;
4681		}
4682	    }
4683	    /* fall through */
4684	case 3: /* ignore */
4685	    *inpos = collendpos;
4686	    break;
4687	case 4: /* xmlcharrefreplace */
4688	    /* generate replacement (temporarily (mis)uses p) */
4689	    for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4690		char buffer[2+29+1+1];
4691		char *cp;
4692		sprintf(buffer, "&#%d;", (int)p[collpos]);
4693		for (cp = buffer; *cp; ++cp) {
4694		    x = charmapencode_output(*cp, mapping, res, respos);
4695		    if (x==enc_EXCEPTION)
4696			return -1;
4697		    else if (x==enc_FAILED) {
4698			raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4699			return -1;
4700		    }
4701		}
4702	    }
4703	    *inpos = collendpos;
4704	    break;
4705	default:
4706	    repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
4707		encoding, reason, p, size, exceptionObject,
4708		collstartpos, collendpos, &newpos);
4709	    if (repunicode == NULL)
4710		return -1;
4711	    /* generate replacement  */
4712	    repsize = PyUnicode_GET_SIZE(repunicode);
4713	    for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4714		x = charmapencode_output(*uni2, mapping, res, respos);
4715		if (x==enc_EXCEPTION) {
4716		    return -1;
4717		}
4718		else if (x==enc_FAILED) {
4719		    Py_DECREF(repunicode);
4720		    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4721		    return -1;
4722		}
4723	    }
4724	    *inpos = newpos;
4725	    Py_DECREF(repunicode);
4726    }
4727    return 0;
4728}
4729
4730PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
4731				  Py_ssize_t size,
4732				  PyObject *mapping,
4733				  const char *errors)
4734{
4735    /* output object */
4736    PyObject *res = NULL;
4737    /* current input position */
4738    Py_ssize_t inpos = 0;
4739    /* current output position */
4740    Py_ssize_t respos = 0;
4741    PyObject *errorHandler = NULL;
4742    PyObject *exc = NULL;
4743    /* the following variable is used for caching string comparisons
4744     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4745     * 3=ignore, 4=xmlcharrefreplace */
4746    int known_errorHandler = -1;
4747
4748    /* Default to Latin-1 */
4749    if (mapping == NULL)
4750	return PyUnicode_EncodeLatin1(p, size, errors);
4751
4752    /* allocate enough for a simple encoding without
4753       replacements, if we need more, we'll resize */
4754    res = PyBytes_FromStringAndSize(NULL, size);
4755    if (res == NULL)
4756        goto onError;
4757    if (size == 0)
4758	return res;
4759
4760    while (inpos<size) {
4761	/* try to encode it */
4762	charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4763	if (x==enc_EXCEPTION) /* error */
4764	    goto onError;
4765	if (x==enc_FAILED) { /* unencodable character */
4766	    if (charmap_encoding_error(p, size, &inpos, mapping,
4767		&exc,
4768		&known_errorHandler, &errorHandler, errors,
4769		&res, &respos)) {
4770		goto onError;
4771	    }
4772	}
4773	else
4774	    /* done with this character => adjust input position */
4775	    ++inpos;
4776    }
4777
4778    /* Resize if we allocated to much */
4779    if (respos<PyBytes_GET_SIZE(res))
4780	_PyBytes_Resize(&res, respos);
4781
4782    Py_XDECREF(exc);
4783    Py_XDECREF(errorHandler);
4784    return res;
4785
4786    onError:
4787    Py_XDECREF(res);
4788    Py_XDECREF(exc);
4789    Py_XDECREF(errorHandler);
4790    return NULL;
4791}
4792
4793PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4794				    PyObject *mapping)
4795{
4796    if (!PyUnicode_Check(unicode) || mapping == NULL) {
4797	PyErr_BadArgument();
4798	return NULL;
4799    }
4800    return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4801				   PyUnicode_GET_SIZE(unicode),
4802				   mapping,
4803				   NULL);
4804}
4805
4806/* create or adjust a UnicodeTranslateError */
4807static void make_translate_exception(PyObject **exceptionObject,
4808    const Py_UNICODE *unicode, Py_ssize_t size,
4809    Py_ssize_t startpos, Py_ssize_t endpos,
4810    const char *reason)
4811{
4812    if (*exceptionObject == NULL) {
4813    	*exceptionObject = PyUnicodeTranslateError_Create(
4814	    unicode, size, startpos, endpos, reason);
4815    }
4816    else {
4817	if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4818	    goto onError;
4819	if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4820	    goto onError;
4821	if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4822	    goto onError;
4823	return;
4824	onError:
4825	Py_DECREF(*exceptionObject);
4826	*exceptionObject = NULL;
4827    }
4828}
4829
4830/* raises a UnicodeTranslateError */
4831static void raise_translate_exception(PyObject **exceptionObject,
4832    const Py_UNICODE *unicode, Py_ssize_t size,
4833    Py_ssize_t startpos, Py_ssize_t endpos,
4834    const char *reason)
4835{
4836    make_translate_exception(exceptionObject,
4837	unicode, size, startpos, endpos, reason);
4838    if (*exceptionObject != NULL)
4839	PyCodec_StrictErrors(*exceptionObject);
4840}
4841
4842/* error handling callback helper:
4843   build arguments, call the callback and check the arguments,
4844   put the result into newpos and return the replacement string, which
4845   has to be freed by the caller */
4846static PyObject *unicode_translate_call_errorhandler(const char *errors,
4847    PyObject **errorHandler,
4848    const char *reason,
4849    const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4850    Py_ssize_t startpos, Py_ssize_t endpos,
4851    Py_ssize_t *newpos)
4852{
4853    static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
4854
4855    Py_ssize_t i_newpos;
4856    PyObject *restuple;
4857    PyObject *resunicode;
4858
4859    if (*errorHandler == NULL) {
4860	*errorHandler = PyCodec_LookupError(errors);
4861        if (*errorHandler == NULL)
4862	    return NULL;
4863    }
4864
4865    make_translate_exception(exceptionObject,
4866	unicode, size, startpos, endpos, reason);
4867    if (*exceptionObject == NULL)
4868	return NULL;
4869
4870    restuple = PyObject_CallFunctionObjArgs(
4871	*errorHandler, *exceptionObject, NULL);
4872    if (restuple == NULL)
4873	return NULL;
4874    if (!PyTuple_Check(restuple)) {
4875	PyErr_Format(PyExc_TypeError, &argparse[4]);
4876	Py_DECREF(restuple);
4877	return NULL;
4878    }
4879    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
4880	&resunicode, &i_newpos)) {
4881	Py_DECREF(restuple);
4882	return NULL;
4883    }
4884    if (i_newpos<0)
4885	*newpos = size+i_newpos;
4886    else
4887        *newpos = i_newpos;
4888    if (*newpos<0 || *newpos>size) {
4889	PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4890	Py_DECREF(restuple);
4891	return NULL;
4892    }
4893    Py_INCREF(resunicode);
4894    Py_DECREF(restuple);
4895    return resunicode;
4896}
4897
4898/* Lookup the character ch in the mapping and put the result in result,
4899   which must be decrefed by the caller.
4900   Return 0 on success, -1 on error */
4901static
4902int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4903{
4904    PyObject *w = PyLong_FromLong((long)c);
4905    PyObject *x;
4906
4907    if (w == NULL)
4908	 return -1;
4909    x = PyObject_GetItem(mapping, w);
4910    Py_DECREF(w);
4911    if (x == NULL) {
4912	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4913	    /* No mapping found means: use 1:1 mapping. */
4914	    PyErr_Clear();
4915	    *result = NULL;
4916	    return 0;
4917	} else
4918	    return -1;
4919    }
4920    else if (x == Py_None) {
4921	*result = x;
4922	return 0;
4923    }
4924    else if (PyLong_Check(x)) {
4925	long value = PyLong_AS_LONG(x);
4926	long max = PyUnicode_GetMax();
4927	if (value < 0 || value > max) {
4928	    PyErr_Format(PyExc_TypeError,
4929                         "character mapping must be in range(0x%x)", max+1);
4930	    Py_DECREF(x);
4931	    return -1;
4932	}
4933	*result = x;
4934	return 0;
4935    }
4936    else if (PyUnicode_Check(x)) {
4937	*result = x;
4938	return 0;
4939    }
4940    else {
4941	/* wrong return value */
4942	PyErr_SetString(PyExc_TypeError,
4943	      "character mapping must return integer, None or unicode");
4944	Py_DECREF(x);
4945	return -1;
4946    }
4947}
4948/* ensure that *outobj is at least requiredsize characters long,
4949if not reallocate and adjust various state variables.
4950Return 0 on success, -1 on error */
4951static
4952int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
4953    Py_ssize_t requiredsize)
4954{
4955    Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
4956    if (requiredsize > oldsize) {
4957	/* remember old output position */
4958	Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4959	/* exponentially overallocate to minimize reallocations */
4960	if (requiredsize < 2 * oldsize)
4961	    requiredsize = 2 * oldsize;
4962	if (_PyUnicode_Resize(outobj, requiredsize) < 0)
4963	    return -1;
4964	*outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
4965    }
4966    return 0;
4967}
4968/* lookup the character, put the result in the output string and adjust
4969   various state variables. Return a new reference to the object that
4970   was put in the output buffer in *result, or Py_None, if the mapping was
4971   undefined (in which case no character was written).
4972   The called must decref result.
4973   Return 0 on success, -1 on error. */
4974static
4975int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
4976    Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4977    PyObject **res)
4978{
4979    if (charmaptranslate_lookup(*curinp, mapping, res))
4980	return -1;
4981    if (*res==NULL) {
4982	/* not found => default to 1:1 mapping */
4983	*(*outp)++ = *curinp;
4984    }
4985    else if (*res==Py_None)
4986	;
4987    else if (PyLong_Check(*res)) {
4988	/* no overflow check, because we know that the space is enough */
4989	*(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
4990    }
4991    else if (PyUnicode_Check(*res)) {
4992	Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4993	if (repsize==1) {
4994	    /* no overflow check, because we know that the space is enough */
4995	    *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4996	}
4997	else if (repsize!=0) {
4998	    /* more than one character */
4999	    Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5000		(insize - (curinp-startinp)) +
5001		repsize - 1;
5002	    if (charmaptranslate_makespace(outobj, outp, requiredsize))
5003		return -1;
5004	    memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5005	    *outp += repsize;
5006	}
5007    }
5008    else
5009	return -1;
5010    return 0;
5011}
5012
5013PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
5014				     Py_ssize_t size,
5015				     PyObject *mapping,
5016				     const char *errors)
5017{
5018    /* output object */
5019    PyObject *res = NULL;
5020    /* pointers to the beginning and end+1 of input */
5021    const Py_UNICODE *startp = p;
5022    const Py_UNICODE *endp = p + size;
5023    /* pointer into the output */
5024    Py_UNICODE *str;
5025    /* current output position */
5026    Py_ssize_t respos = 0;
5027    char *reason = "character maps to <undefined>";
5028    PyObject *errorHandler = NULL;
5029    PyObject *exc = NULL;
5030    /* the following variable is used for caching string comparisons
5031     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5032     * 3=ignore, 4=xmlcharrefreplace */
5033    int known_errorHandler = -1;
5034
5035    if (mapping == NULL) {
5036	PyErr_BadArgument();
5037	return NULL;
5038    }
5039
5040    /* allocate enough for a simple 1:1 translation without
5041       replacements, if we need more, we'll resize */
5042    res = PyUnicode_FromUnicode(NULL, size);
5043    if (res == NULL)
5044	goto onError;
5045    if (size == 0)
5046	return res;
5047    str = PyUnicode_AS_UNICODE(res);
5048
5049    while (p<endp) {
5050	/* try to encode it */
5051	PyObject *x = NULL;
5052	if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5053	    Py_XDECREF(x);
5054	    goto onError;
5055	}
5056	Py_XDECREF(x);
5057	if (x!=Py_None) /* it worked => adjust input pointer */
5058	    ++p;
5059	else { /* untranslatable character */
5060	    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5061	    Py_ssize_t repsize;
5062	    Py_ssize_t newpos;
5063	    Py_UNICODE *uni2;
5064	    /* startpos for collecting untranslatable chars */
5065	    const Py_UNICODE *collstart = p;
5066	    const Py_UNICODE *collend = p+1;
5067	    const Py_UNICODE *coll;
5068
5069	    /* find all untranslatable characters */
5070	    while (collend < endp) {
5071		if (charmaptranslate_lookup(*collend, mapping, &x))
5072		    goto onError;
5073		Py_XDECREF(x);
5074		if (x!=Py_None)
5075		    break;
5076		++collend;
5077	    }
5078	    /* cache callback name lookup
5079	     * (if not done yet, i.e. it's the first error) */
5080	    if (known_errorHandler==-1) {
5081		if ((errors==NULL) || (!strcmp(errors, "strict")))
5082		    known_errorHandler = 1;
5083		else if (!strcmp(errors, "replace"))
5084		    known_errorHandler = 2;
5085		else if (!strcmp(errors, "ignore"))
5086		    known_errorHandler = 3;
5087		else if (!strcmp(errors, "xmlcharrefreplace"))
5088		    known_errorHandler = 4;
5089		else
5090		    known_errorHandler = 0;
5091	    }
5092	    switch (known_errorHandler) {
5093		case 1: /* strict */
5094		    raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
5095		    goto onError;
5096		case 2: /* replace */
5097		    /* No need to check for space, this is a 1:1 replacement */
5098		    for (coll = collstart; coll<collend; ++coll)
5099			*str++ = '?';
5100		    /* fall through */
5101		case 3: /* ignore */
5102		    p = collend;
5103		    break;
5104		case 4: /* xmlcharrefreplace */
5105		    /* generate replacement (temporarily (mis)uses p) */
5106		    for (p = collstart; p < collend; ++p) {
5107			char buffer[2+29+1+1];
5108			char *cp;
5109			sprintf(buffer, "&#%d;", (int)*p);
5110			if (charmaptranslate_makespace(&res, &str,
5111			    (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5112			    goto onError;
5113			for (cp = buffer; *cp; ++cp)
5114			    *str++ = *cp;
5115		    }
5116		    p = collend;
5117		    break;
5118		default:
5119		    repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5120			reason, startp, size, &exc,
5121			collstart-startp, collend-startp, &newpos);
5122		    if (repunicode == NULL)
5123			goto onError;
5124		    /* generate replacement  */
5125		    repsize = PyUnicode_GET_SIZE(repunicode);
5126		    if (charmaptranslate_makespace(&res, &str,
5127			(str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5128			Py_DECREF(repunicode);
5129			goto onError;
5130		    }
5131		    for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5132			*str++ = *uni2;
5133		    p = startp + newpos;
5134		    Py_DECREF(repunicode);
5135	    }
5136	}
5137    }
    /* Resize if we allocated too much */
5139    respos = str-PyUnicode_AS_UNICODE(res);
5140    if (respos<PyUnicode_GET_SIZE(res)) {
5141	if (_PyUnicode_Resize(&res, respos) < 0)
5142	    goto onError;
5143    }
5144    Py_XDECREF(exc);
5145    Py_XDECREF(errorHandler);
5146    return res;
5147
5148    onError:
5149    Py_XDECREF(res);
5150    Py_XDECREF(exc);
5151    Py_XDECREF(errorHandler);
5152    return NULL;
5153}
5154
5155PyObject *PyUnicode_Translate(PyObject *str,
5156			      PyObject *mapping,
5157			      const char *errors)
5158{
5159    PyObject *result;
5160
5161    str = PyUnicode_FromObject(str);
5162    if (str == NULL)
5163	goto onError;
5164    result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5165					PyUnicode_GET_SIZE(str),
5166					mapping,
5167					errors);
5168    Py_DECREF(str);
5169    return result;
5170
5171 onError:
5172    Py_XDECREF(str);
5173    return NULL;
5174}
5175
5176/* --- Decimal Encoder ---------------------------------------------------- */
5177
5178int PyUnicode_EncodeDecimal(Py_UNICODE *s,
5179			    Py_ssize_t length,
5180			    char *output,
5181			    const char *errors)
5182{
5183    Py_UNICODE *p, *end;
5184    PyObject *errorHandler = NULL;
5185    PyObject *exc = NULL;
5186    const char *encoding = "decimal";
5187    const char *reason = "invalid decimal Unicode string";
5188    /* the following variable is used for caching string comparisons
5189     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5190    int known_errorHandler = -1;
5191
5192    if (output == NULL) {
5193	PyErr_BadArgument();
5194	return -1;
5195    }
5196
5197    p = s;
5198    end = s + length;
5199    while (p < end) {
5200	register Py_UNICODE ch = *p;
5201	int decimal;
5202	PyObject *repunicode;
5203	Py_ssize_t repsize;
5204	Py_ssize_t newpos;
5205	Py_UNICODE *uni2;
5206	Py_UNICODE *collstart;
5207	Py_UNICODE *collend;
5208
5209	if (Py_UNICODE_ISSPACE(ch)) {
5210	    *output++ = ' ';
5211	    ++p;
5212	    continue;
5213	}
5214	decimal = Py_UNICODE_TODECIMAL(ch);
5215	if (decimal >= 0) {
5216	    *output++ = '0' + decimal;
5217	    ++p;
5218	    continue;
5219	}
5220	if (0 < ch && ch < 256) {
5221	    *output++ = (char)ch;
5222	    ++p;
5223	    continue;
5224	}
5225	/* All other characters are considered unencodable */
5226	collstart = p;
5227	collend = p+1;
5228	while (collend < end) {
5229	    if ((0 < *collend && *collend < 256) ||
5230	        !Py_UNICODE_ISSPACE(*collend) ||
5231	        Py_UNICODE_TODECIMAL(*collend))
5232		break;
5233	}
5234	/* cache callback name lookup
5235	 * (if not done yet, i.e. it's the first error) */
5236	if (known_errorHandler==-1) {
5237	    if ((errors==NULL) || (!strcmp(errors, "strict")))
5238		known_errorHandler = 1;
5239	    else if (!strcmp(errors, "replace"))
5240		known_errorHandler = 2;
5241	    else if (!strcmp(errors, "ignore"))
5242		known_errorHandler = 3;
5243	    else if (!strcmp(errors, "xmlcharrefreplace"))
5244		known_errorHandler = 4;
5245	    else
5246		known_errorHandler = 0;
5247	}
5248	switch (known_errorHandler) {
5249	    case 1: /* strict */
5250		raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5251		goto onError;
5252	    case 2: /* replace */
5253		for (p = collstart; p < collend; ++p)
5254		    *output++ = '?';
5255		/* fall through */
5256	    case 3: /* ignore */
5257		p = collend;
5258		break;
5259	    case 4: /* xmlcharrefreplace */
5260		/* generate replacement (temporarily (mis)uses p) */
5261		for (p = collstart; p < collend; ++p)
5262		    output += sprintf(output, "&#%d;", (int)*p);
5263		p = collend;
5264		break;
5265	    default:
5266		repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5267		    encoding, reason, s, length, &exc,
5268		    collstart-s, collend-s, &newpos);
5269		if (repunicode == NULL)
5270		    goto onError;
5271		/* generate replacement  */
5272		repsize = PyUnicode_GET_SIZE(repunicode);
5273		for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5274		    Py_UNICODE ch = *uni2;
5275		    if (Py_UNICODE_ISSPACE(ch))
5276			*output++ = ' ';
5277		    else {
5278			decimal = Py_UNICODE_TODECIMAL(ch);
5279			if (decimal >= 0)
5280			    *output++ = '0' + decimal;
5281			else if (0 < ch && ch < 256)
5282			    *output++ = (char)ch;
5283			else {
5284			    Py_DECREF(repunicode);
5285			    raise_encode_exception(&exc, encoding,
5286				s, length, collstart-s, collend-s, reason);
5287			    goto onError;
5288			}
5289		    }
5290		}
5291		p = s + newpos;
5292		Py_DECREF(repunicode);
5293	}
5294    }
5295    /* 0-terminate the output string */
5296    *output++ = '\0';
5297    Py_XDECREF(exc);
5298    Py_XDECREF(errorHandler);
5299    return 0;
5300
5301 onError:
5302    Py_XDECREF(exc);
5303    Py_XDECREF(errorHandler);
5304    return -1;
5305}
5306
5307/* --- Helpers ------------------------------------------------------------ */
5308
5309#include "stringlib/unicodedefs.h"
5310#include "stringlib/fastsearch.h"
5311#include "stringlib/count.h"
5312/* Include _ParseTupleFinds from find.h */
5313#define FROM_UNICODE
5314#include "stringlib/find.h"
5315#include "stringlib/partition.h"
5316
5317#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
5318#include "stringlib/localeutil.h"
5319
/* Helper macro that normalizes a start/end slice pair against
   (obj)->length.  It reads and writes variables named `start` and `end`
   that must be in scope where the macro is expanded: a negative value has
   the length added to it and is then floored at 0; `end` is additionally
   capped at the length before that adjustment.  `start` is not capped. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
5332
5333Py_ssize_t PyUnicode_Count(PyObject *str,
5334                           PyObject *substr,
5335                           Py_ssize_t start,
5336                           Py_ssize_t end)
5337{
5338    Py_ssize_t result;
5339    PyUnicodeObject* str_obj;
5340    PyUnicodeObject* sub_obj;
5341
5342    str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5343    if (!str_obj)
5344	return -1;
5345    sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5346    if (!sub_obj) {
5347	Py_DECREF(str_obj);
5348	return -1;
5349    }
5350
5351    FIX_START_END(str_obj);
5352
5353    result = stringlib_count(
5354        str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5355        );
5356
5357    Py_DECREF(sub_obj);
5358    Py_DECREF(str_obj);
5359
5360    return result;
5361}
5362
5363Py_ssize_t PyUnicode_Find(PyObject *str,
5364                          PyObject *sub,
5365                          Py_ssize_t start,
5366                          Py_ssize_t end,
5367                          int direction)
5368{
5369    Py_ssize_t result;
5370
5371    str = PyUnicode_FromObject(str);
5372    if (!str)
5373	return -2;
5374    sub = PyUnicode_FromObject(sub);
5375    if (!sub) {
5376	Py_DECREF(str);
5377	return -2;
5378    }
5379
5380    if (direction > 0)
5381        result = stringlib_find_slice(
5382            PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5383            PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5384            start, end
5385            );
5386    else
5387        result = stringlib_rfind_slice(
5388            PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5389            PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5390            start, end
5391            );
5392
5393    Py_DECREF(str);
5394    Py_DECREF(sub);
5395
5396    return result;
5397}
5398
5399static
5400int tailmatch(PyUnicodeObject *self,
5401	      PyUnicodeObject *substring,
5402	      Py_ssize_t start,
5403	      Py_ssize_t end,
5404	      int direction)
5405{
5406    if (substring->length == 0)
5407        return 1;
5408
5409    FIX_START_END(self);
5410
5411    end -= substring->length;
5412    if (end < start)
5413	return 0;
5414
5415    if (direction > 0) {
5416	if (Py_UNICODE_MATCH(self, end, substring))
5417	    return 1;
5418    } else {
5419        if (Py_UNICODE_MATCH(self, start, substring))
5420	    return 1;
5421    }
5422
5423    return 0;
5424}
5425
5426Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
5427			PyObject *substr,
5428			Py_ssize_t start,
5429			Py_ssize_t end,
5430			int direction)
5431{
5432    Py_ssize_t result;
5433
5434    str = PyUnicode_FromObject(str);
5435    if (str == NULL)
5436	return -1;
5437    substr = PyUnicode_FromObject(substr);
5438    if (substr == NULL) {
5439	Py_DECREF(str);
5440	return -1;
5441    }
5442
5443    result = tailmatch((PyUnicodeObject *)str,
5444		       (PyUnicodeObject *)substr,
5445		       start, end, direction);
5446    Py_DECREF(str);
5447    Py_DECREF(substr);
5448    return result;
5449}
5450
5451/* Apply fixfct filter to the Unicode object self and return a
5452   reference to the modified object */
5453
5454static
5455PyObject *fixup(PyUnicodeObject *self,
5456		int (*fixfct)(PyUnicodeObject *s))
5457{
5458
5459    PyUnicodeObject *u;
5460
5461    u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5462    if (u == NULL)
5463	return NULL;
5464
5465    Py_UNICODE_COPY(u->str, self->str, self->length);
5466
5467    if (!fixfct(u) && PyUnicode_CheckExact(self)) {
5468	/* fixfct should return TRUE if it modified the buffer. If
5469	   FALSE, return a reference to the original buffer instead
5470	   (to save space, not time) */
5471	Py_INCREF(self);
5472	Py_DECREF(u);
5473	return (PyObject*) self;
5474    }
5475    return (PyObject*) u;
5476}
5477
5478static
5479int fixupper(PyUnicodeObject *self)
5480{
5481    Py_ssize_t len = self->length;
5482    Py_UNICODE *s = self->str;
5483    int status = 0;
5484
5485    while (len-- > 0) {
5486	register Py_UNICODE ch;
5487
5488	ch = Py_UNICODE_TOUPPER(*s);
5489	if (ch != *s) {
5490            status = 1;
5491	    *s = ch;
5492	}
5493        s++;
5494    }
5495
5496    return status;
5497}
5498
5499static
5500int fixlower(PyUnicodeObject *self)
5501{
5502    Py_ssize_t len = self->length;
5503    Py_UNICODE *s = self->str;
5504    int status = 0;
5505
5506    while (len-- > 0) {
5507	register Py_UNICODE ch;
5508
5509	ch = Py_UNICODE_TOLOWER(*s);
5510	if (ch != *s) {
5511            status = 1;
5512	    *s = ch;
5513	}
5514        s++;
5515    }
5516
5517    return status;
5518}
5519
5520static
5521int fixswapcase(PyUnicodeObject *self)
5522{
5523    Py_ssize_t len = self->length;
5524    Py_UNICODE *s = self->str;
5525    int status = 0;
5526
5527    while (len-- > 0) {
5528        if (Py_UNICODE_ISUPPER(*s)) {
5529            *s = Py_UNICODE_TOLOWER(*s);
5530            status = 1;
5531        } else if (Py_UNICODE_ISLOWER(*s)) {
5532            *s = Py_UNICODE_TOUPPER(*s);
5533            status = 1;
5534        }
5535        s++;
5536    }
5537
5538    return status;
5539}
5540
5541static
5542int fixcapitalize(PyUnicodeObject *self)
5543{
5544    Py_ssize_t len = self->length;
5545    Py_UNICODE *s = self->str;
5546    int status = 0;
5547
5548    if (len == 0)
5549	return 0;
5550    if (Py_UNICODE_ISLOWER(*s)) {
5551	*s = Py_UNICODE_TOUPPER(*s);
5552	status = 1;
5553    }
5554    s++;
5555    while (--len > 0) {
5556        if (Py_UNICODE_ISUPPER(*s)) {
5557            *s = Py_UNICODE_TOLOWER(*s);
5558            status = 1;
5559        }
5560        s++;
5561    }
5562    return status;
5563}
5564
5565static
5566int fixtitle(PyUnicodeObject *self)
5567{
5568    register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5569    register Py_UNICODE *e;
5570    int previous_is_cased;
5571
5572    /* Shortcut for single character strings */
5573    if (PyUnicode_GET_SIZE(self) == 1) {
5574	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5575	if (*p != ch) {
5576	    *p = ch;
5577	    return 1;
5578	}
5579	else
5580	    return 0;
5581    }
5582
5583    e = p + PyUnicode_GET_SIZE(self);
5584    previous_is_cased = 0;
5585    for (; p < e; p++) {
5586	register const Py_UNICODE ch = *p;
5587
5588	if (previous_is_cased)
5589	    *p = Py_UNICODE_TOLOWER(ch);
5590	else
5591	    *p = Py_UNICODE_TOTITLE(ch);
5592
5593	if (Py_UNICODE_ISLOWER(ch) ||
5594	    Py_UNICODE_ISUPPER(ch) ||
5595	    Py_UNICODE_ISTITLE(ch))
5596	    previous_is_cased = 1;
5597	else
5598	    previous_is_cased = 0;
5599    }
5600    return 1;
5601}
5602
5603PyObject *
5604PyUnicode_Join(PyObject *separator, PyObject *seq)
5605{
5606    PyObject *internal_separator = NULL;
5607    const Py_UNICODE blank = ' ';
5608    const Py_UNICODE *sep = &blank;
5609    Py_ssize_t seplen = 1;
5610    PyUnicodeObject *res = NULL; /* the result */
5611    Py_ssize_t res_alloc = 100;  /* # allocated bytes for string in res */
5612    Py_ssize_t res_used;         /* # used bytes */
5613    Py_UNICODE *res_p;       /* pointer to free byte in res's string area */
5614    PyObject *fseq;          /* PySequence_Fast(seq) */
5615    Py_ssize_t seqlen;              /* len(fseq) -- number of items in sequence */
5616    PyObject *item;
5617    Py_ssize_t i;
5618
5619    fseq = PySequence_Fast(seq, "");
5620    if (fseq == NULL) {
5621    	return NULL;
5622    }
5623
5624    /* Grrrr.  A codec may be invoked to convert str objects to
5625     * Unicode, and so it's possible to call back into Python code
5626     * during PyUnicode_FromObject(), and so it's possible for a sick
5627     * codec to change the size of fseq (if seq is a list).  Therefore
5628     * we have to keep refetching the size -- can't assume seqlen
5629     * is invariant.
5630     */
5631    seqlen = PySequence_Fast_GET_SIZE(fseq);
5632    /* If empty sequence, return u"". */
5633    if (seqlen == 0) {
5634    	res = _PyUnicode_New(0);  /* empty sequence; return u"" */
5635    	goto Done;
5636    }
5637    /* If singleton sequence with an exact Unicode, return that. */
5638    if (seqlen == 1) {
5639	item = PySequence_Fast_GET_ITEM(fseq, 0);
5640	if (PyUnicode_CheckExact(item)) {
5641	    Py_INCREF(item);
5642	    res = (PyUnicodeObject *)item;
5643	    goto Done;
5644	}
5645    }
5646
5647    /* At least two items to join, or one that isn't exact Unicode. */
5648    if (seqlen > 1) {
5649        /* Set up sep and seplen -- they're needed. */
5650    	if (separator == NULL) {
5651	    sep = &blank;
5652	    seplen = 1;
5653        }
5654    	else {
5655	    internal_separator = PyUnicode_FromObject(separator);
5656	    if (internal_separator == NULL)
5657	        goto onError;
5658	    sep = PyUnicode_AS_UNICODE(internal_separator);
5659	    seplen = PyUnicode_GET_SIZE(internal_separator);
5660	    /* In case PyUnicode_FromObject() mutated seq. */
5661	    seqlen = PySequence_Fast_GET_SIZE(fseq);
5662        }
5663    }
5664
5665    /* Get space. */
5666    res = _PyUnicode_New(res_alloc);
5667    if (res == NULL)
5668        goto onError;
5669    res_p = PyUnicode_AS_UNICODE(res);
5670    res_used = 0;
5671
5672    for (i = 0; i < seqlen; ++i) {
5673	Py_ssize_t itemlen;
5674	Py_ssize_t new_res_used;
5675
5676	item = PySequence_Fast_GET_ITEM(fseq, i);
5677	/* Convert item to Unicode. */
5678	if (!PyUnicode_Check(item)) {
5679	    PyErr_Format(PyExc_TypeError,
5680			 "sequence item %zd: expected str instance,"
5681			 " %.80s found",
5682			 i, Py_TYPE(item)->tp_name);
5683	    goto onError;
5684	}
5685	item = PyUnicode_FromObject(item);
5686	if (item == NULL)
5687	    goto onError;
5688	/* We own a reference to item from here on. */
5689
5690	/* In case PyUnicode_FromObject() mutated seq. */
5691	seqlen = PySequence_Fast_GET_SIZE(fseq);
5692
5693        /* Make sure we have enough space for the separator and the item. */
5694	itemlen = PyUnicode_GET_SIZE(item);
5695	new_res_used = res_used + itemlen;
5696	if (new_res_used < 0)
5697	    goto Overflow;
5698	if (i < seqlen - 1) {
5699	    new_res_used += seplen;
5700	    if (new_res_used < 0)
5701		goto Overflow;
5702	}
5703	if (new_res_used > res_alloc) {
5704	    /* double allocated size until it's big enough */
5705	    do {
5706	        res_alloc += res_alloc;
5707	        if (res_alloc <= 0)
5708	            goto Overflow;
5709	    } while (new_res_used > res_alloc);
5710	    if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5711		Py_DECREF(item);
5712		goto onError;
5713	    }
5714            res_p = PyUnicode_AS_UNICODE(res) + res_used;
5715	}
5716
5717	/* Copy item, and maybe the separator. */
5718	Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5719	res_p += itemlen;
5720	if (i < seqlen - 1) {
5721	    Py_UNICODE_COPY(res_p, sep, seplen);
5722	    res_p += seplen;
5723	}
5724	Py_DECREF(item);
5725	res_used = new_res_used;
5726    }
5727
5728    /* Shrink res to match the used area; this probably can't fail,
5729     * but it's cheap to check.
5730     */
5731    if (_PyUnicode_Resize(&res, res_used) < 0)
5732	goto onError;
5733
5734 Done:
5735    Py_XDECREF(internal_separator);
5736    Py_DECREF(fseq);
5737    return (PyObject *)res;
5738
5739 Overflow:
5740    PyErr_SetString(PyExc_OverflowError,
5741                    "join() result is too long for a Python string");
5742    Py_DECREF(item);
5743    /* fall through */
5744
5745 onError:
5746    Py_XDECREF(internal_separator);
5747    Py_DECREF(fseq);
5748    Py_XDECREF(res);
5749    return NULL;
5750}
5751
5752static
5753PyUnicodeObject *pad(PyUnicodeObject *self,
5754		     Py_ssize_t left,
5755		     Py_ssize_t right,
5756		     Py_UNICODE fill)
5757{
5758    PyUnicodeObject *u;
5759
5760    if (left < 0)
5761        left = 0;
5762    if (right < 0)
5763        right = 0;
5764
5765    if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
5766        Py_INCREF(self);
5767        return self;
5768    }
5769
5770    u = _PyUnicode_New(left + self->length + right);
5771    if (u) {
5772        if (left)
5773            Py_UNICODE_FILL(u->str, fill, left);
5774        Py_UNICODE_COPY(u->str + left, self->str, self->length);
5775        if (right)
5776            Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5777    }
5778
5779    return u;
5780}
5781
/* Append the slice data[left:right] to `list` as a new unicode object.

   The macro relies on three names at the expansion site: a PyObject *
   scratch variable `str`, the target `list`, and a label `onError` that is
   jumped to if either the object creation or the append fails.  The
   temporary reference held in `str` is released on both outcomes of the
   append. */
#define SPLIT_APPEND(data, left, right)                                 \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (str == NULL)                                                \
            goto onError;                                               \
        if (PyList_Append(list, str) == 0) {                            \
            Py_DECREF(str);                                             \
        }                                                               \
        else {                                                          \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }
5792
5793static
5794PyObject *split_whitespace(PyUnicodeObject *self,
5795			   PyObject *list,
5796			   Py_ssize_t maxcount)
5797{
5798    register Py_ssize_t i;
5799    register Py_ssize_t j;
5800    Py_ssize_t len = self->length;
5801    PyObject *str;
5802    register const Py_UNICODE *buf = self->str;
5803
5804    for (i = j = 0; i < len; ) {
5805	/* find a token */
5806	while (i < len && Py_UNICODE_ISSPACE(buf[i]))
5807	    i++;
5808	j = i;
5809	while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
5810	    i++;
5811	if (j < i) {
5812	    if (maxcount-- <= 0)
5813		break;
5814	    SPLIT_APPEND(buf, j, i);
5815	    while (i < len && Py_UNICODE_ISSPACE(buf[i]))
5816		i++;
5817	    j = i;
5818	}
5819    }
5820    if (j < len) {
5821	SPLIT_APPEND(buf, j, len);
5822    }
5823    return list;
5824
5825 onError:
5826    Py_DECREF(list);
5827    return NULL;
5828}
5829
5830PyObject *PyUnicode_Splitlines(PyObject *string,
5831			       int keepends)
5832{
5833    register Py_ssize_t i;
5834    register Py_ssize_t j;
5835    Py_ssize_t len;
5836    PyObject *list;
5837    PyObject *str;
5838    Py_UNICODE *data;
5839
5840    string = PyUnicode_FromObject(string);
5841    if (string == NULL)
5842	return NULL;
5843    data = PyUnicode_AS_UNICODE(string);
5844    len = PyUnicode_GET_SIZE(string);
5845
5846    list = PyList_New(0);
5847    if (!list)
5848        goto onError;
5849
5850    for (i = j = 0; i < len; ) {
5851	Py_ssize_t eol;
5852
5853	/* Find a line and append it */
5854	while (i < len && !BLOOM_LINEBREAK(data[i]))
5855	    i++;
5856
5857	/* Skip the line break reading CRLF as one line break */
5858	eol = i;
5859	if (i < len) {
5860	    if (data[i] == '\r' && i + 1 < len &&
5861		data[i+1] == '\n')
5862		i += 2;
5863	    else
5864		i++;
5865	    if (keepends)
5866		eol = i;
5867	}
5868	SPLIT_APPEND(data, j, eol);
5869	j = i;
5870    }
5871    if (j < len) {
5872	SPLIT_APPEND(data, j, len);
5873    }
5874
5875    Py_DECREF(string);
5876    return list;
5877
5878 onError:
5879    Py_XDECREF(list);
5880    Py_DECREF(string);
5881    return NULL;
5882}
5883
5884static
5885PyObject *split_char(PyUnicodeObject *self,
5886		     PyObject *list,
5887		     Py_UNICODE ch,
5888		     Py_ssize_t maxcount)
5889{
5890    register Py_ssize_t i;
5891    register Py_ssize_t j;
5892    Py_ssize_t len = self->length;
5893    PyObject *str;
5894    register const Py_UNICODE *buf = self->str;
5895
5896    for (i = j = 0; i < len; ) {
5897	if (buf[i] == ch) {
5898	    if (maxcount-- <= 0)
5899		break;
5900	    SPLIT_APPEND(buf, j, i);
5901	    i = j = i + 1;
5902	} else
5903	    i++;
5904    }
5905    if (j <= len) {
5906	SPLIT_APPEND(buf, j, len);
5907    }
5908    return list;
5909
5910 onError:
5911    Py_DECREF(list);
5912    return NULL;
5913}
5914
5915static
5916PyObject *split_substring(PyUnicodeObject *self,
5917			  PyObject *list,
5918			  PyUnicodeObject *substring,
5919			  Py_ssize_t maxcount)
5920{
5921    register Py_ssize_t i;
5922    register Py_ssize_t j;
5923    Py_ssize_t len = self->length;
5924    Py_ssize_t sublen = substring->length;
5925    PyObject *str;
5926
5927    for (i = j = 0; i <= len - sublen; ) {
5928	if (Py_UNICODE_MATCH(self, i, substring)) {
5929	    if (maxcount-- <= 0)
5930		break;
5931	    SPLIT_APPEND(self->str, j, i);
5932	    i = j = i + sublen;
5933	} else
5934	    i++;
5935    }
5936    if (j <= len) {
5937	SPLIT_APPEND(self->str, j, len);
5938    }
5939    return list;
5940
5941 onError:
5942    Py_DECREF(list);
5943    return NULL;
5944}
5945
5946static
5947PyObject *rsplit_whitespace(PyUnicodeObject *self,
5948			    PyObject *list,
5949			    Py_ssize_t maxcount)
5950{
5951    register Py_ssize_t i;
5952    register Py_ssize_t j;
5953    Py_ssize_t len = self->length;
5954    PyObject *str;
5955    register const Py_UNICODE *buf = self->str;
5956
5957    for (i = j = len - 1; i >= 0; ) {
5958	/* find a token */
5959	while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
5960	    i--;
5961	j = i;
5962	while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
5963	    i--;
5964	if (j > i) {
5965	    if (maxcount-- <= 0)
5966		break;
5967	    SPLIT_APPEND(buf, i + 1, j + 1);
5968	    while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
5969		i--;
5970	    j = i;
5971	}
5972    }
5973    if (j >= 0) {
5974	SPLIT_APPEND(buf, 0, j + 1);
5975    }
5976    if (PyList_Reverse(list) < 0)
5977        goto onError;
5978    return list;
5979
5980 onError:
5981    Py_DECREF(list);
5982    return NULL;
5983}
5984
5985static
5986PyObject *rsplit_char(PyUnicodeObject *self,
5987		      PyObject *list,
5988		      Py_UNICODE ch,
5989		      Py_ssize_t maxcount)
5990{
5991    register Py_ssize_t i;
5992    register Py_ssize_t j;
5993    Py_ssize_t len = self->length;
5994    PyObject *str;
5995    register const Py_UNICODE *buf = self->str;
5996
5997    for (i = j = len - 1; i >= 0; ) {
5998	if (buf[i] == ch) {
5999	    if (maxcount-- <= 0)
6000		break;
6001	    SPLIT_APPEND(buf, i + 1, j + 1);
6002	    j = i = i - 1;
6003	} else
6004	    i--;
6005    }
6006    if (j >= -1) {
6007	SPLIT_APPEND(buf, 0, j + 1);
6008    }
6009    if (PyList_Reverse(list) < 0)
6010        goto onError;
6011    return list;
6012
6013 onError:
6014    Py_DECREF(list);
6015    return NULL;
6016}
6017
6018static
6019PyObject *rsplit_substring(PyUnicodeObject *self,
6020			   PyObject *list,
6021			   PyUnicodeObject *substring,
6022			   Py_ssize_t maxcount)
6023{
6024    register Py_ssize_t i;
6025    register Py_ssize_t j;
6026    Py_ssize_t len = self->length;
6027    Py_ssize_t sublen = substring->length;
6028    PyObject *str;
6029
6030    for (i = len - sublen, j = len; i >= 0; ) {
6031	if (Py_UNICODE_MATCH(self, i, substring)) {
6032	    if (maxcount-- <= 0)
6033		break;
6034	    SPLIT_APPEND(self->str, i + sublen, j);
6035	    j = i;
6036	    i -= sublen;
6037	} else
6038	    i--;
6039    }
6040    if (j >= 0) {
6041	SPLIT_APPEND(self->str, 0, j);
6042    }
6043    if (PyList_Reverse(list) < 0)
6044        goto onError;
6045    return list;
6046
6047 onError:
6048    Py_DECREF(list);
6049    return NULL;
6050}
6051
6052#undef SPLIT_APPEND
6053
6054static
6055PyObject *split(PyUnicodeObject *self,
6056		PyUnicodeObject *substring,
6057		Py_ssize_t maxcount)
6058{
6059    PyObject *list;
6060
6061    if (maxcount < 0)
6062        maxcount = PY_SSIZE_T_MAX;
6063
6064    list = PyList_New(0);
6065    if (!list)
6066        return NULL;
6067
6068    if (substring == NULL)
6069	return split_whitespace(self,list,maxcount);
6070
6071    else if (substring->length == 1)
6072	return split_char(self,list,substring->str[0],maxcount);
6073
6074    else if (substring->length == 0) {
6075	Py_DECREF(list);
6076	PyErr_SetString(PyExc_ValueError, "empty separator");
6077	return NULL;
6078    }
6079    else
6080	return split_substring(self,list,substring,maxcount);
6081}
6082
6083static
6084PyObject *rsplit(PyUnicodeObject *self,
6085		 PyUnicodeObject *substring,
6086		 Py_ssize_t maxcount)
6087{
6088    PyObject *list;
6089
6090    if (maxcount < 0)
6091        maxcount = PY_SSIZE_T_MAX;
6092
6093    list = PyList_New(0);
6094    if (!list)
6095        return NULL;
6096
6097    if (substring == NULL)
6098	return rsplit_whitespace(self,list,maxcount);
6099
6100    else if (substring->length == 1)
6101	return rsplit_char(self,list,substring->str[0],maxcount);
6102
6103    else if (substring->length == 0) {
6104	Py_DECREF(list);
6105	PyErr_SetString(PyExc_ValueError, "empty separator");
6106	return NULL;
6107    }
6108    else
6109	return rsplit_substring(self,list,substring,maxcount);
6110}
6111
6112static
6113PyObject *replace(PyUnicodeObject *self,
6114		  PyUnicodeObject *str1,
6115		  PyUnicodeObject *str2,
6116		  Py_ssize_t maxcount)
6117{
6118    PyUnicodeObject *u;
6119
6120    if (maxcount < 0)
6121	maxcount = PY_SSIZE_T_MAX;
6122
6123    if (str1->length == str2->length) {
6124        /* same length */
6125        Py_ssize_t i;
6126        if (str1->length == 1) {
6127            /* replace characters */
6128            Py_UNICODE u1, u2;
6129            if (!findchar(self->str, self->length, str1->str[0]))
6130                goto nothing;
6131            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6132            if (!u)
6133                return NULL;
6134            Py_UNICODE_COPY(u->str, self->str, self->length);
6135            u1 = str1->str[0];
6136            u2 = str2->str[0];
6137            for (i = 0; i < u->length; i++)
6138                if (u->str[i] == u1) {
6139                    if (--maxcount < 0)
6140                        break;
6141                    u->str[i] = u2;
6142                }
6143        } else {
6144            i = fastsearch(
6145                self->str, self->length, str1->str, str1->length, FAST_SEARCH
6146                );
6147            if (i < 0)
6148                goto nothing;
6149            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6150            if (!u)
6151                return NULL;
6152            Py_UNICODE_COPY(u->str, self->str, self->length);
6153            while (i <= self->length - str1->length)
6154                if (Py_UNICODE_MATCH(self, i, str1)) {
6155                    if (--maxcount < 0)
6156                        break;
6157                    Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6158                    i += str1->length;
6159                } else
6160                    i++;
6161        }
6162    } else {
6163
6164        Py_ssize_t n, i, j, e;
6165        Py_ssize_t product, new_size, delta;
6166        Py_UNICODE *p;
6167
6168        /* replace strings */
6169        n = stringlib_count(self->str, self->length, str1->str, str1->length);
6170        if (n > maxcount)
6171            n = maxcount;
6172        if (n == 0)
6173            goto nothing;
6174        /* new_size = self->length + n * (str2->length - str1->length)); */
6175        delta = (str2->length - str1->length);
6176        if (delta == 0) {
6177            new_size = self->length;
6178        } else {
6179            product = n * (str2->length - str1->length);
6180            if ((product / (str2->length - str1->length)) != n) {
6181                PyErr_SetString(PyExc_OverflowError,
6182                                "replace string is too long");
6183                return NULL;
6184            }
6185            new_size = self->length + product;
6186            if (new_size < 0) {
6187                PyErr_SetString(PyExc_OverflowError,
6188                                "replace string is too long");
6189                return NULL;
6190            }
6191        }
6192        u = _PyUnicode_New(new_size);
6193        if (!u)
6194            return NULL;
6195        i = 0;
6196        p = u->str;
6197        e = self->length - str1->length;
6198        if (str1->length > 0) {
6199            while (n-- > 0) {
6200                /* look for next match */
6201                j = i;
6202                while (j <= e) {
6203                    if (Py_UNICODE_MATCH(self, j, str1))
6204                        break;
6205                    j++;
6206                }
6207		if (j > i) {
6208                    if (j > e)
6209                        break;
6210                    /* copy unchanged part [i:j] */
6211                    Py_UNICODE_COPY(p, self->str+i, j-i);
6212                    p += j - i;
6213                }
6214                /* copy substitution string */
6215                if (str2->length > 0) {
6216                    Py_UNICODE_COPY(p, str2->str, str2->length);
6217                    p += str2->length;
6218                }
6219                i = j + str1->length;
6220            }
6221            if (i < self->length)
6222                /* copy tail [i:] */
6223                Py_UNICODE_COPY(p, self->str+i, self->length-i);
6224        } else {
6225            /* interleave */
6226            while (n > 0) {
6227                Py_UNICODE_COPY(p, str2->str, str2->length);
6228                p += str2->length;
6229                if (--n <= 0)
6230                    break;
6231                *p++ = self->str[i++];
6232            }
6233            Py_UNICODE_COPY(p, self->str+i, self->length-i);
6234        }
6235    }
6236    return (PyObject *) u;
6237
6238nothing:
6239    /* nothing to replace; return original string (when possible) */
6240    if (PyUnicode_CheckExact(self)) {
6241        Py_INCREF(self);
6242        return (PyObject *) self;
6243    }
6244    return PyUnicode_FromUnicode(self->str, self->length);
6245}
6246
6247/* --- Unicode Object Methods --------------------------------------------- */
6248
6249PyDoc_STRVAR(title__doc__,
6250"S.title() -> str\n\
6251\n\
6252Return a titlecased version of S, i.e. words start with title case\n\
6253characters, all remaining cased characters have lower case.");
6254
6255static PyObject*
6256unicode_title(PyUnicodeObject *self)
6257{
6258    return fixup(self, fixtitle);
6259}
6260
6261PyDoc_STRVAR(capitalize__doc__,
6262"S.capitalize() -> str\n\
6263\n\
6264Return a capitalized version of S, i.e. make the first character\n\
6265have upper case.");
6266
6267static PyObject*
6268unicode_capitalize(PyUnicodeObject *self)
6269{
6270    return fixup(self, fixcapitalize);
6271}
6272
6273#if 0
6274PyDoc_STRVAR(capwords__doc__,
6275"S.capwords() -> str\n\
6276\n\
6277Apply .capitalize() to all words in S and return the result with\n\
6278normalized whitespace (all whitespace strings are replaced by ' ').");
6279
6280static PyObject*
6281unicode_capwords(PyUnicodeObject *self)
6282{
6283    PyObject *list;
6284    PyObject *item;
6285    Py_ssize_t i;
6286
6287    /* Split into words */
6288    list = split(self, NULL, -1);
6289    if (!list)
6290        return NULL;
6291
6292    /* Capitalize each word */
6293    for (i = 0; i < PyList_GET_SIZE(list); i++) {
6294        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
6295		     fixcapitalize);
6296        if (item == NULL)
6297            goto onError;
6298        Py_DECREF(PyList_GET_ITEM(list, i));
6299        PyList_SET_ITEM(list, i, item);
6300    }
6301
6302    /* Join the words to form a new string */
6303    item = PyUnicode_Join(NULL, list);
6304
6305onError:
6306    Py_DECREF(list);
6307    return (PyObject *)item;
6308}
6309#endif
6310
6311/* Argument converter.  Coerces to a single unicode character */
6312
6313static int
6314convert_uc(PyObject *obj, void *addr)
6315{
6316	Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6317	PyObject *uniobj;
6318	Py_UNICODE *unistr;
6319
6320	uniobj = PyUnicode_FromObject(obj);
6321	if (uniobj == NULL) {
6322		PyErr_SetString(PyExc_TypeError,
6323			"The fill character cannot be converted to Unicode");
6324		return 0;
6325	}
6326	if (PyUnicode_GET_SIZE(uniobj) != 1) {
6327		PyErr_SetString(PyExc_TypeError,
6328			"The fill character must be exactly one character long");
6329		Py_DECREF(uniobj);
6330		return 0;
6331	}
6332	unistr = PyUnicode_AS_UNICODE(uniobj);
6333	*fillcharloc = unistr[0];
6334	Py_DECREF(uniobj);
6335	return 1;
6336}
6337
6338PyDoc_STRVAR(center__doc__,
6339"S.center(width[, fillchar]) -> str\n\
6340\n\
6341Return S centered in a Unicode string of length width. Padding is\n\
6342done using the specified fill character (default is a space)");
6343
6344static PyObject *
6345unicode_center(PyUnicodeObject *self, PyObject *args)
6346{
6347    Py_ssize_t marg, left;
6348    Py_ssize_t width;
6349    Py_UNICODE fillchar = ' ';
6350
6351    if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
6352        return NULL;
6353
6354    if (self->length >= width && PyUnicode_CheckExact(self)) {
6355        Py_INCREF(self);
6356        return (PyObject*) self;
6357    }
6358
6359    marg = width - self->length;
6360    left = marg / 2 + (marg & width & 1);
6361
6362    return (PyObject*) pad(self, left, marg - left, fillchar);
6363}
6364
6365#if 0
6366
6367/* This code should go into some future Unicode collation support
6368   module. The basic comparison should compare ordinals on a naive
6369   basis (this is what Java does and thus JPython too). */
6370
6371/* speedy UTF-16 code point order comparison */
6372/* gleaned from: */
6373/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6374
6375static short utf16Fixup[32] =
6376{
6377    0, 0, 0, 0, 0, 0, 0, 0,
6378    0, 0, 0, 0, 0, 0, 0, 0,
6379    0, 0, 0, 0, 0, 0, 0, 0,
6380    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
6381};
6382
6383static int
6384unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6385{
6386    Py_ssize_t len1, len2;
6387
6388    Py_UNICODE *s1 = str1->str;
6389    Py_UNICODE *s2 = str2->str;
6390
6391    len1 = str1->length;
6392    len2 = str2->length;
6393
6394    while (len1 > 0 && len2 > 0) {
6395        Py_UNICODE c1, c2;
6396
6397        c1 = *s1++;
6398        c2 = *s2++;
6399
6400	if (c1 > (1<<11) * 26)
6401	    c1 += utf16Fixup[c1>>11];
6402	if (c2 > (1<<11) * 26)
6403            c2 += utf16Fixup[c2>>11];
6404        /* now c1 and c2 are in UTF-32-compatible order */
6405
6406        if (c1 != c2)
6407            return (c1 < c2) ? -1 : 1;
6408
6409        len1--; len2--;
6410    }
6411
6412    return (len1 < len2) ? -1 : (len1 != len2);
6413}
6414
6415#else
6416
6417static int
6418unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6419{
6420    register Py_ssize_t len1, len2;
6421
6422    Py_UNICODE *s1 = str1->str;
6423    Py_UNICODE *s2 = str2->str;
6424
6425    len1 = str1->length;
6426    len2 = str2->length;
6427
6428    while (len1 > 0 && len2 > 0) {
6429        Py_UNICODE c1, c2;
6430
6431        c1 = *s1++;
6432        c2 = *s2++;
6433
6434        if (c1 != c2)
6435            return (c1 < c2) ? -1 : 1;
6436
6437        len1--; len2--;
6438    }
6439
6440    return (len1 < len2) ? -1 : (len1 != len2);
6441}
6442
6443#endif
6444
6445int PyUnicode_Compare(PyObject *left,
6446		      PyObject *right)
6447{
6448    if (PyUnicode_Check(left) && PyUnicode_Check(right))
6449        return unicode_compare((PyUnicodeObject *)left,
6450                               (PyUnicodeObject *)right);
6451    PyErr_Format(PyExc_TypeError,
6452                 "Can't compare %.100s and %.100s",
6453                 left->ob_type->tp_name,
6454                 right->ob_type->tp_name);
6455    return -1;
6456}
6457
6458int
6459PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6460{
6461    int i;
6462    Py_UNICODE *id;
6463    assert(PyUnicode_Check(uni));
6464    id = PyUnicode_AS_UNICODE(uni);
6465    /* Compare Unicode string and source character set string */
6466    for (i = 0; id[i] && str[i]; i++)
6467	if (id[i] != str[i])
6468	    return ((int)id[i] < (int)str[i]) ? -1 : 1;
6469    if (id[i])
6470	return 1; /* uni is longer */
6471    if (str[i])
6472	return -1; /* str is longer */
6473    return 0;
6474}
6475
6476PyObject *PyUnicode_RichCompare(PyObject *left,
6477                                PyObject *right,
6478                                int op)
6479{
6480    int result;
6481
6482    result = PyUnicode_Compare(left, right);
6483    if (result == -1 && PyErr_Occurred())
6484        goto onError;
6485
6486    /* Convert the return value to a Boolean */
6487    switch (op) {
6488    case Py_EQ:
6489        result = (result == 0);
6490        break;
6491    case Py_NE:
6492        result = (result != 0);
6493        break;
6494    case Py_LE:
6495        result = (result <= 0);
6496        break;
6497    case Py_GE:
6498        result = (result >= 0);
6499        break;
6500    case Py_LT:
6501        result = (result == -1);
6502        break;
6503    case Py_GT:
6504        result = (result == 1);
6505        break;
6506    }
6507    return PyBool_FromLong(result);
6508
6509 onError:
6510
6511    /* Standard case
6512
6513       Type errors mean that PyUnicode_FromObject() could not convert
6514       one of the arguments (usually the right hand side) to Unicode,
6515       ie. we can't handle the comparison request. However, it is
6516       possible that the other object knows a comparison method, which
6517       is why we return Py_NotImplemented to give the other object a
6518       chance.
6519
6520    */
6521    if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6522        PyErr_Clear();
6523        Py_INCREF(Py_NotImplemented);
6524        return Py_NotImplemented;
6525    }
6526    if (op != Py_EQ && op != Py_NE)
6527        return NULL;
6528
6529    /* Equality comparison.
6530
6531       This is a special case: we silence any PyExc_UnicodeDecodeError
6532       and instead turn it into a PyErr_UnicodeWarning.
6533
6534    */
6535    if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6536        return NULL;
6537    PyErr_Clear();
6538    if (PyErr_WarnEx(PyExc_UnicodeWarning,
6539                     (op == Py_EQ) ?
6540                     "Unicode equal comparison "
6541                     "failed to convert both arguments to Unicode - "
6542                     "interpreting them as being unequal"
6543                     :
6544                     "Unicode unequal comparison "
6545                     "failed to convert both arguments to Unicode - "
6546                     "interpreting them as being unequal",
6547                     1) < 0)
6548        return NULL;
6549    result = (op == Py_NE);
6550    return PyBool_FromLong(result);
6551}
6552
6553int PyUnicode_Contains(PyObject *container,
6554		       PyObject *element)
6555{
6556    PyObject *str, *sub;
6557    int result;
6558
6559    /* Coerce the two arguments */
6560    sub = PyUnicode_FromObject(element);
6561    if (!sub) {
6562	PyErr_Format(PyExc_TypeError,
6563	    "'in <string>' requires string as left operand, not %s",
6564	    element->ob_type->tp_name);
6565        return -1;
6566    }
6567
6568    str = PyUnicode_FromObject(container);
6569    if (!str) {
6570        Py_DECREF(sub);
6571        return -1;
6572    }
6573
6574    result = stringlib_contains_obj(str, sub);
6575
6576    Py_DECREF(str);
6577    Py_DECREF(sub);
6578
6579    return result;
6580}
6581
6582/* Concat to string or Unicode object giving a new Unicode object. */
6583
6584PyObject *PyUnicode_Concat(PyObject *left,
6585			   PyObject *right)
6586{
6587    PyUnicodeObject *u = NULL, *v = NULL, *w;
6588
6589    /* Coerce the two arguments */
6590    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6591    if (u == NULL)
6592	goto onError;
6593    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6594    if (v == NULL)
6595	goto onError;
6596
6597    /* Shortcuts */
6598    if (v == unicode_empty) {
6599	Py_DECREF(v);
6600	return (PyObject *)u;
6601    }
6602    if (u == unicode_empty) {
6603	Py_DECREF(u);
6604	return (PyObject *)v;
6605    }
6606
6607    /* Concat the two Unicode strings */
6608    w = _PyUnicode_New(u->length + v->length);
6609    if (w == NULL)
6610	goto onError;
6611    Py_UNICODE_COPY(w->str, u->str, u->length);
6612    Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6613
6614    Py_DECREF(u);
6615    Py_DECREF(v);
6616    return (PyObject *)w;
6617
6618onError:
6619    Py_XDECREF(u);
6620    Py_XDECREF(v);
6621    return NULL;
6622}
6623
6624void
6625PyUnicode_Append(PyObject **pleft, PyObject *right)
6626{
6627	PyObject *new;
6628	if (*pleft == NULL)
6629		return;
6630	if (right == NULL || !PyUnicode_Check(*pleft)) {
6631		Py_DECREF(*pleft);
6632		*pleft = NULL;
6633		return;
6634	}
6635	new = PyUnicode_Concat(*pleft, right);
6636	Py_DECREF(*pleft);
6637	*pleft = new;
6638}
6639
6640void
6641PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
6642{
6643	PyUnicode_Append(pleft, right);
6644	Py_XDECREF(right);
6645}
6646
6647PyDoc_STRVAR(count__doc__,
6648"S.count(sub[, start[, end]]) -> int\n\
6649\n\
6650Return the number of non-overlapping occurrences of substring sub in\n\
6651Unicode string S[start:end].  Optional arguments start and end are\n\
6652interpreted as in slice notation.");
6653
6654static PyObject *
6655unicode_count(PyUnicodeObject *self, PyObject *args)
6656{
6657    PyUnicodeObject *substring;
6658    Py_ssize_t start = 0;
6659    Py_ssize_t end = PY_SSIZE_T_MAX;
6660    PyObject *result;
6661
6662    if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
6663		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6664        return NULL;
6665
6666    substring = (PyUnicodeObject *)PyUnicode_FromObject(
6667        (PyObject *)substring);
6668    if (substring == NULL)
6669	return NULL;
6670
6671    FIX_START_END(self);
6672
6673    result = PyLong_FromSsize_t(
6674        stringlib_count(self->str + start, end - start,
6675                        substring->str, substring->length)
6676        );
6677
6678    Py_DECREF(substring);
6679
6680    return result;
6681}
6682
6683PyDoc_STRVAR(encode__doc__,
6684"S.encode([encoding[, errors]]) -> bytes\n\
6685\n\
6686Encode S using the codec registered for encoding. encoding defaults\n\
6687to the default encoding. errors may be given to set a different error\n\
6688handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6689a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6690'xmlcharrefreplace' as well as any other name registered with\n\
6691codecs.register_error that can handle UnicodeEncodeErrors.");
6692
6693static PyObject *
6694unicode_encode(PyUnicodeObject *self, PyObject *args)
6695{
6696    char *encoding = NULL;
6697    char *errors = NULL;
6698    PyObject *v;
6699
6700    if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6701        return NULL;
6702    v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
6703    if (v == NULL)
6704        goto onError;
6705    if (!PyBytes_Check(v)) {
6706        PyErr_Format(PyExc_TypeError,
6707                     "encoder did not return a bytes object "
6708                     "(type=%.400s)",
6709                     Py_TYPE(v)->tp_name);
6710        Py_DECREF(v);
6711        return NULL;
6712    }
6713    return v;
6714
6715 onError:
6716    return NULL;
6717}
6718
6719PyDoc_STRVAR(expandtabs__doc__,
6720"S.expandtabs([tabsize]) -> str\n\
6721\n\
6722Return a copy of S where all tab characters are expanded using spaces.\n\
6723If tabsize is not given, a tab size of 8 characters is assumed.");
6724
6725static PyObject*
6726unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6727{
6728    Py_UNICODE *e;
6729    Py_UNICODE *p;
6730    Py_UNICODE *q;
6731    Py_UNICODE *qe;
6732    Py_ssize_t i, j, incr;
6733    PyUnicodeObject *u;
6734    int tabsize = 8;
6735
6736    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6737	return NULL;
6738
6739    /* First pass: determine size of output string */
6740    i = 0; /* chars up to and including most recent \n or \r */
6741    j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6742    e = self->str + self->length; /* end of input */
6743    for (p = self->str; p < e; p++)
6744        if (*p == '\t') {
6745	    if (tabsize > 0) {
6746		incr = tabsize - (j % tabsize); /* cannot overflow */
6747		if (j > PY_SSIZE_T_MAX - incr)
6748		    goto overflow1;
6749		j += incr;
6750            }
6751	}
6752        else {
6753	    if (j > PY_SSIZE_T_MAX - 1)
6754		goto overflow1;
6755            j++;
6756            if (*p == '\n' || *p == '\r') {
6757		if (i > PY_SSIZE_T_MAX - j)
6758		    goto overflow1;
6759                i += j;
6760                j = 0;
6761            }
6762        }
6763
6764    if (i > PY_SSIZE_T_MAX - j)
6765	goto overflow1;
6766
6767    /* Second pass: create output string and fill it */
6768    u = _PyUnicode_New(i + j);
6769    if (!u)
6770        return NULL;
6771
6772    j = 0; /* same as in first pass */
6773    q = u->str; /* next output char */
6774    qe = u->str + u->length; /* end of output */
6775
6776    for (p = self->str; p < e; p++)
6777        if (*p == '\t') {
6778	    if (tabsize > 0) {
6779		i = tabsize - (j % tabsize);
6780		j += i;
6781		while (i--) {
6782		    if (q >= qe)
6783			goto overflow2;
6784		    *q++ = ' ';
6785                }
6786	    }
6787	}
6788	else {
6789	    if (q >= qe)
6790		goto overflow2;
6791	    *q++ = *p;
6792            j++;
6793            if (*p == '\n' || *p == '\r')
6794                j = 0;
6795        }
6796
6797    return (PyObject*) u;
6798
6799  overflow2:
6800    Py_DECREF(u);
6801  overflow1:
6802    PyErr_SetString(PyExc_OverflowError, "new string is too long");
6803    return NULL;
6804}
6805
6806PyDoc_STRVAR(find__doc__,
6807"S.find(sub[, start[, end]]) -> int\n\
6808\n\
6809Return the lowest index in S where substring sub is found,\n\
6810such that sub is contained within s[start:end].  Optional\n\
6811arguments start and end are interpreted as in slice notation.\n\
6812\n\
6813Return -1 on failure.");
6814
6815static PyObject *
6816unicode_find(PyUnicodeObject *self, PyObject *args)
6817{
6818    PyObject *substring;
6819    Py_ssize_t start;
6820    Py_ssize_t end;
6821    Py_ssize_t result;
6822
6823    if (!_ParseTupleFinds(args, &substring, &start, &end))
6824        return NULL;
6825
6826    result = stringlib_find_slice(
6827        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6828        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6829        start, end
6830        );
6831
6832    Py_DECREF(substring);
6833
6834    return PyLong_FromSsize_t(result);
6835}
6836
6837static PyObject *
6838unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
6839{
6840    if (index < 0 || index >= self->length) {
6841        PyErr_SetString(PyExc_IndexError, "string index out of range");
6842        return NULL;
6843    }
6844
6845    return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6846}
6847
6848/* Believe it or not, this produces the same value for ASCII strings
6849   as string_hash(). */
6850static long
6851unicode_hash(PyUnicodeObject *self)
6852{
6853    Py_ssize_t len;
6854    Py_UNICODE *p;
6855    long x;
6856
6857    if (self->hash != -1)
6858        return self->hash;
6859    len = Py_SIZE(self);
6860    p = self->str;
6861    x = *p << 7;
6862    while (--len >= 0)
6863        x = (1000003*x) ^ *p++;
6864    x ^= Py_SIZE(self);
6865    if (x == -1)
6866        x = -2;
6867    self->hash = x;
6868    return x;
6869}
6870
6871PyDoc_STRVAR(index__doc__,
6872"S.index(sub[, start[, end]]) -> int\n\
6873\n\
6874Like S.find() but raise ValueError when the substring is not found.");
6875
6876static PyObject *
6877unicode_index(PyUnicodeObject *self, PyObject *args)
6878{
6879    Py_ssize_t result;
6880    PyObject *substring;
6881    Py_ssize_t start;
6882    Py_ssize_t end;
6883
6884    if (!_ParseTupleFinds(args, &substring, &start, &end))
6885        return NULL;
6886
6887    result = stringlib_find_slice(
6888        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6889        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6890        start, end
6891        );
6892
6893    Py_DECREF(substring);
6894
6895    if (result < 0) {
6896        PyErr_SetString(PyExc_ValueError, "substring not found");
6897        return NULL;
6898    }
6899
6900    return PyLong_FromSsize_t(result);
6901}
6902
6903PyDoc_STRVAR(islower__doc__,
6904"S.islower() -> bool\n\
6905\n\
6906Return True if all cased characters in S are lowercase and there is\n\
6907at least one cased character in S, False otherwise.");
6908
6909static PyObject*
6910unicode_islower(PyUnicodeObject *self)
6911{
6912    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6913    register const Py_UNICODE *e;
6914    int cased;
6915
6916    /* Shortcut for single character strings */
6917    if (PyUnicode_GET_SIZE(self) == 1)
6918	return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
6919
6920    /* Special case for empty strings */
6921    if (PyUnicode_GET_SIZE(self) == 0)
6922	return PyBool_FromLong(0);
6923
6924    e = p + PyUnicode_GET_SIZE(self);
6925    cased = 0;
6926    for (; p < e; p++) {
6927	register const Py_UNICODE ch = *p;
6928
6929	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6930	    return PyBool_FromLong(0);
6931	else if (!cased && Py_UNICODE_ISLOWER(ch))
6932	    cased = 1;
6933    }
6934    return PyBool_FromLong(cased);
6935}
6936
6937PyDoc_STRVAR(isupper__doc__,
6938"S.isupper() -> bool\n\
6939\n\
6940Return True if all cased characters in S are uppercase and there is\n\
6941at least one cased character in S, False otherwise.");
6942
6943static PyObject*
6944unicode_isupper(PyUnicodeObject *self)
6945{
6946    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6947    register const Py_UNICODE *e;
6948    int cased;
6949
6950    /* Shortcut for single character strings */
6951    if (PyUnicode_GET_SIZE(self) == 1)
6952	return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
6953
6954    /* Special case for empty strings */
6955    if (PyUnicode_GET_SIZE(self) == 0)
6956	return PyBool_FromLong(0);
6957
6958    e = p + PyUnicode_GET_SIZE(self);
6959    cased = 0;
6960    for (; p < e; p++) {
6961	register const Py_UNICODE ch = *p;
6962
6963	if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6964	    return PyBool_FromLong(0);
6965	else if (!cased && Py_UNICODE_ISUPPER(ch))
6966	    cased = 1;
6967    }
6968    return PyBool_FromLong(cased);
6969}
6970
6971PyDoc_STRVAR(istitle__doc__,
6972"S.istitle() -> bool\n\
6973\n\
6974Return True if S is a titlecased string and there is at least one\n\
6975character in S, i.e. upper- and titlecase characters may only\n\
6976follow uncased characters and lowercase characters only cased ones.\n\
6977Return False otherwise.");
6978
6979static PyObject*
6980unicode_istitle(PyUnicodeObject *self)
6981{
6982    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6983    register const Py_UNICODE *e;
6984    int cased, previous_is_cased;
6985
6986    /* Shortcut for single character strings */
6987    if (PyUnicode_GET_SIZE(self) == 1)
6988	return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6989			       (Py_UNICODE_ISUPPER(*p) != 0));
6990
6991    /* Special case for empty strings */
6992    if (PyUnicode_GET_SIZE(self) == 0)
6993	return PyBool_FromLong(0);
6994
6995    e = p + PyUnicode_GET_SIZE(self);
6996    cased = 0;
6997    previous_is_cased = 0;
6998    for (; p < e; p++) {
6999	register const Py_UNICODE ch = *p;
7000
7001	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7002	    if (previous_is_cased)
7003		return PyBool_FromLong(0);
7004	    previous_is_cased = 1;
7005	    cased = 1;
7006	}
7007	else if (Py_UNICODE_ISLOWER(ch)) {
7008	    if (!previous_is_cased)
7009		return PyBool_FromLong(0);
7010	    previous_is_cased = 1;
7011	    cased = 1;
7012	}
7013	else
7014	    previous_is_cased = 0;
7015    }
7016    return PyBool_FromLong(cased);
7017}
7018
7019PyDoc_STRVAR(isspace__doc__,
7020"S.isspace() -> bool\n\
7021\n\
7022Return True if all characters in S are whitespace\n\
7023and there is at least one character in S, False otherwise.");
7024
7025static PyObject*
7026unicode_isspace(PyUnicodeObject *self)
7027{
7028    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7029    register const Py_UNICODE *e;
7030
7031    /* Shortcut for single character strings */
7032    if (PyUnicode_GET_SIZE(self) == 1 &&
7033	Py_UNICODE_ISSPACE(*p))
7034	return PyBool_FromLong(1);
7035
7036    /* Special case for empty strings */
7037    if (PyUnicode_GET_SIZE(self) == 0)
7038	return PyBool_FromLong(0);
7039
7040    e = p + PyUnicode_GET_SIZE(self);
7041    for (; p < e; p++) {
7042	if (!Py_UNICODE_ISSPACE(*p))
7043	    return PyBool_FromLong(0);
7044    }
7045    return PyBool_FromLong(1);
7046}
7047
7048PyDoc_STRVAR(isalpha__doc__,
7049"S.isalpha() -> bool\n\
7050\n\
7051Return True if all characters in S are alphabetic\n\
7052and there is at least one character in S, False otherwise.");
7053
7054static PyObject*
7055unicode_isalpha(PyUnicodeObject *self)
7056{
7057    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7058    register const Py_UNICODE *e;
7059
7060    /* Shortcut for single character strings */
7061    if (PyUnicode_GET_SIZE(self) == 1 &&
7062	Py_UNICODE_ISALPHA(*p))
7063	return PyBool_FromLong(1);
7064
7065    /* Special case for empty strings */
7066    if (PyUnicode_GET_SIZE(self) == 0)
7067	return PyBool_FromLong(0);
7068
7069    e = p + PyUnicode_GET_SIZE(self);
7070    for (; p < e; p++) {
7071	if (!Py_UNICODE_ISALPHA(*p))
7072	    return PyBool_FromLong(0);
7073    }
7074    return PyBool_FromLong(1);
7075}
7076
7077PyDoc_STRVAR(isalnum__doc__,
7078"S.isalnum() -> bool\n\
7079\n\
7080Return True if all characters in S are alphanumeric\n\
7081and there is at least one character in S, False otherwise.");
7082
7083static PyObject*
7084unicode_isalnum(PyUnicodeObject *self)
7085{
7086    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7087    register const Py_UNICODE *e;
7088
7089    /* Shortcut for single character strings */
7090    if (PyUnicode_GET_SIZE(self) == 1 &&
7091	Py_UNICODE_ISALNUM(*p))
7092	return PyBool_FromLong(1);
7093
7094    /* Special case for empty strings */
7095    if (PyUnicode_GET_SIZE(self) == 0)
7096	return PyBool_FromLong(0);
7097
7098    e = p + PyUnicode_GET_SIZE(self);
7099    for (; p < e; p++) {
7100	if (!Py_UNICODE_ISALNUM(*p))
7101	    return PyBool_FromLong(0);
7102    }
7103    return PyBool_FromLong(1);
7104}
7105
7106PyDoc_STRVAR(isdecimal__doc__,
7107"S.isdecimal() -> bool\n\
7108\n\
7109Return True if there are only decimal characters in S,\n\
7110False otherwise.");
7111
7112static PyObject*
7113unicode_isdecimal(PyUnicodeObject *self)
7114{
7115    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7116    register const Py_UNICODE *e;
7117
7118    /* Shortcut for single character strings */
7119    if (PyUnicode_GET_SIZE(self) == 1 &&
7120	Py_UNICODE_ISDECIMAL(*p))
7121	return PyBool_FromLong(1);
7122
7123    /* Special case for empty strings */
7124    if (PyUnicode_GET_SIZE(self) == 0)
7125	return PyBool_FromLong(0);
7126
7127    e = p + PyUnicode_GET_SIZE(self);
7128    for (; p < e; p++) {
7129	if (!Py_UNICODE_ISDECIMAL(*p))
7130	    return PyBool_FromLong(0);
7131    }
7132    return PyBool_FromLong(1);
7133}
7134
7135PyDoc_STRVAR(isdigit__doc__,
7136"S.isdigit() -> bool\n\
7137\n\
7138Return True if all characters in S are digits\n\
7139and there is at least one character in S, False otherwise.");
7140
7141static PyObject*
7142unicode_isdigit(PyUnicodeObject *self)
7143{
7144    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7145    register const Py_UNICODE *e;
7146
7147    /* Shortcut for single character strings */
7148    if (PyUnicode_GET_SIZE(self) == 1 &&
7149	Py_UNICODE_ISDIGIT(*p))
7150	return PyBool_FromLong(1);
7151
7152    /* Special case for empty strings */
7153    if (PyUnicode_GET_SIZE(self) == 0)
7154	return PyBool_FromLong(0);
7155
7156    e = p + PyUnicode_GET_SIZE(self);
7157    for (; p < e; p++) {
7158	if (!Py_UNICODE_ISDIGIT(*p))
7159	    return PyBool_FromLong(0);
7160    }
7161    return PyBool_FromLong(1);
7162}
7163
7164PyDoc_STRVAR(isnumeric__doc__,
7165"S.isnumeric() -> bool\n\
7166\n\
7167Return True if there are only numeric characters in S,\n\
7168False otherwise.");
7169
7170static PyObject*
7171unicode_isnumeric(PyUnicodeObject *self)
7172{
7173    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7174    register const Py_UNICODE *e;
7175
7176    /* Shortcut for single character strings */
7177    if (PyUnicode_GET_SIZE(self) == 1 &&
7178	Py_UNICODE_ISNUMERIC(*p))
7179	return PyBool_FromLong(1);
7180
7181    /* Special case for empty strings */
7182    if (PyUnicode_GET_SIZE(self) == 0)
7183	return PyBool_FromLong(0);
7184
7185    e = p + PyUnicode_GET_SIZE(self);
7186    for (; p < e; p++) {
7187	if (!Py_UNICODE_ISNUMERIC(*p))
7188	    return PyBool_FromLong(0);
7189    }
7190    return PyBool_FromLong(1);
7191}
7192
7193int
7194PyUnicode_IsIdentifier(PyObject *self)
7195{
7196    register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7197    register const Py_UNICODE *e;
7198
7199    /* Special case for empty strings */
7200    if (PyUnicode_GET_SIZE(self) == 0)
7201	return 0;
7202
7203    /* PEP 3131 says that the first character must be in
7204       XID_Start and subsequent characters in XID_Continue,
7205       and for the ASCII range, the 2.x rules apply (i.e
7206       start with letters and underscore, continue with
7207       letters, digits, underscore). However, given the current
7208       definition of XID_Start and XID_Continue, it is sufficient
7209       to check just for these, except that _ must be allowed
7210       as starting an identifier.  */
7211    if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7212        return 0;
7213
7214    e = p + PyUnicode_GET_SIZE(self);
7215    for (p++; p < e; p++) {
7216	if (!_PyUnicode_IsXidContinue(*p))
7217	    return 0;
7218    }
7219    return 1;
7220}
7221
7222PyDoc_STRVAR(isidentifier__doc__,
7223"S.isidentifier() -> bool\n\
7224\n\
7225Return True if S is a valid identifier according\n\
7226to the language definition.");
7227
7228static PyObject*
7229unicode_isidentifier(PyObject *self)
7230{
7231    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7232}
7233
7234PyDoc_STRVAR(join__doc__,
7235"S.join(sequence) -> str\n\
7236\n\
7237Return a string which is the concatenation of the strings in the\n\
7238sequence.  The separator between elements is S.");
7239
7240static PyObject*
7241unicode_join(PyObject *self, PyObject *data)
7242{
7243    return PyUnicode_Join(self, data);
7244}
7245
7246static Py_ssize_t
7247unicode_length(PyUnicodeObject *self)
7248{
7249    return self->length;
7250}
7251
7252PyDoc_STRVAR(ljust__doc__,
7253"S.ljust(width[, fillchar]) -> str\n\
7254\n\
7255Return S left justified in a Unicode string of length width. Padding is\n\
7256done using the specified fill character (default is a space).");
7257
7258static PyObject *
7259unicode_ljust(PyUnicodeObject *self, PyObject *args)
7260{
7261    Py_ssize_t width;
7262    Py_UNICODE fillchar = ' ';
7263
7264    if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
7265        return NULL;
7266
7267    if (self->length >= width && PyUnicode_CheckExact(self)) {
7268        Py_INCREF(self);
7269        return (PyObject*) self;
7270    }
7271
7272    return (PyObject*) pad(self, 0, width - self->length, fillchar);
7273}
7274
7275PyDoc_STRVAR(lower__doc__,
7276"S.lower() -> str\n\
7277\n\
7278Return a copy of the string S converted to lowercase.");
7279
7280static PyObject*
7281unicode_lower(PyUnicodeObject *self)
7282{
7283    return fixup(self, fixlower);
7284}
7285
/* Strip modes; they double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by strip mode */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip mode: the format string minus its 3-character
   "|O:" prefix */
#define STRIPNAME(i) (stripformat[(i)] + 3)
7294
7295/* externally visible for str.strip(unicode) */
7296PyObject *
7297_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7298{
7299	Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7300	Py_ssize_t len = PyUnicode_GET_SIZE(self);
7301	Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7302	Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7303	Py_ssize_t i, j;
7304
7305        BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7306
7307	i = 0;
7308	if (striptype != RIGHTSTRIP) {
7309            while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7310                i++;
7311            }
7312	}
7313
7314	j = len;
7315	if (striptype != LEFTSTRIP) {
7316            do {
7317                j--;
7318            } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7319            j++;
7320	}
7321
7322	if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7323            Py_INCREF(self);
7324            return (PyObject*)self;
7325	}
7326	else
7327            return PyUnicode_FromUnicode(s+i, j-i);
7328}
7329
7330
7331static PyObject *
7332do_strip(PyUnicodeObject *self, int striptype)
7333{
7334	Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7335	Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
7336
7337	i = 0;
7338	if (striptype != RIGHTSTRIP) {
7339		while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7340			i++;
7341		}
7342	}
7343
7344	j = len;
7345	if (striptype != LEFTSTRIP) {
7346		do {
7347			j--;
7348		} while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7349		j++;
7350	}
7351
7352	if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7353		Py_INCREF(self);
7354		return (PyObject*)self;
7355	}
7356	else
7357		return PyUnicode_FromUnicode(s+i, j-i);
7358}
7359
7360
7361static PyObject *
7362do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7363{
7364	PyObject *sep = NULL;
7365
7366	if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7367		return NULL;
7368
7369	if (sep != NULL && sep != Py_None) {
7370		if (PyUnicode_Check(sep))
7371			return _PyUnicode_XStrip(self, striptype, sep);
7372		else {
7373			PyErr_Format(PyExc_TypeError,
7374				     "%s arg must be None, unicode or str",
7375				     STRIPNAME(striptype));
7376			return NULL;
7377		}
7378	}
7379
7380	return do_strip(self, striptype);
7381}
7382
7383
7384PyDoc_STRVAR(strip__doc__,
7385"S.strip([chars]) -> str\n\
7386\n\
7387Return a copy of the string S with leading and trailing\n\
7388whitespace removed.\n\
7389If chars is given and not None, remove characters in chars instead.\n\
7390If chars is a str, it will be converted to unicode before stripping");
7391
7392static PyObject *
7393unicode_strip(PyUnicodeObject *self, PyObject *args)
7394{
7395	if (PyTuple_GET_SIZE(args) == 0)
7396		return do_strip(self, BOTHSTRIP); /* Common case */
7397	else
7398		return do_argstrip(self, BOTHSTRIP, args);
7399}
7400
7401
7402PyDoc_STRVAR(lstrip__doc__,
7403"S.lstrip([chars]) -> str\n\
7404\n\
7405Return a copy of the string S with leading whitespace removed.\n\
7406If chars is given and not None, remove characters in chars instead.\n\
7407If chars is a str, it will be converted to unicode before stripping");
7408
7409static PyObject *
7410unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7411{
7412	if (PyTuple_GET_SIZE(args) == 0)
7413		return do_strip(self, LEFTSTRIP); /* Common case */
7414	else
7415		return do_argstrip(self, LEFTSTRIP, args);
7416}
7417
7418
7419PyDoc_STRVAR(rstrip__doc__,
7420"S.rstrip([chars]) -> str\n\
7421\n\
7422Return a copy of the string S with trailing whitespace removed.\n\
7423If chars is given and not None, remove characters in chars instead.\n\
7424If chars is a str, it will be converted to unicode before stripping");
7425
7426static PyObject *
7427unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7428{
7429	if (PyTuple_GET_SIZE(args) == 0)
7430		return do_strip(self, RIGHTSTRIP); /* Common case */
7431	else
7432		return do_argstrip(self, RIGHTSTRIP, args);
7433}
7434
7435
7436static PyObject*
7437unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
7438{
7439    PyUnicodeObject *u;
7440    Py_UNICODE *p;
7441    Py_ssize_t nchars;
7442    size_t nbytes;
7443
7444    if (len < 0)
7445        len = 0;
7446
7447    if (len == 1 && PyUnicode_CheckExact(str)) {
7448        /* no repeat, return original string */
7449        Py_INCREF(str);
7450        return (PyObject*) str;
7451    }
7452
7453    /* ensure # of chars needed doesn't overflow int and # of bytes
7454     * needed doesn't overflow size_t
7455     */
7456    nchars = len * str->length;
7457    if (len && nchars / len != str->length) {
7458        PyErr_SetString(PyExc_OverflowError,
7459                        "repeated string is too long");
7460        return NULL;
7461    }
7462    nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7463    if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7464        PyErr_SetString(PyExc_OverflowError,
7465                        "repeated string is too long");
7466        return NULL;
7467    }
7468    u = _PyUnicode_New(nchars);
7469    if (!u)
7470        return NULL;
7471
7472    p = u->str;
7473
7474    if (str->length == 1 && len > 0) {
7475        Py_UNICODE_FILL(p, str->str[0], len);
7476    } else {
7477	Py_ssize_t done = 0; /* number of characters copied this far */
7478	if (done < nchars) {
7479            Py_UNICODE_COPY(p, str->str, str->length);
7480            done = str->length;
7481	}
7482	while (done < nchars) {
7483            Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
7484            Py_UNICODE_COPY(p+done, p, n);
7485            done += n;
7486	}
7487    }
7488
7489    return (PyObject*) u;
7490}
7491
7492PyObject *PyUnicode_Replace(PyObject *obj,
7493			    PyObject *subobj,
7494			    PyObject *replobj,
7495			    Py_ssize_t maxcount)
7496{
7497    PyObject *self;
7498    PyObject *str1;
7499    PyObject *str2;
7500    PyObject *result;
7501
7502    self = PyUnicode_FromObject(obj);
7503    if (self == NULL)
7504	return NULL;
7505    str1 = PyUnicode_FromObject(subobj);
7506    if (str1 == NULL) {
7507	Py_DECREF(self);
7508	return NULL;
7509    }
7510    str2 = PyUnicode_FromObject(replobj);
7511    if (str2 == NULL) {
7512	Py_DECREF(self);
7513	Py_DECREF(str1);
7514	return NULL;
7515    }
7516    result = replace((PyUnicodeObject *)self,
7517		     (PyUnicodeObject *)str1,
7518		     (PyUnicodeObject *)str2,
7519		     maxcount);
7520    Py_DECREF(self);
7521    Py_DECREF(str1);
7522    Py_DECREF(str2);
7523    return result;
7524}
7525
7526PyDoc_STRVAR(replace__doc__,
7527"S.replace (old, new[, count]) -> str\n\
7528\n\
7529Return a copy of S with all occurrences of substring\n\
7530old replaced by new.  If the optional argument count is\n\
7531given, only the first count occurrences are replaced.");
7532
7533static PyObject*
7534unicode_replace(PyUnicodeObject *self, PyObject *args)
7535{
7536    PyUnicodeObject *str1;
7537    PyUnicodeObject *str2;
7538    Py_ssize_t maxcount = -1;
7539    PyObject *result;
7540
7541    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
7542        return NULL;
7543    str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7544    if (str1 == NULL)
7545	return NULL;
7546    str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
7547    if (str2 == NULL) {
7548	Py_DECREF(str1);
7549	return NULL;
7550    }
7551
7552    result = replace(self, str1, str2, maxcount);
7553
7554    Py_DECREF(str1);
7555    Py_DECREF(str2);
7556    return result;
7557}
7558
7559static
7560PyObject *unicode_repr(PyObject *unicode)
7561{
7562    PyObject *repr;
7563    Py_UNICODE *p;
7564    Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
7565    Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
7566
7567    /* XXX(nnorwitz): rather than over-allocating, it would be
7568       better to choose a different scheme.  Perhaps scan the
7569       first N-chars of the string and allocate based on that size.
7570    */
7571    /* Initial allocation is based on the longest-possible unichr
7572       escape.
7573
7574       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
7575       unichr, so in this case it's the longest unichr escape. In
7576       narrow (UTF-16) builds this is five chars per source unichr
7577       since there are two unichrs in the surrogate pair, so in narrow
7578       (UTF-16) builds it's not the longest unichr escape.
7579
7580       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
7581       so in the narrow (UTF-16) build case it's the longest unichr
7582       escape.
7583    */
7584
7585    repr = PyUnicode_FromUnicode(NULL,
7586        2 /* quotes */
7587#ifdef Py_UNICODE_WIDE
7588        + 10*size
7589#else
7590        + 6*size
7591#endif
7592        + 1);
7593    if (repr == NULL)
7594        return NULL;
7595
7596    p = PyUnicode_AS_UNICODE(repr);
7597
7598    /* Add quote */
7599    *p++ = (findchar(s, size, '\'') &&
7600            !findchar(s, size, '"')) ? '"' : '\'';
7601    while (size-- > 0) {
7602        Py_UNICODE ch = *s++;
7603
7604        /* Escape quotes and backslashes */
7605        if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
7606            *p++ = '\\';
7607            *p++ = ch;
7608            continue;
7609        }
7610
7611#ifdef Py_UNICODE_WIDE
7612        /* Map 21-bit characters to '\U00xxxxxx' */
7613        else if (ch >= 0x10000) {
7614            *p++ = '\\';
7615            *p++ = 'U';
7616            *p++ = hexdigits[(ch >> 28) & 0x0000000F];
7617            *p++ = hexdigits[(ch >> 24) & 0x0000000F];
7618            *p++ = hexdigits[(ch >> 20) & 0x0000000F];
7619            *p++ = hexdigits[(ch >> 16) & 0x0000000F];
7620            *p++ = hexdigits[(ch >> 12) & 0x0000000F];
7621            *p++ = hexdigits[(ch >> 8) & 0x0000000F];
7622            *p++ = hexdigits[(ch >> 4) & 0x0000000F];
7623            *p++ = hexdigits[ch & 0x0000000F];
7624	    continue;
7625        }
7626#else
7627	/* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
7628	else if (ch >= 0xD800 && ch < 0xDC00) {
7629	    Py_UNICODE ch2;
7630	    Py_UCS4 ucs;
7631
7632	    ch2 = *s++;
7633	    size--;
7634	    if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
7635		ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
7636		*p++ = '\\';
7637		*p++ = 'U';
7638		*p++ = hexdigits[(ucs >> 28) & 0x0000000F];
7639		*p++ = hexdigits[(ucs >> 24) & 0x0000000F];
7640		*p++ = hexdigits[(ucs >> 20) & 0x0000000F];
7641		*p++ = hexdigits[(ucs >> 16) & 0x0000000F];
7642		*p++ = hexdigits[(ucs >> 12) & 0x0000000F];
7643		*p++ = hexdigits[(ucs >> 8) & 0x0000000F];
7644		*p++ = hexdigits[(ucs >> 4) & 0x0000000F];
7645		*p++ = hexdigits[ucs & 0x0000000F];
7646		continue;
7647	    }
7648	    /* Fall through: isolated surrogates are copied as-is */
7649	    s--;
7650	    size++;
7651	}
7652#endif
7653
7654        /* Map 16-bit characters to '\uxxxx' */
7655        if (ch >= 256) {
7656            *p++ = '\\';
7657            *p++ = 'u';
7658            *p++ = hexdigits[(ch >> 12) & 0x000F];
7659            *p++ = hexdigits[(ch >> 8) & 0x000F];
7660            *p++ = hexdigits[(ch >> 4) & 0x000F];
7661            *p++ = hexdigits[ch & 0x000F];
7662        }
7663
7664        /* Map special whitespace to '\t', \n', '\r' */
7665        else if (ch == '\t') {
7666            *p++ = '\\';
7667            *p++ = 't';
7668        }
7669        else if (ch == '\n') {
7670            *p++ = '\\';
7671            *p++ = 'n';
7672        }
7673        else if (ch == '\r') {
7674            *p++ = '\\';
7675            *p++ = 'r';
7676        }
7677
7678        /* Map non-printable US ASCII to '\xhh' */
7679        else if (ch < ' ' || ch >= 0x7F) {
7680            *p++ = '\\';
7681            *p++ = 'x';
7682            *p++ = hexdigits[(ch >> 4) & 0x000F];
7683            *p++ = hexdigits[ch & 0x000F];
7684        }
7685
7686        /* Copy everything else as-is */
7687        else
7688            *p++ = (char) ch;
7689    }
7690    /* Add quote */
7691    *p++ = PyUnicode_AS_UNICODE(repr)[0];
7692
7693    *p = '\0';
7694    _PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
7695    return repr;
7696}
7697
7698PyDoc_STRVAR(rfind__doc__,
7699"S.rfind(sub[, start[, end]]) -> int\n\
7700\n\
7701Return the highest index in S where substring sub is found,\n\
7702such that sub is contained within s[start:end].  Optional\n\
7703arguments start and end are interpreted as in slice notation.\n\
7704\n\
7705Return -1 on failure.");
7706
7707static PyObject *
7708unicode_rfind(PyUnicodeObject *self, PyObject *args)
7709{
7710    PyObject *substring;
7711    Py_ssize_t start;
7712    Py_ssize_t end;
7713    Py_ssize_t result;
7714
7715    if (!_ParseTupleFinds(args, &substring, &start, &end))
7716	    return NULL;
7717
7718    result = stringlib_rfind_slice(
7719        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7720        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7721        start, end
7722        );
7723
7724    Py_DECREF(substring);
7725
7726    return PyLong_FromSsize_t(result);
7727}
7728
7729PyDoc_STRVAR(rindex__doc__,
7730"S.rindex(sub[, start[, end]]) -> int\n\
7731\n\
7732Like S.rfind() but raise ValueError when the substring is not found.");
7733
7734static PyObject *
7735unicode_rindex(PyUnicodeObject *self, PyObject *args)
7736{
7737    PyObject *substring;
7738    Py_ssize_t start;
7739    Py_ssize_t end;
7740    Py_ssize_t result;
7741
7742    if (!_ParseTupleFinds(args, &substring, &start, &end))
7743	    return NULL;
7744
7745    result = stringlib_rfind_slice(
7746        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7747        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7748        start, end
7749        );
7750
7751    Py_DECREF(substring);
7752
7753    if (result < 0) {
7754        PyErr_SetString(PyExc_ValueError, "substring not found");
7755        return NULL;
7756    }
7757    return PyLong_FromSsize_t(result);
7758}
7759
7760PyDoc_STRVAR(rjust__doc__,
7761"S.rjust(width[, fillchar]) -> str\n\
7762\n\
7763Return S right justified in a Unicode string of length width. Padding is\n\
7764done using the specified fill character (default is a space).");
7765
7766static PyObject *
7767unicode_rjust(PyUnicodeObject *self, PyObject *args)
7768{
7769    Py_ssize_t width;
7770    Py_UNICODE fillchar = ' ';
7771
7772    if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
7773        return NULL;
7774
7775    if (self->length >= width && PyUnicode_CheckExact(self)) {
7776        Py_INCREF(self);
7777        return (PyObject*) self;
7778    }
7779
7780    return (PyObject*) pad(self, width - self->length, 0, fillchar);
7781}
7782
7783PyObject *PyUnicode_Split(PyObject *s,
7784			  PyObject *sep,
7785			  Py_ssize_t maxsplit)
7786{
7787    PyObject *result;
7788
7789    s = PyUnicode_FromObject(s);
7790    if (s == NULL)
7791	return NULL;
7792    if (sep != NULL) {
7793	sep = PyUnicode_FromObject(sep);
7794	if (sep == NULL) {
7795	    Py_DECREF(s);
7796	    return NULL;
7797	}
7798    }
7799
7800    result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7801
7802    Py_DECREF(s);
7803    Py_XDECREF(sep);
7804    return result;
7805}
7806
7807PyDoc_STRVAR(split__doc__,
7808"S.split([sep[, maxsplit]]) -> list of strings\n\
7809\n\
7810Return a list of the words in S, using sep as the\n\
7811delimiter string.  If maxsplit is given, at most maxsplit\n\
7812splits are done. If sep is not specified or is None, any\n\
7813whitespace string is a separator and empty strings are\n\
7814removed from the result.");
7815
7816static PyObject*
7817unicode_split(PyUnicodeObject *self, PyObject *args)
7818{
7819    PyObject *substring = Py_None;
7820    Py_ssize_t maxcount = -1;
7821
7822    if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
7823        return NULL;
7824
7825    if (substring == Py_None)
7826	return split(self, NULL, maxcount);
7827    else if (PyUnicode_Check(substring))
7828	return split(self, (PyUnicodeObject *)substring, maxcount);
7829    else
7830	return PyUnicode_Split((PyObject *)self, substring, maxcount);
7831}
7832
7833PyObject *
7834PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7835{
7836    PyObject* str_obj;
7837    PyObject* sep_obj;
7838    PyObject* out;
7839
7840    str_obj = PyUnicode_FromObject(str_in);
7841    if (!str_obj)
7842	return NULL;
7843    sep_obj = PyUnicode_FromObject(sep_in);
7844    if (!sep_obj) {
7845        Py_DECREF(str_obj);
7846        return NULL;
7847    }
7848
7849    out = stringlib_partition(
7850        str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7851        sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7852        );
7853
7854    Py_DECREF(sep_obj);
7855    Py_DECREF(str_obj);
7856
7857    return out;
7858}
7859
7860
7861PyObject *
7862PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7863{
7864    PyObject* str_obj;
7865    PyObject* sep_obj;
7866    PyObject* out;
7867
7868    str_obj = PyUnicode_FromObject(str_in);
7869    if (!str_obj)
7870	return NULL;
7871    sep_obj = PyUnicode_FromObject(sep_in);
7872    if (!sep_obj) {
7873        Py_DECREF(str_obj);
7874        return NULL;
7875    }
7876
7877    out = stringlib_rpartition(
7878        str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7879        sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7880        );
7881
7882    Py_DECREF(sep_obj);
7883    Py_DECREF(str_obj);
7884
7885    return out;
7886}
7887
7888PyDoc_STRVAR(partition__doc__,
7889"S.partition(sep) -> (head, sep, tail)\n\
7890\n\
7891Search for the separator sep in S, and return the part before it,\n\
7892the separator itself, and the part after it.  If the separator is not\n\
7893found, returns S and two empty strings.");
7894
7895static PyObject*
7896unicode_partition(PyUnicodeObject *self, PyObject *separator)
7897{
7898    return PyUnicode_Partition((PyObject *)self, separator);
7899}
7900
7901PyDoc_STRVAR(rpartition__doc__,
7902"S.rpartition(sep) -> (tail, sep, head)\n\
7903\n\
7904Search for the separator sep in S, starting at the end of S, and return\n\
7905the part before it, the separator itself, and the part after it.  If the\n\
7906separator is not found, returns two empty strings and S.");
7907
7908static PyObject*
7909unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7910{
7911    return PyUnicode_RPartition((PyObject *)self, separator);
7912}
7913
7914PyObject *PyUnicode_RSplit(PyObject *s,
7915			   PyObject *sep,
7916			   Py_ssize_t maxsplit)
7917{
7918    PyObject *result;
7919
7920    s = PyUnicode_FromObject(s);
7921    if (s == NULL)
7922	return NULL;
7923    if (sep != NULL) {
7924	sep = PyUnicode_FromObject(sep);
7925	if (sep == NULL) {
7926	    Py_DECREF(s);
7927	    return NULL;
7928	}
7929    }
7930
7931    result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7932
7933    Py_DECREF(s);
7934    Py_XDECREF(sep);
7935    return result;
7936}
7937
7938PyDoc_STRVAR(rsplit__doc__,
7939"S.rsplit([sep[, maxsplit]]) -> list of strings\n\
7940\n\
7941Return a list of the words in S, using sep as the\n\
7942delimiter string, starting at the end of the string and\n\
7943working to the front.  If maxsplit is given, at most maxsplit\n\
7944splits are done. If sep is not specified, any whitespace string\n\
7945is a separator.");
7946
7947static PyObject*
7948unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7949{
7950    PyObject *substring = Py_None;
7951    Py_ssize_t maxcount = -1;
7952
7953    if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
7954        return NULL;
7955
7956    if (substring == Py_None)
7957	return rsplit(self, NULL, maxcount);
7958    else if (PyUnicode_Check(substring))
7959	return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7960    else
7961	return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7962}
7963
7964PyDoc_STRVAR(splitlines__doc__,
7965"S.splitlines([keepends]]) -> list of strings\n\
7966\n\
7967Return a list of the lines in S, breaking at line boundaries.\n\
7968Line breaks are not included in the resulting list unless keepends\n\
7969is given and true.");
7970
7971static PyObject*
7972unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7973{
7974    int keepends = 0;
7975
7976    if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
7977        return NULL;
7978
7979    return PyUnicode_Splitlines((PyObject *)self, keepends);
7980}
7981
7982static
7983PyObject *unicode_str(PyObject *self)
7984{
7985    if (PyUnicode_CheckExact(self)) {
7986        Py_INCREF(self);
7987        return self;
7988    } else
7989        /* Subtype -- return genuine unicode string with the same value. */
7990        return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
7991                                     PyUnicode_GET_SIZE(self));
7992}
7993
7994PyDoc_STRVAR(swapcase__doc__,
7995"S.swapcase() -> str\n\
7996\n\
7997Return a copy of S with uppercase characters converted to lowercase\n\
7998and vice versa.");
7999
8000static PyObject*
8001unicode_swapcase(PyUnicodeObject *self)
8002{
8003    return fixup(self, fixswapcase);
8004}
8005
8006PyDoc_STRVAR(maketrans__doc__,
8007"str.maketrans(x[, y[, z]]) -> dict (static method)\n\
8008\n\
8009Return a translation table usable for str.translate().\n\
8010If there is only one argument, it must be a dictionary mapping Unicode\n\
8011ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
8012Character keys will then be converted to ordinals.\n\
8013If there are two arguments, they must be strings of equal length, and\n\
8014in the resulting dictionary, each character in x will be mapped to the\n\
8015character at the same position in y. If there is a third argument, it\n\
8016must be a string, whose characters will be mapped to None in the result.");
8017
8018static PyObject*
8019unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8020{
8021    PyObject *x, *y = NULL, *z = NULL;
8022    PyObject *new = NULL, *key, *value;
8023    Py_ssize_t i = 0;
8024    int res;
8025
8026    if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8027        return NULL;
8028    new = PyDict_New();
8029    if (!new)
8030        return NULL;
8031    if (y != NULL) {
8032        /* x must be a string too, of equal length */
8033        Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8034        if (!PyUnicode_Check(x)) {
8035            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8036                            "be a string if there is a second argument");
8037            goto err;
8038        }
8039        if (PyUnicode_GET_SIZE(x) != ylen) {
8040            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8041                            "arguments must have equal length");
8042            goto err;
8043        }
8044        /* create entries for translating chars in x to those in y */
8045        for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
8046            key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8047            value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
8048            if (!key || !value)
8049                goto err;
8050            res = PyDict_SetItem(new, key, value);
8051            Py_DECREF(key);
8052            Py_DECREF(value);
8053            if (res < 0)
8054                goto err;
8055        }
8056        /* create entries for deleting chars in z */
8057        if (z != NULL) {
8058            for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
8059                key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
8060                if (!key)
8061                    goto err;
8062                res = PyDict_SetItem(new, key, Py_None);
8063                Py_DECREF(key);
8064                if (res < 0)
8065                    goto err;
8066            }
8067        }
8068    } else {
8069        /* x must be a dict */
8070        if (!PyDict_Check(x)) {
8071            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8072                            "to maketrans it must be a dict");
8073            goto err;
8074        }
8075        /* copy entries into the new dict, converting string keys to int keys */
8076        while (PyDict_Next(x, &i, &key, &value)) {
8077            if (PyUnicode_Check(key)) {
8078                /* convert string keys to integer keys */
8079                PyObject *newkey;
8080                if (PyUnicode_GET_SIZE(key) != 1) {
8081                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
8082                                    "table must be of length 1");
8083                    goto err;
8084                }
8085                newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
8086                if (!newkey)
8087                    goto err;
8088                res = PyDict_SetItem(new, newkey, value);
8089                Py_DECREF(newkey);
8090                if (res < 0)
8091                    goto err;
8092            } else if (PyLong_Check(key)) {
8093                /* just keep integer keys */
8094                if (PyDict_SetItem(new, key, value) < 0)
8095                    goto err;
8096            } else {
8097                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8098                                "be strings or integers");
8099                goto err;
8100            }
8101        }
8102    }
8103    return new;
8104  err:
8105    Py_DECREF(new);
8106    return NULL;
8107}
8108
8109PyDoc_STRVAR(translate__doc__,
8110"S.translate(table) -> str\n\
8111\n\
8112Return a copy of the string S, where all characters have been mapped\n\
8113through the given translation table, which must be a mapping of\n\
8114Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
8115Unmapped characters are left untouched. Characters mapped to None\n\
8116are deleted.");
8117
8118static PyObject*
8119unicode_translate(PyUnicodeObject *self, PyObject *table)
8120{
8121    return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
8122}
8123
8124PyDoc_STRVAR(upper__doc__,
8125"S.upper() -> str\n\
8126\n\
8127Return a copy of S converted to uppercase.");
8128
8129static PyObject*
8130unicode_upper(PyUnicodeObject *self)
8131{
8132    return fixup(self, fixupper);
8133}
8134
8135PyDoc_STRVAR(zfill__doc__,
8136"S.zfill(width) -> str\n\
8137\n\
8138Pad a numeric string x with zeros on the left, to fill a field\n\
8139of the specified width. The string x is never truncated.");
8140
8141static PyObject *
8142unicode_zfill(PyUnicodeObject *self, PyObject *args)
8143{
8144    Py_ssize_t fill;
8145    PyUnicodeObject *u;
8146
8147    Py_ssize_t width;
8148    if (!PyArg_ParseTuple(args, "n:zfill", &width))
8149        return NULL;
8150
8151    if (self->length >= width) {
8152        if (PyUnicode_CheckExact(self)) {
8153            Py_INCREF(self);
8154            return (PyObject*) self;
8155        }
8156        else
8157            return PyUnicode_FromUnicode(
8158                PyUnicode_AS_UNICODE(self),
8159                PyUnicode_GET_SIZE(self)
8160            );
8161    }
8162
8163    fill = width - self->length;
8164
8165    u = pad(self, fill, 0, '0');
8166
8167    if (u == NULL)
8168        return NULL;
8169
8170    if (u->str[fill] == '+' || u->str[fill] == '-') {
8171        /* move sign to beginning of string */
8172        u->str[0] = u->str[fill];
8173        u->str[fill] = '0';
8174    }
8175
8176    return (PyObject*) u;
8177}
8178
/* Compiled out by the "#if 0" guard.  When enabled, returns the current
   value of `numfree` as a Python int.
   NOTE(review): numfree is presumably the count of objects on the unicode
   free list -- confirm against its definition. */
#if 0
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyLong_FromLong(numfree);
}
#endif
8186
8187PyDoc_STRVAR(startswith__doc__,
8188"S.startswith(prefix[, start[, end]]) -> bool\n\
8189\n\
8190Return True if S starts with the specified prefix, False otherwise.\n\
8191With optional start, test S beginning at that position.\n\
8192With optional end, stop comparing S at that position.\n\
8193prefix can also be a tuple of strings to try.");
8194
8195static PyObject *
8196unicode_startswith(PyUnicodeObject *self,
8197		   PyObject *args)
8198{
8199    PyObject *subobj;
8200    PyUnicodeObject *substring;
8201    Py_ssize_t start = 0;
8202    Py_ssize_t end = PY_SSIZE_T_MAX;
8203    int result;
8204
8205    if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
8206		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8207	return NULL;
8208    if (PyTuple_Check(subobj)) {
8209        Py_ssize_t i;
8210        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8211            substring = (PyUnicodeObject *)PyUnicode_FromObject(
8212                            PyTuple_GET_ITEM(subobj, i));
8213            if (substring == NULL)
8214                return NULL;
8215            result = tailmatch(self, substring, start, end, -1);
8216            Py_DECREF(substring);
8217            if (result) {
8218                Py_RETURN_TRUE;
8219            }
8220        }
8221        /* nothing matched */
8222        Py_RETURN_FALSE;
8223    }
8224    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
8225    if (substring == NULL)
8226         return NULL;
8227    result = tailmatch(self, substring, start, end, -1);
8228    Py_DECREF(substring);
8229    return PyBool_FromLong(result);
8230}
8231
8232
8233PyDoc_STRVAR(endswith__doc__,
8234"S.endswith(suffix[, start[, end]]) -> bool\n\
8235\n\
8236Return True if S ends with the specified suffix, False otherwise.\n\
8237With optional start, test S beginning at that position.\n\
8238With optional end, stop comparing S at that position.\n\
8239suffix can also be a tuple of strings to try.");
8240
8241static PyObject *
8242unicode_endswith(PyUnicodeObject *self,
8243		 PyObject *args)
8244{
8245    PyObject *subobj;
8246    PyUnicodeObject *substring;
8247    Py_ssize_t start = 0;
8248    Py_ssize_t end = PY_SSIZE_T_MAX;
8249    int result;
8250
8251    if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
8252        _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8253	return NULL;
8254    if (PyTuple_Check(subobj)) {
8255        Py_ssize_t i;
8256        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8257            substring = (PyUnicodeObject *)PyUnicode_FromObject(
8258                            PyTuple_GET_ITEM(subobj, i));
8259            if (substring == NULL)
8260            return NULL;
8261            result = tailmatch(self, substring, start, end, +1);
8262            Py_DECREF(substring);
8263            if (result) {
8264                Py_RETURN_TRUE;
8265            }
8266        }
8267        Py_RETURN_FALSE;
8268    }
8269    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
8270    if (substring == NULL)
8271    return NULL;
8272
8273    result = tailmatch(self, substring, start, end, +1);
8274    Py_DECREF(substring);
8275    return PyBool_FromLong(result);
8276}
8277
8278#include "stringlib/string_format.h"
8279
8280PyDoc_STRVAR(format__doc__,
8281"S.format(*args, **kwargs) -> str\n\
8282\n\
8283");
8284
8285static PyObject *
8286unicode__format__(PyObject* self, PyObject* args)
8287{
8288    PyObject *format_spec;
8289
8290    if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8291        return NULL;
8292
8293    return _PyUnicode_FormatAdvanced(self,
8294                                     PyUnicode_AS_UNICODE(format_spec),
8295                                     PyUnicode_GET_SIZE(format_spec));
8296}
8297
8298PyDoc_STRVAR(p_format__doc__,
8299"S.__format__(format_spec) -> str\n\
8300\n\
8301");
8302
8303static PyObject *
8304unicode__sizeof__(PyUnicodeObject *v)
8305{
8306    PyObject *res = NULL, *defsize = NULL;
8307
8308    res = PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8309                             sizeof(Py_UNICODE) * (v->length + 1));
8310    if (v->defenc) {
8311        defsize = PyObject_CallMethod(v->defenc, "__sizeof__", NULL);
8312        if (defsize == NULL) {
8313            Py_DECREF(res);
8314            return NULL;
8315        }
8316        res = PyNumber_Add(res, defsize);
8317        Py_DECREF(defsize);
8318    }
8319    return res;
8320}
8321
8322PyDoc_STRVAR(sizeof__doc__,
8323"S.__sizeof__() -> size of S in memory, in bytes");
8324
8325static PyObject *
8326unicode_getnewargs(PyUnicodeObject *v)
8327{
8328	return Py_BuildValue("(u#)", v->str, v->length);
8329}
8330
8331
8332static PyMethodDef unicode_methods[] = {
8333
8334    /* Order is according to common usage: often used methods should
8335       appear first, since lookup is done sequentially. */
8336
8337    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
8338    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8339    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
8340    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
8341    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8342    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8343    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8344    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8345    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8346    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8347    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
8348    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
8349    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8350    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8351    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
8352    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
8353    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8354    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8355    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
8356    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
8357    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
8358    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
8359    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
8360    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8361    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8362    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8363    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8364    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8365    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8366    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8367    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8368    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8369    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8370    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8371    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8372    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8373    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
8374    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
8375    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
8376    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
8377    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
8378    {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8379    {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
8380    {"maketrans", (PyCFunction) unicode_maketrans,
8381     METH_VARARGS | METH_STATIC, maketrans__doc__},
8382    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
8383#if 0
8384    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
8385#endif
8386
8387#if 0
8388    /* This one is just used for debugging the implementation. */
8389    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
8390#endif
8391
8392    {"__getnewargs__",	(PyCFunction)unicode_getnewargs, METH_NOARGS},
8393    {NULL, NULL}
8394};
8395
8396static PyObject *
8397unicode_mod(PyObject *v, PyObject *w)
8398{
8399       if (!PyUnicode_Check(v)) {
8400               Py_INCREF(Py_NotImplemented);
8401               return Py_NotImplemented;
8402       }
8403       return PyUnicode_Format(v, w);
8404}
8405
8406static PyNumberMethods unicode_as_number = {
8407	0,				/*nb_add*/
8408	0,				/*nb_subtract*/
8409	0,				/*nb_multiply*/
8410	unicode_mod,			/*nb_remainder*/
8411};
8412
8413static PySequenceMethods unicode_as_sequence = {
8414    (lenfunc) unicode_length, 		/* sq_length */
8415    PyUnicode_Concat,		 	/* sq_concat */
8416    (ssizeargfunc) unicode_repeat, 	/* sq_repeat */
8417    (ssizeargfunc) unicode_getitem, 	/* sq_item */
8418    0,				 	/* sq_slice */
8419    0, 					/* sq_ass_item */
8420    0, 					/* sq_ass_slice */
8421    PyUnicode_Contains, 		/* sq_contains */
8422};
8423
8424static PyObject*
8425unicode_subscript(PyUnicodeObject* self, PyObject* item)
8426{
8427    if (PyIndex_Check(item)) {
8428        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
8429        if (i == -1 && PyErr_Occurred())
8430            return NULL;
8431        if (i < 0)
8432            i += PyUnicode_GET_SIZE(self);
8433        return unicode_getitem(self, i);
8434    } else if (PySlice_Check(item)) {
8435        Py_ssize_t start, stop, step, slicelength, cur, i;
8436        Py_UNICODE* source_buf;
8437        Py_UNICODE* result_buf;
8438        PyObject* result;
8439
8440        if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
8441				 &start, &stop, &step, &slicelength) < 0) {
8442            return NULL;
8443        }
8444
8445        if (slicelength <= 0) {
8446            return PyUnicode_FromUnicode(NULL, 0);
8447        } else if (start == 0 && step == 1 && slicelength == self->length &&
8448                   PyUnicode_CheckExact(self)) {
8449            Py_INCREF(self);
8450            return (PyObject *)self;
8451        } else if (step == 1) {
8452            return PyUnicode_FromUnicode(self->str + start, slicelength);
8453        } else {
8454            source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
8455            result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8456                                                       sizeof(Py_UNICODE));
8457
8458	    if (result_buf == NULL)
8459		    return PyErr_NoMemory();
8460
8461            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8462                result_buf[i] = source_buf[cur];
8463            }
8464
8465            result = PyUnicode_FromUnicode(result_buf, slicelength);
8466            PyObject_FREE(result_buf);
8467            return result;
8468        }
8469    } else {
8470        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8471        return NULL;
8472    }
8473}
8474
8475static PyMappingMethods unicode_as_mapping = {
8476    (lenfunc)unicode_length,		/* mp_length */
8477    (binaryfunc)unicode_subscript,	/* mp_subscript */
8478    (objobjargproc)0,			/* mp_ass_subscript */
8479};
8480
8481
8482/* Helpers for PyUnicode_Format() */
8483
8484static PyObject *
8485getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
8486{
8487    Py_ssize_t argidx = *p_argidx;
8488    if (argidx < arglen) {
8489	(*p_argidx)++;
8490	if (arglen < 0)
8491	    return args;
8492	else
8493	    return PyTuple_GetItem(args, argidx);
8494    }
8495    PyErr_SetString(PyExc_TypeError,
8496		    "not enough arguments for format string");
8497    return NULL;
8498}
8499
8500static Py_ssize_t
8501strtounicode(Py_UNICODE *buffer, const char *charbuffer)
8502{
8503    register Py_ssize_t i;
8504    Py_ssize_t len = strlen(charbuffer);
8505    for (i = len - 1; i >= 0; i--)
8506	buffer[i] = (Py_UNICODE) charbuffer[i];
8507
8508    return len;
8509}
8510
8511static int
8512doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8513{
8514    Py_ssize_t result;
8515
8516    PyOS_ascii_formatd((char *)buffer, len, format, x);
8517    result = strtounicode(buffer, (char *)buffer);
8518    return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8519}
8520
#if 0
/* Disabled: counterpart of doubletounicode() for C longs.  Formats `x`
   with PyOS_snprintf() as bytes into the start of `buffer`, widens the
   text in place and returns the number of characters produced. */
static int
longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
{
    char *ascii = (char *)buffer;
    Py_ssize_t nchars;

    PyOS_snprintf(ascii, len, format, x);
    nchars = strtounicode(buffer, ascii);
    return Py_SAFE_DOWNCAST(nchars, Py_ssize_t, int);
}
#endif
8532
8533/* XXX To save some code duplication, formatfloat/long/int could have been
8534   shared with stringobject.c, converting from 8-bit to Unicode after the
8535   formatting is done. */
8536
8537static int
8538formatfloat(Py_UNICODE *buf,
8539	    size_t buflen,
8540	    int flags,
8541	    int prec,
8542	    int type,
8543	    PyObject *v)
8544{
8545    /* fmt = '%#.' + `prec` + `type`
8546       worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
8547    char fmt[20];
8548    double x;
8549
8550    x = PyFloat_AsDouble(v);
8551    if (x == -1.0 && PyErr_Occurred())
8552	return -1;
8553    if (prec < 0)
8554	prec = 6;
8555    if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
8556	type = 'g';
8557    /* Worst case length calc to ensure no buffer overrun:
8558
8559       'g' formats:
8560	 fmt = %#.<prec>g
8561	 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8562	    for any double rep.)
8563	 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8564
8565       'f' formats:
8566	 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8567	 len = 1 + 50 + 1 + prec = 52 + prec
8568
8569       If prec=0 the effective precision is 1 (the leading digit is
8570       always given), therefore increase the length by one.
8571
8572    */
8573    if (((type == 'g' || type == 'G') &&
8574          buflen <= (size_t)10 + (size_t)prec) ||
8575	(type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
8576	PyErr_SetString(PyExc_OverflowError,
8577			"formatted float is too long (precision too large?)");
8578	return -1;
8579    }
8580    PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
8581		  (flags&F_ALT) ? "#" : "",
8582		  prec, type);
8583    return doubletounicode(buf, buflen, fmt, x);
8584}
8585
8586static PyObject*
8587formatlong(PyObject *val, int flags, int prec, int type)
8588{
8589	char *buf;
8590	int len;
8591	PyObject *str; /* temporary string object. */
8592	PyObject *result;
8593
8594	str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
8595	if (!str)
8596		return NULL;
8597	result = PyUnicode_FromStringAndSize(buf, len);
8598	Py_DECREF(str);
8599	return result;
8600}
8601
#if 0
/* Disabled: formats an object that fits into a C long for the
   %d/%u/%o/%x/%X conversions directly into `buf`.  Returns the number of
   characters written, or -1 with an exception set. */
static int
formatint(Py_UNICODE *buf,
          size_t buflen,
          int flags,
          int prec,
          int type,
          PyObject *v)
{
    /* fmt = '%#.' + `prec` + 'l' + `type`
     * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
     *                     + 1 + 1
     *                   = 24
     */
    char fmt[64]; /* plenty big enough! */
    char *sign = "";
    int is_radix;
    long x;

    x = PyLong_AsLong(v);
    if (x == -1 && PyErr_Occurred())
        return -1;

    /* A negative value under %u is printed like %d. */
    if (x < 0 && type == 'u')
        type = 'd';

    /* For the non-decimal conversions the sign is emitted by hand and the
       magnitude is formatted. */
    is_radix = (type == 'x' || type == 'X' || type == 'o');
    if (x < 0 && is_radix)
        sign = "-";

    if (prec < 0)
        prec = 1;

    /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
     * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
     */
    if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
        PyErr_SetString(PyExc_OverflowError,
                "formatted integer is too long (precision too large?)");
        return -1;
    }

    if ((flags & F_ALT) && is_radix) {
        /* When converting under %#o, %#x or %#X, there are a number
         * of issues that cause pain:
         * - for %#o, we want a different base marker than C
         * - when 0 is being converted, the C standard leaves off
         *   the '0x' or '0X', which is inconsistent with other
         *   %#x/%#X conversions and inconsistent with Python's
         *   hex() function
         * - there are platforms that violate the standard and
         *   convert 0 with the '0x' or '0X'
         *   (Metrowerks, Compaq Tru64)
         * - there are platforms that give '0x' when converting
         *   under %#X, but convert 0 in accordance with the
         *   standard (OS/2 EMX)
         *
         * We can achieve the desired consistency by inserting our
         * own '0x' or '0X' prefix, and substituting %x/%X in place
         * of %#x/%#X.
         *
         * Note that this is the same approach as used in
         * formatint() in stringobject.c
         */
        PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
                      sign, type, prec, type);
    }
    else {
        PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
                      sign, (flags&F_ALT) ? "#" : "",
                      prec, type);
    }

    return longtounicode(buf, buflen, fmt, sign[0] ? -x : x);
}
#endif
8679
8680static int
8681formatchar(Py_UNICODE *buf,
8682           size_t buflen,
8683           PyObject *v)
8684{
8685    /* presume that the buffer is at least 2 characters long */
8686    if (PyUnicode_Check(v)) {
8687	if (PyUnicode_GET_SIZE(v) != 1)
8688	    goto onError;
8689	buf[0] = PyUnicode_AS_UNICODE(v)[0];
8690    }
8691    else {
8692	/* Integer input truncated to a character */
8693        long x;
8694	x = PyLong_AsLong(v);
8695	if (x == -1 && PyErr_Occurred())
8696	    goto onError;
8697#ifdef Py_UNICODE_WIDE
8698	if (x < 0 || x > 0x10ffff) {
8699	    PyErr_SetString(PyExc_OverflowError,
8700			    "%c arg not in range(0x110000) "
8701			    "(wide Python build)");
8702	    return -1;
8703	}
8704#else
8705	if (x < 0 || x > 0xffff) {
8706	    PyErr_SetString(PyExc_OverflowError,
8707			    "%c arg not in range(0x10000) "
8708			    "(narrow Python build)");
8709	    return -1;
8710	}
8711#endif
8712	buf[0] = (Py_UNICODE) x;
8713    }
8714    buf[1] = '\0';
8715    return 1;
8716
8717 onError:
8718    PyErr_SetString(PyExc_TypeError,
8719		    "%c requires int or char");
8720    return -1;
8721}
8722
8723/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8724
8725   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
8726   chars are formatted. XXX This is a magic number. Each formatting
8727   routine does bounds checking to ensure no overflow, but a better
8728   solution may be to malloc a buffer of appropriate size for each
8729   format. For now, the current solution is sufficient.
8730*/
8731#define FORMATBUFLEN (size_t)120
8732
8733PyObject *PyUnicode_Format(PyObject *format,
8734			   PyObject *args)
8735{
8736    Py_UNICODE *fmt, *res;
8737    Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
8738    int args_owned = 0;
8739    PyUnicodeObject *result = NULL;
8740    PyObject *dict = NULL;
8741    PyObject *uformat;
8742
8743    if (format == NULL || args == NULL) {
8744	PyErr_BadInternalCall();
8745	return NULL;
8746    }
8747    uformat = PyUnicode_FromObject(format);
8748    if (uformat == NULL)
8749	return NULL;
8750    fmt = PyUnicode_AS_UNICODE(uformat);
8751    fmtcnt = PyUnicode_GET_SIZE(uformat);
8752
8753    reslen = rescnt = fmtcnt + 100;
8754    result = _PyUnicode_New(reslen);
8755    if (result == NULL)
8756	goto onError;
8757    res = PyUnicode_AS_UNICODE(result);
8758
8759    if (PyTuple_Check(args)) {
8760	arglen = PyTuple_Size(args);
8761	argidx = 0;
8762    }
8763    else {
8764	arglen = -1;
8765	argidx = -2;
8766    }
8767    if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
8768        !PyUnicode_Check(args))
8769	dict = args;
8770
8771    while (--fmtcnt >= 0) {
8772	if (*fmt != '%') {
8773	    if (--rescnt < 0) {
8774		rescnt = fmtcnt + 100;
8775		reslen += rescnt;
8776		if (_PyUnicode_Resize(&result, reslen) < 0)
8777		    goto onError;
8778		res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8779		--rescnt;
8780	    }
8781	    *res++ = *fmt++;
8782	}
8783	else {
8784	    /* Got a format specifier */
8785	    int flags = 0;
8786	    Py_ssize_t width = -1;
8787	    int prec = -1;
8788	    Py_UNICODE c = '\0';
8789	    Py_UNICODE fill;
8790	    int isnumok;
8791	    PyObject *v = NULL;
8792	    PyObject *temp = NULL;
8793	    Py_UNICODE *pbuf;
8794	    Py_UNICODE sign;
8795	    Py_ssize_t len;
8796	    Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
8797
8798	    fmt++;
8799	    if (*fmt == '(') {
8800		Py_UNICODE *keystart;
8801		Py_ssize_t keylen;
8802		PyObject *key;
8803		int pcount = 1;
8804
8805		if (dict == NULL) {
8806		    PyErr_SetString(PyExc_TypeError,
8807				    "format requires a mapping");
8808		    goto onError;
8809		}
8810		++fmt;
8811		--fmtcnt;
8812		keystart = fmt;
8813		/* Skip over balanced parentheses */
8814		while (pcount > 0 && --fmtcnt >= 0) {
8815		    if (*fmt == ')')
8816			--pcount;
8817		    else if (*fmt == '(')
8818			++pcount;
8819		    fmt++;
8820		}
8821		keylen = fmt - keystart - 1;
8822		if (fmtcnt < 0 || pcount > 0) {
8823		    PyErr_SetString(PyExc_ValueError,
8824				    "incomplete format key");
8825		    goto onError;
8826		}
8827#if 0
8828		/* keys are converted to strings using UTF-8 and
8829		   then looked up since Python uses strings to hold
8830		   variables names etc. in its namespaces and we
8831		   wouldn't want to break common idioms. */
8832		key = PyUnicode_EncodeUTF8(keystart,
8833					   keylen,
8834					   NULL);
8835#else
8836		key = PyUnicode_FromUnicode(keystart, keylen);
8837#endif
8838		if (key == NULL)
8839		    goto onError;
8840		if (args_owned) {
8841		    Py_DECREF(args);
8842		    args_owned = 0;
8843		}
8844		args = PyObject_GetItem(dict, key);
8845		Py_DECREF(key);
8846		if (args == NULL) {
8847		    goto onError;
8848		}
8849		args_owned = 1;
8850		arglen = -1;
8851		argidx = -2;
8852	    }
8853	    while (--fmtcnt >= 0) {
8854		switch (c = *fmt++) {
8855		case '-': flags |= F_LJUST; continue;
8856		case '+': flags |= F_SIGN; continue;
8857		case ' ': flags |= F_BLANK; continue;
8858		case '#': flags |= F_ALT; continue;
8859		case '0': flags |= F_ZERO; continue;
8860		}
8861		break;
8862	    }
8863	    if (c == '*') {
8864		v = getnextarg(args, arglen, &argidx);
8865		if (v == NULL)
8866		    goto onError;
8867		if (!PyLong_Check(v)) {
8868		    PyErr_SetString(PyExc_TypeError,
8869				    "* wants int");
8870		    goto onError;
8871		}
8872		width = PyLong_AsLong(v);
8873		if (width == -1 && PyErr_Occurred())
8874			goto onError;
8875		if (width < 0) {
8876		    flags |= F_LJUST;
8877		    width = -width;
8878		}
8879		if (--fmtcnt >= 0)
8880		    c = *fmt++;
8881	    }
8882	    else if (c >= '0' && c <= '9') {
8883		width = c - '0';
8884		while (--fmtcnt >= 0) {
8885		    c = *fmt++;
8886		    if (c < '0' || c > '9')
8887			break;
8888		    if ((width*10) / 10 != width) {
8889			PyErr_SetString(PyExc_ValueError,
8890					"width too big");
8891			goto onError;
8892		    }
8893		    width = width*10 + (c - '0');
8894		}
8895	    }
8896	    if (c == '.') {
8897		prec = 0;
8898		if (--fmtcnt >= 0)
8899		    c = *fmt++;
8900		if (c == '*') {
8901		    v = getnextarg(args, arglen, &argidx);
8902		    if (v == NULL)
8903			goto onError;
8904		    if (!PyLong_Check(v)) {
8905			PyErr_SetString(PyExc_TypeError,
8906					"* wants int");
8907			goto onError;
8908		    }
8909		    prec = PyLong_AsLong(v);
8910		    if (prec == -1 && PyErr_Occurred())
8911			goto onError;
8912		    if (prec < 0)
8913			prec = 0;
8914		    if (--fmtcnt >= 0)
8915			c = *fmt++;
8916		}
8917		else if (c >= '0' && c <= '9') {
8918		    prec = c - '0';
8919		    while (--fmtcnt >= 0) {
8920			c = Py_CHARMASK(*fmt++);
8921			if (c < '0' || c > '9')
8922			    break;
8923			if ((prec*10) / 10 != prec) {
8924			    PyErr_SetString(PyExc_ValueError,
8925					    "prec too big");
8926			    goto onError;
8927			}
8928			prec = prec*10 + (c - '0');
8929		    }
8930		}
8931	    } /* prec */
8932	    if (fmtcnt >= 0) {
8933		if (c == 'h' || c == 'l' || c == 'L') {
8934		    if (--fmtcnt >= 0)
8935			c = *fmt++;
8936		}
8937	    }
8938	    if (fmtcnt < 0) {
8939		PyErr_SetString(PyExc_ValueError,
8940				"incomplete format");
8941		goto onError;
8942	    }
8943	    if (c != '%') {
8944		v = getnextarg(args, arglen, &argidx);
8945		if (v == NULL)
8946		    goto onError;
8947	    }
8948	    sign = 0;
8949	    fill = ' ';
8950	    switch (c) {
8951
8952	    case '%':
8953		pbuf = formatbuf;
8954		/* presume that buffer length is at least 1 */
8955		pbuf[0] = '%';
8956		len = 1;
8957		break;
8958
8959	    case 's':
8960	    case 'r':
8961		if (PyUnicode_Check(v) && c == 's') {
8962		    temp = v;
8963		    Py_INCREF(temp);
8964		}
8965		else {
8966		    if (c == 's')
8967			temp = PyObject_Str(v);
8968		    else
8969			temp = PyObject_Repr(v);
8970		    if (temp == NULL)
8971			goto onError;
8972                    if (PyUnicode_Check(temp))
8973                        /* nothing to do */;
8974		    else {
8975			Py_DECREF(temp);
8976			PyErr_SetString(PyExc_TypeError,
8977					"%s argument has non-string str()");
8978			goto onError;
8979		    }
8980		}
8981		pbuf = PyUnicode_AS_UNICODE(temp);
8982		len = PyUnicode_GET_SIZE(temp);
8983		if (prec >= 0 && len > prec)
8984		    len = prec;
8985		break;
8986
8987	    case 'i':
8988	    case 'd':
8989	    case 'u':
8990	    case 'o':
8991	    case 'x':
8992	    case 'X':
8993		if (c == 'i')
8994		    c = 'd';
8995		isnumok = 0;
8996		if (PyNumber_Check(v)) {
8997			PyObject *iobj=NULL;
8998
8999			if (PyLong_Check(v)) {
9000				iobj = v;
9001				Py_INCREF(iobj);
9002			}
9003			else {
9004				iobj = PyNumber_Long(v);
9005			}
9006			if (iobj!=NULL) {
9007				if (PyLong_Check(iobj)) {
9008					isnumok = 1;
9009					temp = formatlong(iobj, flags, prec, c);
9010					Py_DECREF(iobj);
9011					if (!temp)
9012					    goto onError;
9013					pbuf = PyUnicode_AS_UNICODE(temp);
9014					len = PyUnicode_GET_SIZE(temp);
9015					sign = 1;
9016				}
9017				else {
9018					Py_DECREF(iobj);
9019				}
9020			}
9021		}
9022		if (!isnumok) {
9023			PyErr_Format(PyExc_TypeError,
9024			    "%%%c format: a number is required, "
9025                                     "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9026			goto onError;
9027		}
9028		if (flags & F_ZERO)
9029		    fill = '0';
9030		break;
9031
9032	    case 'e':
9033	    case 'E':
9034	    case 'f':
9035	    case 'F':
9036	    case 'g':
9037	    case 'G':
9038		if (c == 'F')
9039			c = 'f';
9040		pbuf = formatbuf;
9041		len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
9042			flags, prec, c, v);
9043		if (len < 0)
9044		    goto onError;
9045		sign = 1;
9046		if (flags & F_ZERO)
9047		    fill = '0';
9048		break;
9049
9050	    case 'c':
9051		pbuf = formatbuf;
9052		len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9053		if (len < 0)
9054		    goto onError;
9055		break;
9056
9057	    default:
9058		PyErr_Format(PyExc_ValueError,
9059			     "unsupported format character '%c' (0x%x) "
9060			     "at index %zd",
9061			     (31<=c && c<=126) ? (char)c : '?',
9062                             (int)c,
9063			     (Py_ssize_t)(fmt - 1 -
9064					  PyUnicode_AS_UNICODE(uformat)));
9065		goto onError;
9066	    }
9067	    if (sign) {
9068		if (*pbuf == '-' || *pbuf == '+') {
9069		    sign = *pbuf++;
9070		    len--;
9071		}
9072		else if (flags & F_SIGN)
9073		    sign = '+';
9074		else if (flags & F_BLANK)
9075		    sign = ' ';
9076		else
9077		    sign = 0;
9078	    }
9079	    if (width < len)
9080		width = len;
9081	    if (rescnt - (sign != 0) < width) {
9082		reslen -= rescnt;
9083		rescnt = width + fmtcnt + 100;
9084		reslen += rescnt;
9085		if (reslen < 0) {
9086		    Py_XDECREF(temp);
9087		    PyErr_NoMemory();
9088		    goto onError;
9089		}
9090		if (_PyUnicode_Resize(&result, reslen) < 0) {
9091		    Py_XDECREF(temp);
9092		    goto onError;
9093		}
9094		res = PyUnicode_AS_UNICODE(result)
9095		    + reslen - rescnt;
9096	    }
9097	    if (sign) {
9098		if (fill != ' ')
9099		    *res++ = sign;
9100		rescnt--;
9101		if (width > len)
9102		    width--;
9103	    }
9104	    if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9105		assert(pbuf[0] == '0');
9106		assert(pbuf[1] == c);
9107		if (fill != ' ') {
9108		    *res++ = *pbuf++;
9109		    *res++ = *pbuf++;
9110		}
9111		rescnt -= 2;
9112		width -= 2;
9113		if (width < 0)
9114		    width = 0;
9115		len -= 2;
9116	    }
9117	    if (width > len && !(flags & F_LJUST)) {
9118		do {
9119		    --rescnt;
9120		    *res++ = fill;
9121		} while (--width > len);
9122	    }
9123	    if (fill == ' ') {
9124		if (sign)
9125		    *res++ = sign;
9126		if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9127		    assert(pbuf[0] == '0');
9128		    assert(pbuf[1] == c);
9129		    *res++ = *pbuf++;
9130		    *res++ = *pbuf++;
9131		}
9132	    }
9133	    Py_UNICODE_COPY(res, pbuf, len);
9134	    res += len;
9135	    rescnt -= len;
9136	    while (--width >= len) {
9137		--rescnt;
9138		*res++ = ' ';
9139	    }
9140	    if (dict && (argidx < arglen) && c != '%') {
9141		PyErr_SetString(PyExc_TypeError,
9142				"not all arguments converted during string formatting");
9143                Py_XDECREF(temp);
9144		goto onError;
9145	    }
9146	    Py_XDECREF(temp);
9147	} /* '%' */
9148    } /* until end */
9149    if (argidx < arglen && !dict) {
9150	PyErr_SetString(PyExc_TypeError,
9151			"not all arguments converted during string formatting");
9152	goto onError;
9153    }
9154
9155    if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
9156	goto onError;
9157    if (args_owned) {
9158	Py_DECREF(args);
9159    }
9160    Py_DECREF(uformat);
9161    return (PyObject *)result;
9162
9163 onError:
9164    Py_XDECREF(result);
9165    Py_DECREF(uformat);
9166    if (args_owned) {
9167	Py_DECREF(args);
9168    }
9169    return NULL;
9170}
9171
9172static PyObject *
9173unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9174
9175static PyObject *
9176unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9177{
9178        PyObject *x = NULL;
9179	static char *kwlist[] = {"object", "encoding", "errors", 0};
9180	char *encoding = NULL;
9181	char *errors = NULL;
9182
9183	if (type != &PyUnicode_Type)
9184		return unicode_subtype_new(type, args, kwds);
9185	if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
9186					  kwlist, &x, &encoding, &errors))
9187	    return NULL;
9188	if (x == NULL)
9189		return (PyObject *)_PyUnicode_New(0);
9190	if (encoding == NULL && errors == NULL)
9191	    return PyObject_Str(x);
9192	else
9193	return PyUnicode_FromEncodedObject(x, encoding, errors);
9194}
9195
9196static PyObject *
9197unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9198{
9199	PyUnicodeObject *tmp, *pnew;
9200	Py_ssize_t n;
9201
9202	assert(PyType_IsSubtype(type, &PyUnicode_Type));
9203	tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9204	if (tmp == NULL)
9205		return NULL;
9206	assert(PyUnicode_Check(tmp));
9207	pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9208	if (pnew == NULL) {
9209		Py_DECREF(tmp);
9210		return NULL;
9211	}
9212	pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9213	if (pnew->str == NULL) {
9214		_Py_ForgetReference((PyObject *)pnew);
9215		PyObject_Del(pnew);
9216		Py_DECREF(tmp);
9217		return PyErr_NoMemory();
9218	}
9219	Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9220	pnew->length = n;
9221	pnew->hash = tmp->hash;
9222	Py_DECREF(tmp);
9223	return (PyObject *)pnew;
9224}
9225
9226PyDoc_STRVAR(unicode_doc,
9227"str(string[, encoding[, errors]]) -> str\n\
9228\n\
9229Create a new string object from the given encoded string.\n\
9230encoding defaults to the current default string encoding.\n\
9231errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
9232
9233static PyObject *unicode_iter(PyObject *seq);
9234
9235PyTypeObject PyUnicode_Type = {
9236    PyVarObject_HEAD_INIT(&PyType_Type, 0)
9237    "str", 				/* tp_name */
9238    sizeof(PyUnicodeObject), 		/* tp_size */
9239    0, 					/* tp_itemsize */
9240    /* Slots */
9241    (destructor)unicode_dealloc, 	/* tp_dealloc */
9242    0, 					/* tp_print */
9243    0,				 	/* tp_getattr */
9244    0, 					/* tp_setattr */
9245    0, 					/* tp_compare */
9246    unicode_repr, 			/* tp_repr */
9247    &unicode_as_number, 		/* tp_as_number */
9248    &unicode_as_sequence, 		/* tp_as_sequence */
9249    &unicode_as_mapping, 		/* tp_as_mapping */
9250    (hashfunc) unicode_hash, 		/* tp_hash*/
9251    0, 					/* tp_call*/
9252    (reprfunc) unicode_str,	 	/* tp_str */
9253    PyObject_GenericGetAttr, 		/* tp_getattro */
9254    0,			 		/* tp_setattro */
9255    0, 					/* tp_as_buffer */
9256    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
9257        Py_TPFLAGS_UNICODE_SUBCLASS,	/* tp_flags */
9258    unicode_doc,			/* tp_doc */
9259    0,					/* tp_traverse */
9260    0,					/* tp_clear */
9261    PyUnicode_RichCompare,		/* tp_richcompare */
9262    0,					/* tp_weaklistoffset */
9263    unicode_iter,			/* tp_iter */
9264    0,					/* tp_iternext */
9265    unicode_methods,			/* tp_methods */
9266    0,					/* tp_members */
9267    0,					/* tp_getset */
9268    &PyBaseObject_Type,			/* tp_base */
9269    0,					/* tp_dict */
9270    0,					/* tp_descr_get */
9271    0,					/* tp_descr_set */
9272    0,					/* tp_dictoffset */
9273    0,					/* tp_init */
9274    0,					/* tp_alloc */
9275    unicode_new,			/* tp_new */
9276    PyObject_Del,      		/* tp_free */
9277};
9278
9279/* Initialize the Unicode implementation */
9280
9281void _PyUnicode_Init(void)
9282{
9283    int i;
9284
9285    /* XXX - move this array to unicodectype.c ? */
9286    Py_UNICODE linebreak[] = {
9287        0x000A, /* LINE FEED */
9288        0x000D, /* CARRIAGE RETURN */
9289        0x001C, /* FILE SEPARATOR */
9290        0x001D, /* GROUP SEPARATOR */
9291        0x001E, /* RECORD SEPARATOR */
9292        0x0085, /* NEXT LINE */
9293        0x2028, /* LINE SEPARATOR */
9294        0x2029, /* PARAGRAPH SEPARATOR */
9295    };
9296
9297    /* Init the implementation */
9298    free_list = NULL;
9299    numfree = 0;
9300    unicode_empty = _PyUnicode_New(0);
9301    if (!unicode_empty)
9302	return;
9303
9304    for (i = 0; i < 256; i++)
9305	unicode_latin1[i] = NULL;
9306    if (PyType_Ready(&PyUnicode_Type) < 0)
9307	Py_FatalError("Can't initialize 'unicode'");
9308
9309    /* initialize the linebreak bloom filter */
9310    bloom_linebreak = make_bloom_mask(
9311        linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9312        );
9313
9314    PyType_Ready(&EncodingMapType);
9315}
9316
9317/* Finalize the Unicode implementation */
9318
9319int
9320PyUnicode_ClearFreeList(void)
9321{
9322    int freelist_size = numfree;
9323    PyUnicodeObject *u;
9324
9325    for (u = free_list; u != NULL;) {
9326	PyUnicodeObject *v = u;
9327	u = *(PyUnicodeObject **)u;
9328	if (v->str)
9329	    PyObject_DEL(v->str);
9330	Py_XDECREF(v->defenc);
9331	PyObject_Del(v);
9332	numfree--;
9333    }
9334    free_list = NULL;
9335    assert(numfree == 0);
9336    return freelist_size;
9337}
9338
9339void
9340_PyUnicode_Fini(void)
9341{
9342    int i;
9343
9344    Py_XDECREF(unicode_empty);
9345    unicode_empty = NULL;
9346
9347    for (i = 0; i < 256; i++) {
9348	if (unicode_latin1[i]) {
9349	    Py_DECREF(unicode_latin1[i]);
9350	    unicode_latin1[i] = NULL;
9351	}
9352    }
9353    (void)PyUnicode_ClearFreeList();
9354}
9355
9356void
9357PyUnicode_InternInPlace(PyObject **p)
9358{
9359	register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
9360	PyObject *t;
9361	if (s == NULL || !PyUnicode_Check(s))
9362		Py_FatalError(
9363		    "PyUnicode_InternInPlace: unicode strings only please!");
9364	/* If it's a subclass, we don't really know what putting
9365	   it in the interned dict might do. */
9366	if (!PyUnicode_CheckExact(s))
9367		return;
9368	if (PyUnicode_CHECK_INTERNED(s))
9369		return;
9370	if (interned == NULL) {
9371		interned = PyDict_New();
9372		if (interned == NULL) {
9373			PyErr_Clear(); /* Don't leave an exception */
9374			return;
9375		}
9376	}
9377	/* It might be that the GetItem call fails even
9378	   though the key is present in the dictionary,
9379	   namely when this happens during a stack overflow. */
9380	Py_ALLOW_RECURSION
9381	t = PyDict_GetItem(interned, (PyObject *)s);
9382	Py_END_ALLOW_RECURSION
9383
9384	if (t) {
9385		Py_INCREF(t);
9386		Py_DECREF(*p);
9387		*p = t;
9388		return;
9389	}
9390
9391	PyThreadState_GET()->recursion_critical = 1;
9392	if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
9393		PyErr_Clear();
9394		PyThreadState_GET()->recursion_critical = 0;
9395		return;
9396	}
9397	PyThreadState_GET()->recursion_critical = 0;
9398	/* The two references in interned are not counted by refcnt.
9399	   The deallocator will take care of this */
9400	Py_REFCNT(s) -= 2;
9401	PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
9402}
9403
9404void
9405PyUnicode_InternImmortal(PyObject **p)
9406{
9407	PyUnicode_InternInPlace(p);
9408	if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9409		PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9410		Py_INCREF(*p);
9411	}
9412}
9413
9414PyObject *
9415PyUnicode_InternFromString(const char *cp)
9416{
9417	PyObject *s = PyUnicode_FromString(cp);
9418	if (s == NULL)
9419		return NULL;
9420	PyUnicode_InternInPlace(&s);
9421	return s;
9422}
9423
9424void _Py_ReleaseInternedUnicodeStrings(void)
9425{
9426	PyObject *keys;
9427	PyUnicodeObject *s;
9428	Py_ssize_t i, n;
9429	Py_ssize_t immortal_size = 0, mortal_size = 0;
9430
9431	if (interned == NULL || !PyDict_Check(interned))
9432		return;
9433	keys = PyDict_Keys(interned);
9434	if (keys == NULL || !PyList_Check(keys)) {
9435		PyErr_Clear();
9436		return;
9437	}
9438
9439	/* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
9440	   detector, interned unicode strings are not forcibly deallocated;
9441	   rather, we give them their stolen references back, and then clear
9442	   and DECREF the interned dict. */
9443
9444	n = PyList_GET_SIZE(keys);
9445	fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
9446		n);
9447	for (i = 0; i < n; i++) {
9448		s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
9449		switch (s->state) {
9450		case SSTATE_NOT_INTERNED:
9451			/* XXX Shouldn't happen */
9452			break;
9453		case SSTATE_INTERNED_IMMORTAL:
9454			Py_REFCNT(s) += 1;
9455			immortal_size += s->length;
9456			break;
9457		case SSTATE_INTERNED_MORTAL:
9458			Py_REFCNT(s) += 2;
9459			mortal_size += s->length;
9460			break;
9461		default:
9462			Py_FatalError("Inconsistent interned string state.");
9463		}
9464		s->state = SSTATE_NOT_INTERNED;
9465	}
9466	fprintf(stderr, "total size of all interned strings: "
9467			"%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
9468			"mortal/immortal\n", mortal_size, immortal_size);
9469	Py_DECREF(keys);
9470	PyDict_Clear(interned);
9471	Py_DECREF(interned);
9472	interned = NULL;
9473}
9474
9475
9476/********************* Unicode Iterator **************************/
9477
9478typedef struct {
9479	PyObject_HEAD
9480	Py_ssize_t it_index;
9481	PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
9482} unicodeiterobject;
9483
9484static void
9485unicodeiter_dealloc(unicodeiterobject *it)
9486{
9487	_PyObject_GC_UNTRACK(it);
9488	Py_XDECREF(it->it_seq);
9489	PyObject_GC_Del(it);
9490}
9491
9492static int
9493unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
9494{
9495	Py_VISIT(it->it_seq);
9496	return 0;
9497}
9498
9499static PyObject *
9500unicodeiter_next(unicodeiterobject *it)
9501{
9502	PyUnicodeObject *seq;
9503	PyObject *item;
9504
9505	assert(it != NULL);
9506	seq = it->it_seq;
9507	if (seq == NULL)
9508		return NULL;
9509	assert(PyUnicode_Check(seq));
9510
9511	if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9512		item = PyUnicode_FromUnicode(
9513                    PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
9514		if (item != NULL)
9515			++it->it_index;
9516		return item;
9517	}
9518
9519	Py_DECREF(seq);
9520	it->it_seq = NULL;
9521	return NULL;
9522}
9523
9524static PyObject *
9525unicodeiter_len(unicodeiterobject *it)
9526{
9527	Py_ssize_t len = 0;
9528	if (it->it_seq)
9529		len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9530	return PyLong_FromSsize_t(len);
9531}
9532
9533PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
9534
9535static PyMethodDef unicodeiter_methods[] = {
9536	{"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
9537         length_hint_doc},
9538 	{NULL,		NULL}		/* sentinel */
9539};
9540
9541PyTypeObject PyUnicodeIter_Type = {
9542	PyVarObject_HEAD_INIT(&PyType_Type, 0)
9543	"str_iterator",			/* tp_name */
9544	sizeof(unicodeiterobject),		/* tp_basicsize */
9545	0,					/* tp_itemsize */
9546	/* methods */
9547	(destructor)unicodeiter_dealloc,	/* tp_dealloc */
9548	0,					/* tp_print */
9549	0,					/* tp_getattr */
9550	0,					/* tp_setattr */
9551	0,					/* tp_compare */
9552	0,					/* tp_repr */
9553	0,					/* tp_as_number */
9554	0,					/* tp_as_sequence */
9555	0,					/* tp_as_mapping */
9556	0,					/* tp_hash */
9557	0,					/* tp_call */
9558	0,					/* tp_str */
9559	PyObject_GenericGetAttr,		/* tp_getattro */
9560	0,					/* tp_setattro */
9561	0,					/* tp_as_buffer */
9562	Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
9563	0,					/* tp_doc */
9564	(traverseproc)unicodeiter_traverse,	/* tp_traverse */
9565	0,					/* tp_clear */
9566	0,					/* tp_richcompare */
9567	0,					/* tp_weaklistoffset */
9568	PyObject_SelfIter,			/* tp_iter */
9569	(iternextfunc)unicodeiter_next,		/* tp_iternext */
9570	unicodeiter_methods,			/* tp_methods */
9571	0,
9572};
9573
9574static PyObject *
9575unicode_iter(PyObject *seq)
9576{
9577	unicodeiterobject *it;
9578
9579	if (!PyUnicode_Check(seq)) {
9580		PyErr_BadInternalCall();
9581		return NULL;
9582	}
9583	it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9584	if (it == NULL)
9585		return NULL;
9586	it->it_index = 0;
9587	Py_INCREF(seq);
9588	it->it_seq = (PyUnicodeObject *)seq;
9589	_PyObject_GC_TRACK(it);
9590	return (PyObject *)it;
9591}
9592
9593size_t
9594Py_UNICODE_strlen(const Py_UNICODE *u)
9595{
9596    int res = 0;
9597    while(*u++)
9598        res++;
9599    return res;
9600}
9601
9602Py_UNICODE*
9603Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9604{
9605    Py_UNICODE *u = s1;
9606    while ((*u++ = *s2++));
9607    return s1;
9608}
9609
9610Py_UNICODE*
9611Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9612{
9613    Py_UNICODE *u = s1;
9614    while ((*u++ = *s2++))
9615        if (n-- == 0)
9616            break;
9617    return s1;
9618}
9619
9620int
9621Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9622{
9623    while (*s1 && *s2 && *s1 == *s2)
9624        s1++, s2++;
9625    if (*s1 && *s2)
9626        return (*s1 < *s2) ? -1 : +1;
9627    if (*s1)
9628        return 1;
9629    if (*s2)
9630        return -1;
9631    return 0;
9632}
9633
9634Py_UNICODE*
9635Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9636{
9637    const Py_UNICODE *p;
9638    for (p = s; *p; p++)
9639        if (*p == c)
9640            return (Py_UNICODE*)p;
9641    return NULL;
9642}
9643
9644
9645#ifdef __cplusplus
9646}
9647#endif
9648
9649
9650/*
9651Local variables:
9652c-basic-offset: 4
9653indent-tabs-mode: nil
9654End:
9655*/
9656