unicodeobject.c revision 84d79ddce2176ae54825da32e096d6332a8d5138
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5Unicode Integration Proposal (see file Misc/unicode.txt).
6
7Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
10Copyright (c) Corporation for National Research Initiatives.
11
12--------------------------------------------------------------------
13The original string type implementation is:
14
15    Copyright (c) 1999 by Secret Labs AB
16    Copyright (c) 1999 by Fredrik Lundh
17
18By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
41
42#define PY_SSIZE_T_CLEAN
43#include "Python.h"
44
45#include "unicodeobject.h"
46#include "ucnhash.h"
47
48#ifdef MS_WINDOWS
49#include <windows.h>
50#endif
51
52/* Limit for the Unicode object free list */
53
54#define MAX_UNICODE_FREELIST_SIZE       1024
55
56/* Limit for the Unicode object free list stay alive optimization.
57
58   The implementation will keep allocated Unicode memory intact for
59   all objects on the free list having a size less than this
60   limit. This reduces malloc() overhead for small Unicode objects.
61
62   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
63   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
64   malloc()-overhead) bytes of unused garbage.
65
66   Setting the limit to 0 effectively turns the feature off.
67
68   Note: This is an experimental feature ! If you get core dumps when
69   using Unicode objects, turn this feature off.
70
71*/
72
73#define KEEPALIVE_SIZE_LIMIT       9
74
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
83/* --- Globals ------------------------------------------------------------
84
85   The globals are initialized by the _PyUnicode_Init() API and should
86   not be used before calling that API.
87
88*/
89
90
91#ifdef __cplusplus
92extern "C" {
93#endif
94
95/* Free list for Unicode objects */
96static PyUnicodeObject *unicode_freelist;
97static int unicode_freelist_size;
98
99/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103   shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
106/* Default encoding to use and assume when NULL is passed as encoding
107   parameter; it is initialized by _PyUnicode_Init().
108
109   Always use the PyUnicode_SetDefaultEncoding() and
110   PyUnicode_GetDefaultEncoding() APIs to access this global.
111
112*/
113static char unicode_default_encoding[100];
114
115Py_UNICODE
116PyUnicode_GetMax(void)
117{
118#ifdef Py_UNICODE_WIDE
119	return 0x10FFFF;
120#else
121	/* This is actually an illegal character, so it should
122	   not be passed to unichr. */
123	return 0xFFFF;
124#endif
125}
126
127/* --- Bloom Filters ----------------------------------------------------- */
128
129/* stuff to implement simple "bloom filters" for Unicode characters.
130   to keep things simple, we use a single bitmask, using the least 5
131   bits from each unicode characters as the bit index. */
132
133/* the linebreak mask is set up by Unicode_Init below */
134
/* Integer type holding a bloom bitmask; bits 0..31 are used because the
   bit index is taken from the low 5 bits of a character. */
#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by the low 5 bits of `ch` is set in
   `mask`.  The shift is performed on an unsigned long so that selecting
   bit 31 never shifts into the sign bit of an int, and `mask` is
   parenthesized so that compound expressions can be passed safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Cheap pre-filter followed by the exact line break test. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
143
144Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
145{
146    /* calculate simple bloom-style bitmask for a given unicode string */
147
148    long mask;
149    Py_ssize_t i;
150
151    mask = 0;
152    for (i = 0; i < len; i++)
153        mask |= (1 << (ptr[i] & 0x1F));
154
155    return mask;
156}
157
158Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
159{
160    Py_ssize_t i;
161
162    for (i = 0; i < setlen; i++)
163        if (set[i] == chr)
164            return 1;
165
166    return 0;
167}
168
/* True when `chr` is in `set`: the bloom mask rejects most non-members
   cheaply, and unicode_member() confirms the remaining candidates.  The
   whole expansion is parenthesized so the macro can be negated or combined
   with other operators without surprises. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
171
172/* --- Unicode Object ----------------------------------------------------- */
173
174static
175int unicode_resize(register PyUnicodeObject *unicode,
176                      Py_ssize_t length)
177{
178    void *oldstr;
179
180    /* Shortcut if there's nothing much to do. */
181    if (unicode->length == length)
182	goto reset;
183
184    /* Resizing shared object (unicode_empty or single character
185       objects) in-place is not allowed. Use PyUnicode_Resize()
186       instead ! */
187
188    if (unicode == unicode_empty ||
189	(unicode->length == 1 &&
190	 unicode->str[0] < 256U &&
191	 unicode_latin1[unicode->str[0]] == unicode)) {
192        PyErr_SetString(PyExc_SystemError,
193                        "can't resize shared unicode objects");
194        return -1;
195    }
196
197    /* We allocate one more byte to make sure the string is Ux0000 terminated.
198       The overallocation is also used by fastsearch, which assumes that it's
199       safe to look at str[length] (without making any assumptions about what
200       it contains). */
201
202    oldstr = unicode->str;
203    PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
204    if (!unicode->str) {
205	unicode->str = (Py_UNICODE *)oldstr;
206        PyErr_NoMemory();
207        return -1;
208    }
209    unicode->str[length] = 0;
210    unicode->length = length;
211
212 reset:
213    /* Reset the object caches */
214    if (unicode->defenc) {
215        Py_DECREF(unicode->defenc);
216        unicode->defenc = NULL;
217    }
218    unicode->hash = -1;
219
220    return 0;
221}
222
/* We allocate one more Py_UNICODE element (not just one byte) to make sure
   the string is Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
       free list never reduces its size below 1.

*/
230
231static
232PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
233{
234    register PyUnicodeObject *unicode;
235
236    /* Optimization for empty strings */
237    if (length == 0 && unicode_empty != NULL) {
238        Py_INCREF(unicode_empty);
239        return unicode_empty;
240    }
241
242    /* Unicode freelist & memory allocation */
243    if (unicode_freelist) {
244        unicode = unicode_freelist;
245        unicode_freelist = *(PyUnicodeObject **)unicode;
246        unicode_freelist_size--;
247	if (unicode->str) {
248	    /* Keep-Alive optimization: we only upsize the buffer,
249	       never downsize it. */
250	    if ((unicode->length < length) &&
251                unicode_resize(unicode, length) < 0) {
252		PyMem_DEL(unicode->str);
253		goto onError;
254	    }
255	}
256        else {
257	    unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
258        }
259        PyObject_INIT(unicode, &PyUnicode_Type);
260    }
261    else {
262        unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
263        if (unicode == NULL)
264            return NULL;
265	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
266    }
267
268    if (!unicode->str) {
269	PyErr_NoMemory();
270	goto onError;
271    }
272    /* Initialize the first element to guard against cases where
273     * the caller fails before initializing str -- unicode_resize()
274     * reads str[0], and the Keep-Alive optimization can keep memory
275     * allocated for str alive across a call to unicode_dealloc(unicode).
276     * We don't want unicode_resize to read uninitialized memory in
277     * that case.
278     */
279    unicode->str[0] = 0;
280    unicode->str[length] = 0;
281    unicode->length = length;
282    unicode->hash = -1;
283    unicode->defenc = NULL;
284    return unicode;
285
286 onError:
287    _Py_ForgetReference((PyObject *)unicode);
288    PyObject_Del(unicode);
289    return NULL;
290}
291
292static
293void unicode_dealloc(register PyUnicodeObject *unicode)
294{
295    if (PyUnicode_CheckExact(unicode) &&
296	unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
297        /* Keep-Alive optimization */
298	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
299	    PyMem_DEL(unicode->str);
300	    unicode->str = NULL;
301	    unicode->length = 0;
302	}
303	if (unicode->defenc) {
304	    Py_DECREF(unicode->defenc);
305	    unicode->defenc = NULL;
306	}
307	/* Add to free list */
308        *(PyUnicodeObject **)unicode = unicode_freelist;
309        unicode_freelist = unicode;
310        unicode_freelist_size++;
311    }
312    else {
313	PyMem_DEL(unicode->str);
314	Py_XDECREF(unicode->defenc);
315	unicode->ob_type->tp_free((PyObject *)unicode);
316    }
317}
318
319int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
320{
321    register PyUnicodeObject *v;
322
323    /* Argument checks */
324    if (unicode == NULL) {
325	PyErr_BadInternalCall();
326	return -1;
327    }
328    v = (PyUnicodeObject *)*unicode;
329    if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
330	PyErr_BadInternalCall();
331	return -1;
332    }
333
334    /* Resizing unicode_empty and single character objects is not
335       possible since these are being shared. We simply return a fresh
336       copy with the same Unicode content. */
337    if (v->length != length &&
338	(v == unicode_empty || v->length == 1)) {
339	PyUnicodeObject *w = _PyUnicode_New(length);
340	if (w == NULL)
341	    return -1;
342	Py_UNICODE_COPY(w->str, v->str,
343			length < v->length ? length : v->length);
344	Py_DECREF(*unicode);
345	*unicode = (PyObject *)w;
346	return 0;
347    }
348
349    /* Note that we don't have to modify *unicode for unshared Unicode
350       objects, since we can modify them in-place. */
351    return unicode_resize(v, length);
352}
353
/* Internal API for use in unicodeobject.c only !
   Convenience wrapper around PyUnicode_Resize() that casts `unicodevar`
   (the address of an object pointer, e.g. a PyUnicodeObject **) to the
   PyObject ** the public function expects. */
#define _PyUnicode_Resize(unicodevar, length) \
        PyUnicode_Resize(((PyObject **)(unicodevar)), length)
357
358PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
359				Py_ssize_t size)
360{
361    PyUnicodeObject *unicode;
362
363    /* If the Unicode data is known at construction time, we can apply
364       some optimizations which share commonly used objects. */
365    if (u != NULL) {
366
367	/* Optimization for empty strings */
368	if (size == 0 && unicode_empty != NULL) {
369	    Py_INCREF(unicode_empty);
370	    return (PyObject *)unicode_empty;
371	}
372
373	/* Single character Unicode objects in the Latin-1 range are
374	   shared when using this constructor */
375	if (size == 1 && *u < 256) {
376	    unicode = unicode_latin1[*u];
377	    if (!unicode) {
378		unicode = _PyUnicode_New(1);
379		if (!unicode)
380		    return NULL;
381		unicode->str[0] = *u;
382		unicode_latin1[*u] = unicode;
383	    }
384	    Py_INCREF(unicode);
385	    return (PyObject *)unicode;
386	}
387    }
388
389    unicode = _PyUnicode_New(size);
390    if (!unicode)
391        return NULL;
392
393    /* Copy the Unicode data into the new object */
394    if (u != NULL)
395	Py_UNICODE_COPY(unicode->str, u, size);
396
397    return (PyObject *)unicode;
398}
399
400#ifdef HAVE_WCHAR_H
401
402PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
403				 Py_ssize_t size)
404{
405    PyUnicodeObject *unicode;
406
407    if (w == NULL) {
408	PyErr_BadInternalCall();
409	return NULL;
410    }
411
412    unicode = _PyUnicode_New(size);
413    if (!unicode)
414        return NULL;
415
416    /* Copy the wchar_t data into the new object */
417#ifdef HAVE_USABLE_WCHAR_T
418    memcpy(unicode->str, w, size * sizeof(wchar_t));
419#else
420    {
421	register Py_UNICODE *u;
422	register Py_ssize_t i;
423	u = PyUnicode_AS_UNICODE(unicode);
424	for (i = size; i > 0; i--)
425	    *u++ = *w++;
426    }
427#endif
428
429    return (PyObject *)unicode;
430}
431
432Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
433				wchar_t *w,
434				Py_ssize_t size)
435{
436    if (unicode == NULL) {
437	PyErr_BadInternalCall();
438	return -1;
439    }
440
441    /* If possible, try to copy the 0-termination as well */
442    if (size > PyUnicode_GET_SIZE(unicode))
443	size = PyUnicode_GET_SIZE(unicode) + 1;
444
445#ifdef HAVE_USABLE_WCHAR_T
446    memcpy(w, unicode->str, size * sizeof(wchar_t));
447#else
448    {
449	register Py_UNICODE *u;
450	register Py_ssize_t i;
451	u = PyUnicode_AS_UNICODE(unicode);
452	for (i = size; i > 0; i--)
453	    *w++ = *u++;
454    }
455#endif
456
457    if (size > PyUnicode_GET_SIZE(unicode))
458        return PyUnicode_GET_SIZE(unicode);
459    else
460    return size;
461}
462
463#endif
464
465PyObject *PyUnicode_FromOrdinal(int ordinal)
466{
467    Py_UNICODE s[1];
468
469#ifdef Py_UNICODE_WIDE
470    if (ordinal < 0 || ordinal > 0x10ffff) {
471	PyErr_SetString(PyExc_ValueError,
472			"unichr() arg not in range(0x110000) "
473			"(wide Python build)");
474	return NULL;
475    }
476#else
477    if (ordinal < 0 || ordinal > 0xffff) {
478	PyErr_SetString(PyExc_ValueError,
479			"unichr() arg not in range(0x10000) "
480			"(narrow Python build)");
481	return NULL;
482    }
483#endif
484
485    s[0] = (Py_UNICODE)ordinal;
486    return PyUnicode_FromUnicode(s, 1);
487}
488
489PyObject *PyUnicode_FromObject(register PyObject *obj)
490{
491    /* XXX Perhaps we should make this API an alias of
492           PyObject_Unicode() instead ?! */
493    if (PyUnicode_CheckExact(obj)) {
494	Py_INCREF(obj);
495	return obj;
496    }
497    if (PyUnicode_Check(obj)) {
498	/* For a Unicode subtype that's not a Unicode object,
499	   return a true Unicode object with the same data. */
500	return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
501				     PyUnicode_GET_SIZE(obj));
502    }
503    return PyUnicode_FromEncodedObject(obj, NULL, "strict");
504}
505
506PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
507				      const char *encoding,
508				      const char *errors)
509{
510    const char *s = NULL;
511    Py_ssize_t len;
512    PyObject *v;
513
514    if (obj == NULL) {
515	PyErr_BadInternalCall();
516	return NULL;
517    }
518
519#if 0
520    /* For b/w compatibility we also accept Unicode objects provided
521       that no encodings is given and then redirect to
522       PyObject_Unicode() which then applies the additional logic for
523       Unicode subclasses.
524
525       NOTE: This API should really only be used for object which
526             represent *encoded* Unicode !
527
528    */
529	if (PyUnicode_Check(obj)) {
530	    if (encoding) {
531		PyErr_SetString(PyExc_TypeError,
532				"decoding Unicode is not supported");
533	    return NULL;
534	    }
535	return PyObject_Unicode(obj);
536	    }
537#else
538    if (PyUnicode_Check(obj)) {
539	PyErr_SetString(PyExc_TypeError,
540			"decoding Unicode is not supported");
541	return NULL;
542	}
543#endif
544
545    /* Coerce object */
546    if (PyString_Check(obj)) {
547	    s = PyString_AS_STRING(obj);
548	    len = PyString_GET_SIZE(obj);
549	    }
550    else if (PyObject_AsCharBuffer(obj, &s, &len)) {
551	/* Overwrite the error message with something more useful in
552	   case of a TypeError. */
553	if (PyErr_ExceptionMatches(PyExc_TypeError))
554	PyErr_Format(PyExc_TypeError,
555			 "coercing to Unicode: need string or buffer, "
556			 "%.80s found",
557		     obj->ob_type->tp_name);
558	goto onError;
559    }
560
561    /* Convert to Unicode */
562    if (len == 0) {
563	Py_INCREF(unicode_empty);
564	v = (PyObject *)unicode_empty;
565    }
566    else
567	v = PyUnicode_Decode(s, len, encoding, errors);
568
569    return v;
570
571 onError:
572    return NULL;
573}
574
575PyObject *PyUnicode_Decode(const char *s,
576			   Py_ssize_t size,
577			   const char *encoding,
578			   const char *errors)
579{
580    PyObject *buffer = NULL, *unicode;
581
582    if (encoding == NULL)
583	encoding = PyUnicode_GetDefaultEncoding();
584
585    /* Shortcuts for common default encodings */
586    if (strcmp(encoding, "utf-8") == 0)
587        return PyUnicode_DecodeUTF8(s, size, errors);
588    else if (strcmp(encoding, "latin-1") == 0)
589        return PyUnicode_DecodeLatin1(s, size, errors);
590#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
591    else if (strcmp(encoding, "mbcs") == 0)
592        return PyUnicode_DecodeMBCS(s, size, errors);
593#endif
594    else if (strcmp(encoding, "ascii") == 0)
595        return PyUnicode_DecodeASCII(s, size, errors);
596
597    /* Decode via the codec registry */
598    buffer = PyBuffer_FromMemory((void *)s, size);
599    if (buffer == NULL)
600        goto onError;
601    unicode = PyCodec_Decode(buffer, encoding, errors);
602    if (unicode == NULL)
603        goto onError;
604    if (!PyUnicode_Check(unicode)) {
605        PyErr_Format(PyExc_TypeError,
606                     "decoder did not return an unicode object (type=%.400s)",
607                     unicode->ob_type->tp_name);
608        Py_DECREF(unicode);
609        goto onError;
610    }
611    Py_DECREF(buffer);
612    return unicode;
613
614 onError:
615    Py_XDECREF(buffer);
616    return NULL;
617}
618
619PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
620                                    const char *encoding,
621                                    const char *errors)
622{
623    PyObject *v;
624
625    if (!PyUnicode_Check(unicode)) {
626        PyErr_BadArgument();
627        goto onError;
628    }
629
630    if (encoding == NULL)
631	encoding = PyUnicode_GetDefaultEncoding();
632
633    /* Decode via the codec registry */
634    v = PyCodec_Decode(unicode, encoding, errors);
635    if (v == NULL)
636        goto onError;
637    return v;
638
639 onError:
640    return NULL;
641}
642
643PyObject *PyUnicode_Encode(const Py_UNICODE *s,
644			   Py_ssize_t size,
645			   const char *encoding,
646			   const char *errors)
647{
648    PyObject *v, *unicode;
649
650    unicode = PyUnicode_FromUnicode(s, size);
651    if (unicode == NULL)
652	return NULL;
653    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
654    Py_DECREF(unicode);
655    return v;
656}
657
658PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
659                                    const char *encoding,
660                                    const char *errors)
661{
662    PyObject *v;
663
664    if (!PyUnicode_Check(unicode)) {
665        PyErr_BadArgument();
666        goto onError;
667    }
668
669    if (encoding == NULL)
670	encoding = PyUnicode_GetDefaultEncoding();
671
672    /* Encode via the codec registry */
673    v = PyCodec_Encode(unicode, encoding, errors);
674    if (v == NULL)
675        goto onError;
676    return v;
677
678 onError:
679    return NULL;
680}
681
682PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
683                                    const char *encoding,
684                                    const char *errors)
685{
686    PyObject *v;
687
688    if (!PyUnicode_Check(unicode)) {
689        PyErr_BadArgument();
690        goto onError;
691    }
692
693    if (encoding == NULL)
694	encoding = PyUnicode_GetDefaultEncoding();
695
696    /* Shortcuts for common default encodings */
697    if (errors == NULL) {
698	if (strcmp(encoding, "utf-8") == 0)
699	    return PyUnicode_AsUTF8String(unicode);
700	else if (strcmp(encoding, "latin-1") == 0)
701	    return PyUnicode_AsLatin1String(unicode);
702#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
703	else if (strcmp(encoding, "mbcs") == 0)
704	    return PyUnicode_AsMBCSString(unicode);
705#endif
706	else if (strcmp(encoding, "ascii") == 0)
707	    return PyUnicode_AsASCIIString(unicode);
708    }
709
710    /* Encode via the codec registry */
711    v = PyCodec_Encode(unicode, encoding, errors);
712    if (v == NULL)
713        goto onError;
714    if (!PyString_Check(v)) {
715        PyErr_Format(PyExc_TypeError,
716                     "encoder did not return a string object (type=%.400s)",
717                     v->ob_type->tp_name);
718        Py_DECREF(v);
719        goto onError;
720    }
721    return v;
722
723 onError:
724    return NULL;
725}
726
727PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
728					    const char *errors)
729{
730    PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
731
732    if (v)
733        return v;
734    v = PyUnicode_AsEncodedString(unicode, NULL, errors);
735    if (v && errors == NULL)
736        ((PyUnicodeObject *)unicode)->defenc = v;
737    return v;
738}
739
740Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
741{
742    if (!PyUnicode_Check(unicode)) {
743        PyErr_BadArgument();
744        goto onError;
745    }
746    return PyUnicode_AS_UNICODE(unicode);
747
748 onError:
749    return NULL;
750}
751
752Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
753{
754    if (!PyUnicode_Check(unicode)) {
755        PyErr_BadArgument();
756        goto onError;
757    }
758    return PyUnicode_GET_SIZE(unicode);
759
760 onError:
761    return -1;
762}
763
764const char *PyUnicode_GetDefaultEncoding(void)
765{
766    return unicode_default_encoding;
767}
768
769int PyUnicode_SetDefaultEncoding(const char *encoding)
770{
771    PyObject *v;
772
773    /* Make sure the encoding is valid. As side effect, this also
774       loads the encoding into the codec registry cache. */
775    v = _PyCodec_Lookup(encoding);
776    if (v == NULL)
777	goto onError;
778    Py_DECREF(v);
779    strncpy(unicode_default_encoding,
780	    encoding,
781	    sizeof(unicode_default_encoding));
782    return 0;
783
784 onError:
785    return -1;
786}
787
788/* error handling callback helper:
789   build arguments, call the callback and check the arguments,
790   if no exception occurred, copy the replacement to the output
791   and adjust various state variables.
792   return 0 on success, -1 on error
793*/
794
795static
796int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
797                 const char *encoding, const char *reason,
798                 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
799                 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
800{
801    static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
802
803    PyObject *restuple = NULL;
804    PyObject *repunicode = NULL;
805    Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
806    Py_ssize_t requiredsize;
807    Py_ssize_t newpos;
808    Py_UNICODE *repptr;
809    Py_ssize_t repsize;
810    int res = -1;
811
812    if (*errorHandler == NULL) {
813	*errorHandler = PyCodec_LookupError(errors);
814	if (*errorHandler == NULL)
815	   goto onError;
816    }
817
818    if (*exceptionObject == NULL) {
819    	*exceptionObject = PyUnicodeDecodeError_Create(
820	    encoding, input, insize, *startinpos, *endinpos, reason);
821	if (*exceptionObject == NULL)
822	   goto onError;
823    }
824    else {
825	if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
826	    goto onError;
827	if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
828	    goto onError;
829	if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
830	    goto onError;
831    }
832
833    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
834    if (restuple == NULL)
835	goto onError;
836    if (!PyTuple_Check(restuple)) {
837	PyErr_Format(PyExc_TypeError, &argparse[4]);
838	goto onError;
839    }
840    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
841	goto onError;
842    if (newpos<0)
843	newpos = insize+newpos;
844    if (newpos<0 || newpos>insize) {
845	PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
846	goto onError;
847    }
848
849    /* need more space? (at least enough for what we
850       have+the replacement+the rest of the string (starting
851       at the new input position), so we won't have to check space
852       when there are no errors in the rest of the string) */
853    repptr = PyUnicode_AS_UNICODE(repunicode);
854    repsize = PyUnicode_GET_SIZE(repunicode);
855    requiredsize = *outpos + repsize + insize-newpos;
856    if (requiredsize > outsize) {
857	if (requiredsize<2*outsize)
858	    requiredsize = 2*outsize;
859	if (PyUnicode_Resize(output, requiredsize) < 0)
860	    goto onError;
861	*outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
862    }
863    *endinpos = newpos;
864    *inptr = input + newpos;
865    Py_UNICODE_COPY(*outptr, repptr, repsize);
866    *outptr += repsize;
867    *outpos += repsize;
868    /* we made it! */
869    res = 0;
870
871    onError:
872    Py_XDECREF(restuple);
873    return res;
874}
875
876/* --- UTF-7 Codec -------------------------------------------------------- */
877
878/* see RFC2152 for details */
879
static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
	   0 - not special
	   1 - special
	   2 - whitespace (optional)
	   3 - RFC2152 Set O (optional)

       The table is indexed by character code (see the SPECIAL macro);
       each row below holds 16 consecutive codes, starting at 0x00. */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
898
/* SPECIAL(c, encodeO, encodeWS) is true when character c cannot be written
   directly in UTF-7: it lies outside 1..127, its utf7_special class is 1,
   or its class is 2 (whitespace) / 3 (Set O) and the matching encodeWS /
   encodeO flag is set.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true  */

#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))
908
/* Character of the base64 alphabet for the low 6 bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* True when c belongs to the base64 alphabet.  Written with explicit range
   checks instead of isalnum(): c is a Py_UNICODE at the call sites, and
   passing a value that does not fit an unsigned char to a <ctype.h>
   function is undefined; the result also must not depend on the locale.
   Like UB64 below, this assumes an ASCII-compatible character set. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || (c) == '+' || (c) == '/')
/* 6-bit value of the base64 character c; only meaningful when
   B64CHAR(c) is true. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )
916
/* Emit base64 characters through `out` for as long as the bit buffer `ch`
   holds at least 6 pending bits; `bits` is decremented by 6 for every
   character written and `out` is advanced.  Expands to a bare while
   statement and modifies `out` and `bits` in the expanding scope. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }
922
/* Drain complete 16-bit units from the bit buffer `ch` (which holds `bits`
   pending bits) and store them through `out`.

   A unit in the range 0xDC00..0xDFFF is not stored: `surrogate` is set,
   `errmsg` is assigned and control jumps to the `utf7Error` label.  While
   `surrogate` is set, the next unit is dropped and the flag is cleared.
   The macro therefore requires a variable `errmsg` and a label `utf7Error`
   in the expanding function, and it modifies `out`, `bits` and
   `surrogate`.

   NOTE(review): the inner comments speak of the high surrogate, but the
   range tested is 0xDC00..0xDFFF -- confirm this is the intended range. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
941
942PyObject *PyUnicode_DecodeUTF7(const char *s,
943			       Py_ssize_t size,
944			       const char *errors)
945{
946    const char *starts = s;
947    Py_ssize_t startinpos;
948    Py_ssize_t endinpos;
949    Py_ssize_t outpos;
950    const char *e;
951    PyUnicodeObject *unicode;
952    Py_UNICODE *p;
953    const char *errmsg = "";
954    int inShift = 0;
955    unsigned int bitsleft = 0;
956    unsigned long charsleft = 0;
957    int surrogate = 0;
958    PyObject *errorHandler = NULL;
959    PyObject *exc = NULL;
960
961    unicode = _PyUnicode_New(size);
962    if (!unicode)
963        return NULL;
964    if (size == 0)
965        return (PyObject *)unicode;
966
967    p = unicode->str;
968    e = s + size;
969
970    while (s < e) {
971        Py_UNICODE ch;
972        restart:
973        ch = *s;
974
975        if (inShift) {
976            if ((ch == '-') || !B64CHAR(ch)) {
977                inShift = 0;
978                s++;
979
980                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
981                if (bitsleft >= 6) {
982                    /* The shift sequence has a partial character in it. If
983                       bitsleft < 6 then we could just classify it as padding
984                       but that is not the case here */
985
986                    errmsg = "partial character in shift sequence";
987                    goto utf7Error;
988                }
989                /* According to RFC2152 the remaining bits should be zero. We
990                   choose to signal an error/insert a replacement character
991                   here so indicate the potential of a misencoded character. */
992
993                /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
994                if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
995                    errmsg = "non-zero padding bits in shift sequence";
996                    goto utf7Error;
997                }
998
999                if (ch == '-') {
1000                    if ((s < e) && (*(s) == '-')) {
1001                        *p++ = '-';
1002                        inShift = 1;
1003                    }
1004                } else if (SPECIAL(ch,0,0)) {
1005                    errmsg = "unexpected special character";
1006	                goto utf7Error;
1007                } else  {
1008                    *p++ = ch;
1009                }
1010            } else {
1011                charsleft = (charsleft << 6) | UB64(ch);
1012                bitsleft += 6;
1013                s++;
1014                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1015            }
1016        }
1017        else if ( ch == '+' ) {
1018            startinpos = s-starts;
1019            s++;
1020            if (s < e && *s == '-') {
1021                s++;
1022                *p++ = '+';
1023            } else
1024            {
1025                inShift = 1;
1026                bitsleft = 0;
1027            }
1028        }
1029        else if (SPECIAL(ch,0,0)) {
1030            errmsg = "unexpected special character";
1031            s++;
1032	        goto utf7Error;
1033        }
1034        else {
1035            *p++ = ch;
1036            s++;
1037        }
1038        continue;
1039    utf7Error:
1040        outpos = p-PyUnicode_AS_UNICODE(unicode);
1041        endinpos = s-starts;
1042        if (unicode_decode_call_errorhandler(
1043             errors, &errorHandler,
1044             "utf7", errmsg,
1045             starts, size, &startinpos, &endinpos, &exc, &s,
1046             (PyObject **)&unicode, &outpos, &p))
1047        goto onError;
1048    }
1049
1050    if (inShift) {
1051        outpos = p-PyUnicode_AS_UNICODE(unicode);
1052        endinpos = size;
1053        if (unicode_decode_call_errorhandler(
1054             errors, &errorHandler,
1055             "utf7", "unterminated shift sequence",
1056             starts, size, &startinpos, &endinpos, &exc, &s,
1057             (PyObject **)&unicode, &outpos, &p))
1058            goto onError;
1059        if (s < e)
1060           goto restart;
1061    }
1062
1063    if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
1064        goto onError;
1065
1066    Py_XDECREF(errorHandler);
1067    Py_XDECREF(exc);
1068    return (PyObject *)unicode;
1069
1070onError:
1071    Py_XDECREF(errorHandler);
1072    Py_XDECREF(exc);
1073    Py_DECREF(unicode);
1074    return NULL;
1075}
1076
1077
1078PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1079                   Py_ssize_t size,
1080                   int encodeSetO,
1081                   int encodeWhiteSpace,
1082                   const char *errors)
1083{
1084    PyObject *v;
1085    /* It might be possible to tighten this worst case */
1086    Py_ssize_t cbAllocated = 5 * size;
1087    int inShift = 0;
1088    Py_ssize_t i = 0;
1089    unsigned int bitsleft = 0;
1090    unsigned long charsleft = 0;
1091    char * out;
1092    char * start;
1093
1094    if (size == 0)
1095		return PyString_FromStringAndSize(NULL, 0);
1096
1097    v = PyString_FromStringAndSize(NULL, cbAllocated);
1098    if (v == NULL)
1099        return NULL;
1100
1101    start = out = PyString_AS_STRING(v);
1102    for (;i < size; ++i) {
1103        Py_UNICODE ch = s[i];
1104
1105        if (!inShift) {
1106            if (ch == '+') {
1107                *out++ = '+';
1108                *out++ = '-';
1109            } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1110                charsleft = ch;
1111                bitsleft = 16;
1112                *out++ = '+';
1113                /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1114                inShift = bitsleft > 0;
1115            } else {
1116                *out++ = (char) ch;
1117            }
1118        } else {
1119            if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1120                *out++ = B64(charsleft << (6-bitsleft));
1121                charsleft = 0;
1122                bitsleft = 0;
1123                /* Characters not in the BASE64 set implicitly unshift the sequence
1124                   so no '-' is required, except if the character is itself a '-' */
1125                if (B64CHAR(ch) || ch == '-') {
1126                    *out++ = '-';
1127                }
1128                inShift = 0;
1129                *out++ = (char) ch;
1130            } else {
1131                bitsleft += 16;
1132                charsleft = (charsleft << 16) | ch;
1133                /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1134
1135                /* If the next character is special then we dont' need to terminate
1136                   the shift sequence. If the next character is not a BASE64 character
1137                   or '-' then the shift sequence will be terminated implicitly and we
1138                   don't have to insert a '-'. */
1139
1140                if (bitsleft == 0) {
1141                    if (i + 1 < size) {
1142                        Py_UNICODE ch2 = s[i+1];
1143
1144                        if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
1145
1146                        } else if (B64CHAR(ch2) || ch2 == '-') {
1147                            *out++ = '-';
1148                            inShift = 0;
1149                        } else {
1150                            inShift = 0;
1151                        }
1152
1153                    }
1154                    else {
1155                        *out++ = '-';
1156                        inShift = 0;
1157                    }
1158                }
1159            }
1160        }
1161    }
1162    if (bitsleft) {
1163        *out++= B64(charsleft << (6-bitsleft) );
1164        *out++ = '-';
1165    }
1166
1167    _PyString_Resize(&v, out - start);
1168    return v;
1169}
1170
/* Remove the helper macros used by the UTF-7 functions above so that the
   names are not visible to the rest of the file. */
#undef SPECIAL
#undef B64
#undef B64CHAR
#undef UB64
#undef ENCODE
#undef DECODE
1177
1178/* --- UTF-8 Codec -------------------------------------------------------- */
1179
/* Map a UTF-8 lead byte to the total length in bytes of the sequence it
   introduces.  Zero means the byte is not a legal lead byte (it is either
   a continuation byte, 0x80..0xBF, or one of 0xFE/0xFF).  See RFC 2279
   for details.  The table is read-only lookup data. */
static const char utf8_code_length[256] = {
    /* 0x00..0x7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80..0xBF: continuation bytes, never a lead byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0..0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0..0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0..0xFD: four-, five- and six-byte sequences; 0xFE, 0xFF illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1201
1202PyObject *PyUnicode_DecodeUTF8(const char *s,
1203			       Py_ssize_t size,
1204			       const char *errors)
1205{
1206    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1207}
1208
1209PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1210			                Py_ssize_t size,
1211			                const char *errors,
1212			                Py_ssize_t *consumed)
1213{
1214    const char *starts = s;
1215    int n;
1216    Py_ssize_t startinpos;
1217    Py_ssize_t endinpos;
1218    Py_ssize_t outpos;
1219    const char *e;
1220    PyUnicodeObject *unicode;
1221    Py_UNICODE *p;
1222    const char *errmsg = "";
1223    PyObject *errorHandler = NULL;
1224    PyObject *exc = NULL;
1225
1226    /* Note: size will always be longer than the resulting Unicode
1227       character count */
1228    unicode = _PyUnicode_New(size);
1229    if (!unicode)
1230        return NULL;
1231    if (size == 0) {
1232        if (consumed)
1233            *consumed = 0;
1234        return (PyObject *)unicode;
1235    }
1236
1237    /* Unpack UTF-8 encoded data */
1238    p = unicode->str;
1239    e = s + size;
1240
1241    while (s < e) {
1242        Py_UCS4 ch = (unsigned char)*s;
1243
1244        if (ch < 0x80) {
1245            *p++ = (Py_UNICODE)ch;
1246            s++;
1247            continue;
1248        }
1249
1250        n = utf8_code_length[ch];
1251
1252        if (s + n > e) {
1253	    if (consumed)
1254		break;
1255	    else {
1256		errmsg = "unexpected end of data";
1257		startinpos = s-starts;
1258		endinpos = size;
1259		goto utf8Error;
1260	    }
1261	}
1262
1263        switch (n) {
1264
1265        case 0:
1266            errmsg = "unexpected code byte";
1267	    startinpos = s-starts;
1268	    endinpos = startinpos+1;
1269	    goto utf8Error;
1270
1271        case 1:
1272            errmsg = "internal error";
1273	    startinpos = s-starts;
1274	    endinpos = startinpos+1;
1275	    goto utf8Error;
1276
1277        case 2:
1278            if ((s[1] & 0xc0) != 0x80) {
1279                errmsg = "invalid data";
1280		startinpos = s-starts;
1281		endinpos = startinpos+2;
1282		goto utf8Error;
1283	    }
1284            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1285            if (ch < 0x80) {
1286		startinpos = s-starts;
1287		endinpos = startinpos+2;
1288                errmsg = "illegal encoding";
1289		goto utf8Error;
1290	    }
1291	    else
1292		*p++ = (Py_UNICODE)ch;
1293            break;
1294
1295        case 3:
1296            if ((s[1] & 0xc0) != 0x80 ||
1297                (s[2] & 0xc0) != 0x80) {
1298                errmsg = "invalid data";
1299		startinpos = s-starts;
1300		endinpos = startinpos+3;
1301		goto utf8Error;
1302	    }
1303            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
1304            if (ch < 0x0800) {
1305		/* Note: UTF-8 encodings of surrogates are considered
1306		   legal UTF-8 sequences;
1307
1308		   XXX For wide builds (UCS-4) we should probably try
1309		       to recombine the surrogates into a single code
1310		       unit.
1311		*/
1312                errmsg = "illegal encoding";
1313		startinpos = s-starts;
1314		endinpos = startinpos+3;
1315		goto utf8Error;
1316	    }
1317	    else
1318		*p++ = (Py_UNICODE)ch;
1319            break;
1320
1321        case 4:
1322            if ((s[1] & 0xc0) != 0x80 ||
1323                (s[2] & 0xc0) != 0x80 ||
1324                (s[3] & 0xc0) != 0x80) {
1325                errmsg = "invalid data";
1326		startinpos = s-starts;
1327		endinpos = startinpos+4;
1328		goto utf8Error;
1329	    }
1330            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1331                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1332            /* validate and convert to UTF-16 */
1333            if ((ch < 0x10000)        /* minimum value allowed for 4
1334					 byte encoding */
1335                || (ch > 0x10ffff))   /* maximum value allowed for
1336					 UTF-16 */
1337	    {
1338                errmsg = "illegal encoding";
1339		startinpos = s-starts;
1340		endinpos = startinpos+4;
1341		goto utf8Error;
1342	    }
1343#ifdef Py_UNICODE_WIDE
1344	    *p++ = (Py_UNICODE)ch;
1345#else
1346            /*  compute and append the two surrogates: */
1347
1348            /*  translate from 10000..10FFFF to 0..FFFF */
1349            ch -= 0x10000;
1350
1351            /*  high surrogate = top 10 bits added to D800 */
1352            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1353
1354            /*  low surrogate = bottom 10 bits added to DC00 */
1355            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
1356#endif
1357            break;
1358
1359        default:
1360            /* Other sizes are only needed for UCS-4 */
1361            errmsg = "unsupported Unicode code range";
1362	    startinpos = s-starts;
1363	    endinpos = startinpos+n;
1364	    goto utf8Error;
1365        }
1366        s += n;
1367	continue;
1368
1369    utf8Error:
1370    outpos = p-PyUnicode_AS_UNICODE(unicode);
1371    if (unicode_decode_call_errorhandler(
1372	     errors, &errorHandler,
1373	     "utf8", errmsg,
1374	     starts, size, &startinpos, &endinpos, &exc, &s,
1375	     (PyObject **)&unicode, &outpos, &p))
1376	goto onError;
1377    }
1378    if (consumed)
1379	*consumed = s-starts;
1380
1381    /* Adjust length */
1382    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
1383        goto onError;
1384
1385    Py_XDECREF(errorHandler);
1386    Py_XDECREF(exc);
1387    return (PyObject *)unicode;
1388
1389onError:
1390    Py_XDECREF(errorHandler);
1391    Py_XDECREF(exc);
1392    Py_DECREF(unicode);
1393    return NULL;
1394}
1395
1396/* Allocation strategy:  if the string is short, convert into a stack buffer
1397   and allocate exactly as much space needed at the end.  Else allocate the
1398   maximum possible needed (4 result bytes per Unicode character), and return
1399   the excess memory at the end.
1400*/
1401PyObject *
1402PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1403		     Py_ssize_t size,
1404		     const char *errors)
1405{
1406#define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
1407
1408    Py_ssize_t i;           /* index into s of next input byte */
1409    PyObject *v;        /* result string object */
1410    char *p;            /* next free byte in output buffer */
1411    Py_ssize_t nallocated;  /* number of result bytes allocated */
1412    Py_ssize_t nneeded;        /* number of result bytes needed */
1413    char stackbuf[MAX_SHORT_UNICHARS * 4];
1414
1415    assert(s != NULL);
1416    assert(size >= 0);
1417
1418    if (size <= MAX_SHORT_UNICHARS) {
1419        /* Write into the stack buffer; nallocated can't overflow.
1420         * At the end, we'll allocate exactly as much heap space as it
1421         * turns out we need.
1422         */
1423        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1424        v = NULL;   /* will allocate after we're done */
1425        p = stackbuf;
1426    }
1427    else {
1428        /* Overallocate on the heap, and give the excess back at the end. */
1429        nallocated = size * 4;
1430        if (nallocated / 4 != size)  /* overflow! */
1431            return PyErr_NoMemory();
1432        v = PyString_FromStringAndSize(NULL, nallocated);
1433        if (v == NULL)
1434            return NULL;
1435        p = PyString_AS_STRING(v);
1436    }
1437
1438    for (i = 0; i < size;) {
1439        Py_UCS4 ch = s[i++];
1440
1441        if (ch < 0x80)
1442            /* Encode ASCII */
1443            *p++ = (char) ch;
1444
1445        else if (ch < 0x0800) {
1446            /* Encode Latin-1 */
1447            *p++ = (char)(0xc0 | (ch >> 6));
1448            *p++ = (char)(0x80 | (ch & 0x3f));
1449        }
1450        else {
1451            /* Encode UCS2 Unicode ordinals */
1452            if (ch < 0x10000) {
1453                /* Special case: check for high surrogate */
1454                if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1455                    Py_UCS4 ch2 = s[i];
1456                    /* Check for low surrogate and combine the two to
1457                       form a UCS4 value */
1458                    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1459                        ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
1460                        i++;
1461                        goto encodeUCS4;
1462                    }
1463                    /* Fall through: handles isolated high surrogates */
1464                }
1465                *p++ = (char)(0xe0 | (ch >> 12));
1466                *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1467                *p++ = (char)(0x80 | (ch & 0x3f));
1468                continue;
1469    	    }
1470encodeUCS4:
1471            /* Encode UCS4 Unicode ordinals */
1472            *p++ = (char)(0xf0 | (ch >> 18));
1473            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1474            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1475            *p++ = (char)(0x80 | (ch & 0x3f));
1476        }
1477    }
1478
1479    if (v == NULL) {
1480        /* This was stack allocated. */
1481        nneeded = p - stackbuf;
1482        assert(nneeded <= nallocated);
1483        v = PyString_FromStringAndSize(stackbuf, nneeded);
1484    }
1485    else {
1486    	/* Cut back to size actually needed. */
1487        nneeded = p - PyString_AS_STRING(v);
1488        assert(nneeded <= nallocated);
1489        _PyString_Resize(&v, nneeded);
1490    }
1491    return v;
1492
1493#undef MAX_SHORT_UNICHARS
1494}
1495
1496PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1497{
1498    if (!PyUnicode_Check(unicode)) {
1499        PyErr_BadArgument();
1500        return NULL;
1501    }
1502    return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1503				PyUnicode_GET_SIZE(unicode),
1504				NULL);
1505}
1506
1507/* --- UTF-16 Codec ------------------------------------------------------- */
1508
1509PyObject *
1510PyUnicode_DecodeUTF16(const char *s,
1511		      Py_ssize_t size,
1512		      const char *errors,
1513		      int *byteorder)
1514{
1515    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1516}
1517
1518PyObject *
1519PyUnicode_DecodeUTF16Stateful(const char *s,
1520			      Py_ssize_t size,
1521			      const char *errors,
1522			      int *byteorder,
1523			      Py_ssize_t *consumed)
1524{
1525    const char *starts = s;
1526    Py_ssize_t startinpos;
1527    Py_ssize_t endinpos;
1528    Py_ssize_t outpos;
1529    PyUnicodeObject *unicode;
1530    Py_UNICODE *p;
1531    const unsigned char *q, *e;
1532    int bo = 0;       /* assume native ordering by default */
1533    const char *errmsg = "";
1534    /* Offsets from q for retrieving byte pairs in the right order. */
1535#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1536    int ihi = 1, ilo = 0;
1537#else
1538    int ihi = 0, ilo = 1;
1539#endif
1540    PyObject *errorHandler = NULL;
1541    PyObject *exc = NULL;
1542
1543    /* Note: size will always be longer than the resulting Unicode
1544       character count */
1545    unicode = _PyUnicode_New(size);
1546    if (!unicode)
1547        return NULL;
1548    if (size == 0)
1549        return (PyObject *)unicode;
1550
1551    /* Unpack UTF-16 encoded data */
1552    p = unicode->str;
1553    q = (unsigned char *)s;
1554    e = q + size;
1555
1556    if (byteorder)
1557        bo = *byteorder;
1558
1559    /* Check for BOM marks (U+FEFF) in the input and adjust current
1560       byte order setting accordingly. In native mode, the leading BOM
1561       mark is skipped, in all other modes, it is copied to the output
1562       stream as-is (giving a ZWNBSP character). */
1563    if (bo == 0) {
1564        if (size >= 2) {
1565            const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
1566#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1567	    if (bom == 0xFEFF) {
1568		q += 2;
1569		bo = -1;
1570	    }
1571	    else if (bom == 0xFFFE) {
1572		q += 2;
1573		bo = 1;
1574	    }
1575#else
1576	    if (bom == 0xFEFF) {
1577		q += 2;
1578		bo = 1;
1579	    }
1580	    else if (bom == 0xFFFE) {
1581		q += 2;
1582		bo = -1;
1583	    }
1584#endif
1585	}
1586    }
1587
1588    if (bo == -1) {
1589        /* force LE */
1590        ihi = 1;
1591        ilo = 0;
1592    }
1593    else if (bo == 1) {
1594        /* force BE */
1595        ihi = 0;
1596        ilo = 1;
1597    }
1598
1599    while (q < e) {
1600	Py_UNICODE ch;
1601	/* remaining bytes at the end? (size should be even) */
1602	if (e-q<2) {
1603	    if (consumed)
1604		break;
1605	    errmsg = "truncated data";
1606	    startinpos = ((const char *)q)-starts;
1607	    endinpos = ((const char *)e)-starts;
1608	    goto utf16Error;
1609	    /* The remaining input chars are ignored if the callback
1610	       chooses to skip the input */
1611	}
1612	ch = (q[ihi] << 8) | q[ilo];
1613
1614	q += 2;
1615
1616	if (ch < 0xD800 || ch > 0xDFFF) {
1617	    *p++ = ch;
1618	    continue;
1619	}
1620
1621	/* UTF-16 code pair: */
1622	if (q >= e) {
1623	    errmsg = "unexpected end of data";
1624	    startinpos = (((const char *)q)-2)-starts;
1625	    endinpos = ((const char *)e)-starts;
1626	    goto utf16Error;
1627	}
1628	if (0xD800 <= ch && ch <= 0xDBFF) {
1629	    Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1630	    q += 2;
1631	    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1632#ifndef Py_UNICODE_WIDE
1633		*p++ = ch;
1634		*p++ = ch2;
1635#else
1636		*p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
1637#endif
1638		continue;
1639	    }
1640	    else {
1641                errmsg = "illegal UTF-16 surrogate";
1642		startinpos = (((const char *)q)-4)-starts;
1643		endinpos = startinpos+2;
1644		goto utf16Error;
1645	    }
1646
1647	}
1648	errmsg = "illegal encoding";
1649	startinpos = (((const char *)q)-2)-starts;
1650	endinpos = startinpos+2;
1651	/* Fall through to report the error */
1652
1653    utf16Error:
1654	outpos = p-PyUnicode_AS_UNICODE(unicode);
1655	if (unicode_decode_call_errorhandler(
1656	         errors, &errorHandler,
1657	         "utf16", errmsg,
1658	         starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1659	         (PyObject **)&unicode, &outpos, &p))
1660	    goto onError;
1661    }
1662
1663    if (byteorder)
1664        *byteorder = bo;
1665
1666    if (consumed)
1667	*consumed = (const char *)q-starts;
1668
1669    /* Adjust length */
1670    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
1671        goto onError;
1672
1673    Py_XDECREF(errorHandler);
1674    Py_XDECREF(exc);
1675    return (PyObject *)unicode;
1676
1677onError:
1678    Py_DECREF(unicode);
1679    Py_XDECREF(errorHandler);
1680    Py_XDECREF(exc);
1681    return NULL;
1682}
1683
1684PyObject *
1685PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1686		      Py_ssize_t size,
1687		      const char *errors,
1688		      int byteorder)
1689{
1690    PyObject *v;
1691    unsigned char *p;
1692#ifdef Py_UNICODE_WIDE
1693    int i, pairs;
1694#else
1695    const int pairs = 0;
1696#endif
1697    /* Offsets from p for storing byte pairs in the right order. */
1698#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1699    int ihi = 1, ilo = 0;
1700#else
1701    int ihi = 0, ilo = 1;
1702#endif
1703
1704#define STORECHAR(CH)                   \
1705    do {                                \
1706        p[ihi] = ((CH) >> 8) & 0xff;    \
1707        p[ilo] = (CH) & 0xff;           \
1708        p += 2;                         \
1709    } while(0)
1710
1711#ifdef Py_UNICODE_WIDE
1712    for (i = pairs = 0; i < size; i++)
1713	if (s[i] >= 0x10000)
1714	    pairs++;
1715#endif
1716    v = PyString_FromStringAndSize(NULL,
1717		  2 * (size + pairs + (byteorder == 0)));
1718    if (v == NULL)
1719        return NULL;
1720
1721    p = (unsigned char *)PyString_AS_STRING(v);
1722    if (byteorder == 0)
1723	STORECHAR(0xFEFF);
1724    if (size == 0)
1725        return v;
1726
1727    if (byteorder == -1) {
1728        /* force LE */
1729        ihi = 1;
1730        ilo = 0;
1731    }
1732    else if (byteorder == 1) {
1733        /* force BE */
1734        ihi = 0;
1735        ilo = 1;
1736    }
1737
1738    while (size-- > 0) {
1739	Py_UNICODE ch = *s++;
1740	Py_UNICODE ch2 = 0;
1741#ifdef Py_UNICODE_WIDE
1742	if (ch >= 0x10000) {
1743	    ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1744	    ch  = 0xD800 | ((ch-0x10000) >> 10);
1745	}
1746#endif
1747        STORECHAR(ch);
1748        if (ch2)
1749            STORECHAR(ch2);
1750    }
1751    return v;
1752#undef STORECHAR
1753}
1754
1755PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1756{
1757    if (!PyUnicode_Check(unicode)) {
1758        PyErr_BadArgument();
1759        return NULL;
1760    }
1761    return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1762				 PyUnicode_GET_SIZE(unicode),
1763				 NULL,
1764				 0);
1765}
1766
1767/* --- Unicode Escape Codec ----------------------------------------------- */
1768
/* Cached pointer to the "ucnhash_CAPI" object exported by the unicodedata
   module.  It starts out NULL and is filled in by
   PyUnicode_DecodeUnicodeEscape the first time a \N escape is decoded. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
1770
1771PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1772					Py_ssize_t size,
1773					const char *errors)
1774{
1775    const char *starts = s;
1776    Py_ssize_t startinpos;
1777    Py_ssize_t endinpos;
1778    Py_ssize_t outpos;
1779    int i;
1780    PyUnicodeObject *v;
1781    Py_UNICODE *p;
1782    const char *end;
1783    char* message;
1784    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1785    PyObject *errorHandler = NULL;
1786    PyObject *exc = NULL;
1787
1788    /* Escaped strings will always be longer than the resulting
1789       Unicode string, so we start with size here and then reduce the
1790       length after conversion to the true value.
1791       (but if the error callback returns a long replacement string
1792       we'll have to allocate more space) */
1793    v = _PyUnicode_New(size);
1794    if (v == NULL)
1795        goto onError;
1796    if (size == 0)
1797        return (PyObject *)v;
1798
1799    p = PyUnicode_AS_UNICODE(v);
1800    end = s + size;
1801
1802    while (s < end) {
1803        unsigned char c;
1804        Py_UNICODE x;
1805        int digits;
1806
1807        /* Non-escape characters are interpreted as Unicode ordinals */
1808        if (*s != '\\') {
1809            *p++ = (unsigned char) *s++;
1810            continue;
1811        }
1812
1813        startinpos = s-starts;
1814        /* \ - Escapes */
1815        s++;
1816        switch (*s++) {
1817
1818        /* \x escapes */
1819        case '\n': break;
1820        case '\\': *p++ = '\\'; break;
1821        case '\'': *p++ = '\''; break;
1822        case '\"': *p++ = '\"'; break;
1823        case 'b': *p++ = '\b'; break;
1824        case 'f': *p++ = '\014'; break; /* FF */
1825        case 't': *p++ = '\t'; break;
1826        case 'n': *p++ = '\n'; break;
1827        case 'r': *p++ = '\r'; break;
1828        case 'v': *p++ = '\013'; break; /* VT */
1829        case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1830
1831        /* \OOO (octal) escapes */
1832        case '0': case '1': case '2': case '3':
1833        case '4': case '5': case '6': case '7':
1834            x = s[-1] - '0';
1835            if ('0' <= *s && *s <= '7') {
1836                x = (x<<3) + *s++ - '0';
1837                if ('0' <= *s && *s <= '7')
1838                    x = (x<<3) + *s++ - '0';
1839            }
1840            *p++ = x;
1841            break;
1842
1843        /* hex escapes */
1844        /* \xXX */
1845        case 'x':
1846            digits = 2;
1847            message = "truncated \\xXX escape";
1848            goto hexescape;
1849
1850        /* \uXXXX */
1851        case 'u':
1852            digits = 4;
1853            message = "truncated \\uXXXX escape";
1854            goto hexescape;
1855
1856        /* \UXXXXXXXX */
1857        case 'U':
1858            digits = 8;
1859            message = "truncated \\UXXXXXXXX escape";
1860        hexescape:
1861            chr = 0;
1862            outpos = p-PyUnicode_AS_UNICODE(v);
1863            if (s+digits>end) {
1864                endinpos = size;
1865                if (unicode_decode_call_errorhandler(
1866                    errors, &errorHandler,
1867                    "unicodeescape", "end of string in escape sequence",
1868                    starts, size, &startinpos, &endinpos, &exc, &s,
1869                    (PyObject **)&v, &outpos, &p))
1870                    goto onError;
1871                goto nextByte;
1872            }
1873            for (i = 0; i < digits; ++i) {
1874                c = (unsigned char) s[i];
1875                if (!isxdigit(c)) {
1876                    endinpos = (s+i+1)-starts;
1877                    if (unicode_decode_call_errorhandler(
1878                        errors, &errorHandler,
1879                        "unicodeescape", message,
1880                        starts, size, &startinpos, &endinpos, &exc, &s,
1881                        (PyObject **)&v, &outpos, &p))
1882                        goto onError;
1883                    goto nextByte;
1884                }
1885                chr = (chr<<4) & ~0xF;
1886                if (c >= '0' && c <= '9')
1887                    chr += c - '0';
1888                else if (c >= 'a' && c <= 'f')
1889                    chr += 10 + c - 'a';
1890                else
1891                    chr += 10 + c - 'A';
1892            }
1893            s += i;
1894            if (chr == 0xffffffff && PyErr_Occurred())
1895                /* _decoding_error will have already written into the
1896                   target buffer. */
1897                break;
1898        store:
1899            /* when we get here, chr is a 32-bit unicode character */
1900            if (chr <= 0xffff)
1901                /* UCS-2 character */
1902                *p++ = (Py_UNICODE) chr;
1903            else if (chr <= 0x10ffff) {
1904                /* UCS-4 character. Either store directly, or as
1905                   surrogate pair. */
1906#ifdef Py_UNICODE_WIDE
1907                *p++ = chr;
1908#else
1909                chr -= 0x10000L;
1910                *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1911                *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
1912#endif
1913            } else {
1914                endinpos = s-starts;
1915                outpos = p-PyUnicode_AS_UNICODE(v);
1916                if (unicode_decode_call_errorhandler(
1917                    errors, &errorHandler,
1918                    "unicodeescape", "illegal Unicode character",
1919                    starts, size, &startinpos, &endinpos, &exc, &s,
1920                    (PyObject **)&v, &outpos, &p))
1921                    goto onError;
1922            }
1923            break;
1924
1925        /* \N{name} */
1926        case 'N':
1927            message = "malformed \\N character escape";
1928            if (ucnhash_CAPI == NULL) {
1929                /* load the unicode data module */
1930                PyObject *m, *api;
1931                m = PyImport_ImportModule("unicodedata");
1932                if (m == NULL)
1933                    goto ucnhashError;
1934                api = PyObject_GetAttrString(m, "ucnhash_CAPI");
1935                Py_DECREF(m);
1936                if (api == NULL)
1937                    goto ucnhashError;
1938                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
1939                Py_DECREF(api);
1940                if (ucnhash_CAPI == NULL)
1941                    goto ucnhashError;
1942            }
1943            if (*s == '{') {
1944                const char *start = s+1;
1945                /* look for the closing brace */
1946                while (*s != '}' && s < end)
1947                    s++;
1948                if (s > start && s < end && *s == '}') {
1949                    /* found a name.  look it up in the unicode database */
1950                    message = "unknown Unicode character name";
1951                    s++;
1952                    if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
1953                        goto store;
1954                }
1955            }
1956            endinpos = s-starts;
1957            outpos = p-PyUnicode_AS_UNICODE(v);
1958            if (unicode_decode_call_errorhandler(
1959                errors, &errorHandler,
1960                "unicodeescape", message,
1961                starts, size, &startinpos, &endinpos, &exc, &s,
1962                (PyObject **)&v, &outpos, &p))
1963                goto onError;
1964            break;
1965
1966        default:
1967            if (s > end) {
1968                message = "\\ at end of string";
1969                s--;
1970                endinpos = s-starts;
1971                outpos = p-PyUnicode_AS_UNICODE(v);
1972                if (unicode_decode_call_errorhandler(
1973                    errors, &errorHandler,
1974                    "unicodeescape", message,
1975                    starts, size, &startinpos, &endinpos, &exc, &s,
1976                    (PyObject **)&v, &outpos, &p))
1977                    goto onError;
1978            }
1979            else {
1980                *p++ = '\\';
1981                *p++ = (unsigned char)s[-1];
1982            }
1983            break;
1984        }
1985        nextByte:
1986        ;
1987    }
1988    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
1989        goto onError;
1990    Py_XDECREF(errorHandler);
1991    Py_XDECREF(exc);
1992    return (PyObject *)v;
1993
1994ucnhashError:
1995    PyErr_SetString(
1996        PyExc_UnicodeError,
1997        "\\N escapes not supported (can't load unicodedata module)"
1998        );
1999    Py_XDECREF(v);
2000    Py_XDECREF(errorHandler);
2001    Py_XDECREF(exc);
2002    return NULL;
2003
2004onError:
2005    Py_XDECREF(v);
2006    Py_XDECREF(errorHandler);
2007    Py_XDECREF(exc);
2008    return NULL;
2009}
2010
/* Helpers for building the Unicode-Escape string version of a Unicode
   object; the work is done by unicodeescape_string() below.

   If quotes is true, the resulting string is enclosed in u"" or u''
   quotes as appropriate.

*/
2017
2018Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2019                                      Py_ssize_t size,
2020                                      Py_UNICODE ch)
2021{
2022    /* like wcschr, but doesn't stop at NULL characters */
2023
2024    while (size-- > 0) {
2025        if (*s == ch)
2026            return s;
2027        s++;
2028    }
2029
2030    return NULL;
2031}
2032
2033static
2034PyObject *unicodeescape_string(const Py_UNICODE *s,
2035                               Py_ssize_t size,
2036                               int quotes)
2037{
2038    PyObject *repr;
2039    char *p;
2040
2041    static const char *hexdigit = "0123456789abcdef";
2042
2043    /* XXX(nnorwitz): rather than over-allocating, it would be
2044       better to choose a different scheme.  Perhaps scan the
2045       first N-chars of the string and allocate based on that size.
2046    */
2047    /* Initial allocation is based on the longest-possible unichr
2048       escape.
2049
2050       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2051       unichr, so in this case it's the longest unichr escape. In
2052       narrow (UTF-16) builds this is five chars per source unichr
2053       since there are two unichrs in the surrogate pair, so in narrow
2054       (UTF-16) builds it's not the longest unichr escape.
2055
2056       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2057       so in the narrow (UTF-16) build case it's the longest unichr
2058       escape.
2059    */
2060
2061    repr = PyString_FromStringAndSize(NULL,
2062        2
2063#ifdef Py_UNICODE_WIDE
2064        + 10*size
2065#else
2066        + 6*size
2067#endif
2068        + 1);
2069    if (repr == NULL)
2070        return NULL;
2071
2072    p = PyString_AS_STRING(repr);
2073
2074    if (quotes) {
2075        *p++ = 'u';
2076        *p++ = (findchar(s, size, '\'') &&
2077                !findchar(s, size, '"')) ? '"' : '\'';
2078    }
2079    while (size-- > 0) {
2080        Py_UNICODE ch = *s++;
2081
2082        /* Escape quotes and backslashes */
2083        if ((quotes &&
2084	     ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
2085            *p++ = '\\';
2086            *p++ = (char) ch;
2087	    continue;
2088        }
2089
2090#ifdef Py_UNICODE_WIDE
2091        /* Map 21-bit characters to '\U00xxxxxx' */
2092        else if (ch >= 0x10000) {
2093            *p++ = '\\';
2094            *p++ = 'U';
2095            *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2096            *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2097            *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2098            *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2099            *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2100            *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2101            *p++ = hexdigit[(ch >> 4) & 0x0000000F];
2102            *p++ = hexdigit[ch & 0x0000000F];
2103	    continue;
2104        }
2105#else
2106	/* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
2107	else if (ch >= 0xD800 && ch < 0xDC00) {
2108	    Py_UNICODE ch2;
2109	    Py_UCS4 ucs;
2110
2111	    ch2 = *s++;
2112	    size--;
2113	    if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2114		ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2115		*p++ = '\\';
2116		*p++ = 'U';
2117		*p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2118		*p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2119		*p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2120		*p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2121		*p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2122		*p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2123		*p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2124		*p++ = hexdigit[ucs & 0x0000000F];
2125		continue;
2126	    }
2127	    /* Fall through: isolated surrogates are copied as-is */
2128	    s--;
2129	    size++;
2130	}
2131#endif
2132
2133        /* Map 16-bit characters to '\uxxxx' */
2134        if (ch >= 256) {
2135            *p++ = '\\';
2136            *p++ = 'u';
2137            *p++ = hexdigit[(ch >> 12) & 0x000F];
2138            *p++ = hexdigit[(ch >> 8) & 0x000F];
2139            *p++ = hexdigit[(ch >> 4) & 0x000F];
2140            *p++ = hexdigit[ch & 0x000F];
2141        }
2142
2143        /* Map special whitespace to '\t', \n', '\r' */
2144        else if (ch == '\t') {
2145            *p++ = '\\';
2146            *p++ = 't';
2147        }
2148        else if (ch == '\n') {
2149            *p++ = '\\';
2150            *p++ = 'n';
2151        }
2152        else if (ch == '\r') {
2153            *p++ = '\\';
2154            *p++ = 'r';
2155        }
2156
2157        /* Map non-printable US ASCII to '\xhh' */
2158        else if (ch < ' ' || ch >= 0x7F) {
2159            *p++ = '\\';
2160            *p++ = 'x';
2161            *p++ = hexdigit[(ch >> 4) & 0x000F];
2162            *p++ = hexdigit[ch & 0x000F];
2163        }
2164
2165        /* Copy everything else as-is */
2166        else
2167            *p++ = (char) ch;
2168    }
2169    if (quotes)
2170        *p++ = PyString_AS_STRING(repr)[1];
2171
2172    *p = '\0';
2173    _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
2174    return repr;
2175}
2176
2177PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2178					Py_ssize_t size)
2179{
2180    return unicodeescape_string(s, size, 0);
2181}
2182
2183PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2184{
2185    if (!PyUnicode_Check(unicode)) {
2186        PyErr_BadArgument();
2187        return NULL;
2188    }
2189    return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2190					 PyUnicode_GET_SIZE(unicode));
2191}
2192
2193/* --- Raw Unicode Escape Codec ------------------------------------------- */
2194
2195PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2196					   Py_ssize_t size,
2197					   const char *errors)
2198{
2199    const char *starts = s;
2200    Py_ssize_t startinpos;
2201    Py_ssize_t endinpos;
2202    Py_ssize_t outpos;
2203    PyUnicodeObject *v;
2204    Py_UNICODE *p;
2205    const char *end;
2206    const char *bs;
2207    PyObject *errorHandler = NULL;
2208    PyObject *exc = NULL;
2209
2210    /* Escaped strings will always be longer than the resulting
2211       Unicode string, so we start with size here and then reduce the
2212       length after conversion to the true value. (But decoding error
2213       handler might have to resize the string) */
2214    v = _PyUnicode_New(size);
2215    if (v == NULL)
2216	goto onError;
2217    if (size == 0)
2218	return (PyObject *)v;
2219    p = PyUnicode_AS_UNICODE(v);
2220    end = s + size;
2221    while (s < end) {
2222	unsigned char c;
2223	Py_UCS4 x;
2224	int i;
2225        int count;
2226
2227	/* Non-escape characters are interpreted as Unicode ordinals */
2228	if (*s != '\\') {
2229	    *p++ = (unsigned char)*s++;
2230	    continue;
2231	}
2232	startinpos = s-starts;
2233
2234	/* \u-escapes are only interpreted iff the number of leading
2235	   backslashes if odd */
2236	bs = s;
2237	for (;s < end;) {
2238	    if (*s != '\\')
2239		break;
2240	    *p++ = (unsigned char)*s++;
2241	}
2242	if (((s - bs) & 1) == 0 ||
2243	    s >= end ||
2244	    (*s != 'u' && *s != 'U')) {
2245	    continue;
2246	}
2247	p--;
2248        count = *s=='u' ? 4 : 8;
2249	s++;
2250
2251	/* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
2252	outpos = p-PyUnicode_AS_UNICODE(v);
2253	for (x = 0, i = 0; i < count; ++i, ++s) {
2254	    c = (unsigned char)*s;
2255	    if (!isxdigit(c)) {
2256		endinpos = s-starts;
2257		if (unicode_decode_call_errorhandler(
2258		    errors, &errorHandler,
2259		    "rawunicodeescape", "truncated \\uXXXX",
2260		    starts, size, &startinpos, &endinpos, &exc, &s,
2261		    (PyObject **)&v, &outpos, &p))
2262		    goto onError;
2263		goto nextByte;
2264	    }
2265	    x = (x<<4) & ~0xF;
2266	    if (c >= '0' && c <= '9')
2267		x += c - '0';
2268	    else if (c >= 'a' && c <= 'f')
2269		x += 10 + c - 'a';
2270	    else
2271		x += 10 + c - 'A';
2272	}
2273#ifndef Py_UNICODE_WIDE
2274        if (x > 0x10000) {
2275            if (unicode_decode_call_errorhandler(
2276                    errors, &errorHandler,
2277                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
2278		    starts, size, &startinpos, &endinpos, &exc, &s,
2279		    (PyObject **)&v, &outpos, &p))
2280		    goto onError;
2281        }
2282#endif
2283	*p++ = x;
2284	nextByte:
2285	;
2286    }
2287    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2288	goto onError;
2289    Py_XDECREF(errorHandler);
2290    Py_XDECREF(exc);
2291    return (PyObject *)v;
2292
2293 onError:
2294    Py_XDECREF(v);
2295    Py_XDECREF(errorHandler);
2296    Py_XDECREF(exc);
2297    return NULL;
2298}
2299
2300PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2301					   Py_ssize_t size)
2302{
2303    PyObject *repr;
2304    char *p;
2305    char *q;
2306
2307    static const char *hexdigit = "0123456789abcdef";
2308
2309#ifdef Py_UNICODE_WIDE
2310    repr = PyString_FromStringAndSize(NULL, 10 * size);
2311#else
2312    repr = PyString_FromStringAndSize(NULL, 6 * size);
2313#endif
2314    if (repr == NULL)
2315        return NULL;
2316    if (size == 0)
2317	return repr;
2318
2319    p = q = PyString_AS_STRING(repr);
2320    while (size-- > 0) {
2321        Py_UNICODE ch = *s++;
2322#ifdef Py_UNICODE_WIDE
2323	/* Map 32-bit characters to '\Uxxxxxxxx' */
2324	if (ch >= 0x10000) {
2325            *p++ = '\\';
2326            *p++ = 'U';
2327            *p++ = hexdigit[(ch >> 28) & 0xf];
2328            *p++ = hexdigit[(ch >> 24) & 0xf];
2329            *p++ = hexdigit[(ch >> 20) & 0xf];
2330            *p++ = hexdigit[(ch >> 16) & 0xf];
2331            *p++ = hexdigit[(ch >> 12) & 0xf];
2332            *p++ = hexdigit[(ch >> 8) & 0xf];
2333            *p++ = hexdigit[(ch >> 4) & 0xf];
2334            *p++ = hexdigit[ch & 15];
2335        }
2336        else
2337#endif
2338	/* Map 16-bit characters to '\uxxxx' */
2339	if (ch >= 256) {
2340            *p++ = '\\';
2341            *p++ = 'u';
2342            *p++ = hexdigit[(ch >> 12) & 0xf];
2343            *p++ = hexdigit[(ch >> 8) & 0xf];
2344            *p++ = hexdigit[(ch >> 4) & 0xf];
2345            *p++ = hexdigit[ch & 15];
2346        }
2347	/* Copy everything else as-is */
2348	else
2349            *p++ = (char) ch;
2350    }
2351    *p = '\0';
2352    _PyString_Resize(&repr, p - q);
2353    return repr;
2354}
2355
2356PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2357{
2358    if (!PyUnicode_Check(unicode)) {
2359	PyErr_BadArgument();
2360	return NULL;
2361    }
2362    return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2363					    PyUnicode_GET_SIZE(unicode));
2364}
2365
2366/* --- Unicode Internal Codec ------------------------------------------- */
2367
2368PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
2369					   Py_ssize_t size,
2370					   const char *errors)
2371{
2372    const char *starts = s;
2373    Py_ssize_t startinpos;
2374    Py_ssize_t endinpos;
2375    Py_ssize_t outpos;
2376    PyUnicodeObject *v;
2377    Py_UNICODE *p;
2378    const char *end;
2379    const char *reason;
2380    PyObject *errorHandler = NULL;
2381    PyObject *exc = NULL;
2382
2383#ifdef Py_UNICODE_WIDE
2384    Py_UNICODE unimax = PyUnicode_GetMax();
2385#endif
2386
2387    /* XXX overflow detection missing */
2388    v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2389    if (v == NULL)
2390	goto onError;
2391    if (PyUnicode_GetSize((PyObject *)v) == 0)
2392	return (PyObject *)v;
2393    p = PyUnicode_AS_UNICODE(v);
2394    end = s + size;
2395
2396    while (s < end) {
2397        memcpy(p, s, sizeof(Py_UNICODE));
2398        /* We have to sanity check the raw data, otherwise doom looms for
2399           some malformed UCS-4 data. */
2400        if (
2401            #ifdef Py_UNICODE_WIDE
2402            *p > unimax || *p < 0 ||
2403            #endif
2404            end-s < Py_UNICODE_SIZE
2405            )
2406            {
2407            startinpos = s - starts;
2408            if (end-s < Py_UNICODE_SIZE) {
2409                endinpos = end-starts;
2410                reason = "truncated input";
2411            }
2412            else {
2413                endinpos = s - starts + Py_UNICODE_SIZE;
2414                reason = "illegal code point (> 0x10FFFF)";
2415            }
2416            outpos = p - PyUnicode_AS_UNICODE(v);
2417            if (unicode_decode_call_errorhandler(
2418                    errors, &errorHandler,
2419                    "unicode_internal", reason,
2420                    starts, size, &startinpos, &endinpos, &exc, &s,
2421                    (PyObject **)&v, &outpos, &p)) {
2422                goto onError;
2423            }
2424        }
2425        else {
2426            p++;
2427            s += Py_UNICODE_SIZE;
2428        }
2429    }
2430
2431    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2432        goto onError;
2433    Py_XDECREF(errorHandler);
2434    Py_XDECREF(exc);
2435    return (PyObject *)v;
2436
2437 onError:
2438    Py_XDECREF(v);
2439    Py_XDECREF(errorHandler);
2440    Py_XDECREF(exc);
2441    return NULL;
2442}
2443
2444/* --- Latin-1 Codec ------------------------------------------------------ */
2445
2446PyObject *PyUnicode_DecodeLatin1(const char *s,
2447				 Py_ssize_t size,
2448				 const char *errors)
2449{
2450    PyUnicodeObject *v;
2451    Py_UNICODE *p;
2452
2453    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
2454    if (size == 1) {
2455	Py_UNICODE r = *(unsigned char*)s;
2456	return PyUnicode_FromUnicode(&r, 1);
2457    }
2458
2459    v = _PyUnicode_New(size);
2460    if (v == NULL)
2461	goto onError;
2462    if (size == 0)
2463	return (PyObject *)v;
2464    p = PyUnicode_AS_UNICODE(v);
2465    while (size-- > 0)
2466	*p++ = (unsigned char)*s++;
2467    return (PyObject *)v;
2468
2469 onError:
2470    Py_XDECREF(v);
2471    return NULL;
2472}
2473
2474/* create or adjust a UnicodeEncodeError */
2475static void make_encode_exception(PyObject **exceptionObject,
2476    const char *encoding,
2477    const Py_UNICODE *unicode, Py_ssize_t size,
2478    Py_ssize_t startpos, Py_ssize_t endpos,
2479    const char *reason)
2480{
2481    if (*exceptionObject == NULL) {
2482	*exceptionObject = PyUnicodeEncodeError_Create(
2483	    encoding, unicode, size, startpos, endpos, reason);
2484    }
2485    else {
2486	if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2487	    goto onError;
2488	if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2489	    goto onError;
2490	if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2491	    goto onError;
2492	return;
2493	onError:
2494	Py_DECREF(*exceptionObject);
2495	*exceptionObject = NULL;
2496    }
2497}
2498
2499/* raises a UnicodeEncodeError */
2500static void raise_encode_exception(PyObject **exceptionObject,
2501    const char *encoding,
2502    const Py_UNICODE *unicode, Py_ssize_t size,
2503    Py_ssize_t startpos, Py_ssize_t endpos,
2504    const char *reason)
2505{
2506    make_encode_exception(exceptionObject,
2507	encoding, unicode, size, startpos, endpos, reason);
2508    if (*exceptionObject != NULL)
2509	PyCodec_StrictErrors(*exceptionObject);
2510}
2511
2512/* error handling callback helper:
2513   build arguments, call the callback and check the arguments,
2514   put the result into newpos and return the replacement string, which
2515   has to be freed by the caller */
2516static PyObject *unicode_encode_call_errorhandler(const char *errors,
2517    PyObject **errorHandler,
2518    const char *encoding, const char *reason,
2519    const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2520    Py_ssize_t startpos, Py_ssize_t endpos,
2521    Py_ssize_t *newpos)
2522{
2523    static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
2524
2525    PyObject *restuple;
2526    PyObject *resunicode;
2527
2528    if (*errorHandler == NULL) {
2529	*errorHandler = PyCodec_LookupError(errors);
2530        if (*errorHandler == NULL)
2531	    return NULL;
2532    }
2533
2534    make_encode_exception(exceptionObject,
2535	encoding, unicode, size, startpos, endpos, reason);
2536    if (*exceptionObject == NULL)
2537	return NULL;
2538
2539    restuple = PyObject_CallFunctionObjArgs(
2540	*errorHandler, *exceptionObject, NULL);
2541    if (restuple == NULL)
2542	return NULL;
2543    if (!PyTuple_Check(restuple)) {
2544	PyErr_Format(PyExc_TypeError, &argparse[4]);
2545	Py_DECREF(restuple);
2546	return NULL;
2547    }
2548    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2549	&resunicode, newpos)) {
2550	Py_DECREF(restuple);
2551	return NULL;
2552    }
2553    if (*newpos<0)
2554	*newpos = size+*newpos;
2555    if (*newpos<0 || *newpos>size) {
2556	PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
2557	Py_DECREF(restuple);
2558	return NULL;
2559    }
2560    Py_INCREF(resunicode);
2561    Py_DECREF(restuple);
2562    return resunicode;
2563}
2564
2565static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2566				 Py_ssize_t size,
2567				 const char *errors,
2568				 int limit)
2569{
2570    /* output object */
2571    PyObject *res;
2572    /* pointers to the beginning and end+1 of input */
2573    const Py_UNICODE *startp = p;
2574    const Py_UNICODE *endp = p + size;
2575    /* pointer to the beginning of the unencodable characters */
2576    /* const Py_UNICODE *badp = NULL; */
2577    /* pointer into the output */
2578    char *str;
2579    /* current output position */
2580    Py_ssize_t respos = 0;
2581    Py_ssize_t ressize;
2582    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2583    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2584    PyObject *errorHandler = NULL;
2585    PyObject *exc = NULL;
2586    /* the following variable is used for caching string comparisons
2587     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2588    int known_errorHandler = -1;
2589
2590    /* allocate enough for a simple encoding without
2591       replacements, if we need more, we'll resize */
2592    res = PyString_FromStringAndSize(NULL, size);
2593    if (res == NULL)
2594        goto onError;
2595    if (size == 0)
2596	return res;
2597    str = PyString_AS_STRING(res);
2598    ressize = size;
2599
2600    while (p<endp) {
2601	Py_UNICODE c = *p;
2602
2603	/* can we encode this? */
2604	if (c<limit) {
2605	    /* no overflow check, because we know that the space is enough */
2606	    *str++ = (char)c;
2607	    ++p;
2608	}
2609	else {
2610	    Py_ssize_t unicodepos = p-startp;
2611	    Py_ssize_t requiredsize;
2612	    PyObject *repunicode;
2613	    Py_ssize_t repsize;
2614	    Py_ssize_t newpos;
2615	    Py_ssize_t respos;
2616	    Py_UNICODE *uni2;
2617	    /* startpos for collecting unencodable chars */
2618	    const Py_UNICODE *collstart = p;
2619	    const Py_UNICODE *collend = p;
2620	    /* find all unecodable characters */
2621	    while ((collend < endp) && ((*collend)>=limit))
2622		++collend;
2623	    /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2624	    if (known_errorHandler==-1) {
2625		if ((errors==NULL) || (!strcmp(errors, "strict")))
2626		    known_errorHandler = 1;
2627		else if (!strcmp(errors, "replace"))
2628		    known_errorHandler = 2;
2629		else if (!strcmp(errors, "ignore"))
2630		    known_errorHandler = 3;
2631		else if (!strcmp(errors, "xmlcharrefreplace"))
2632		    known_errorHandler = 4;
2633		else
2634		    known_errorHandler = 0;
2635	    }
2636	    switch (known_errorHandler) {
2637		case 1: /* strict */
2638		    raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2639		    goto onError;
2640		case 2: /* replace */
2641		    while (collstart++<collend)
2642			*str++ = '?'; /* fall through */
2643		case 3: /* ignore */
2644		    p = collend;
2645		    break;
2646		case 4: /* xmlcharrefreplace */
2647		    respos = str-PyString_AS_STRING(res);
2648		    /* determine replacement size (temporarily (mis)uses p) */
2649		    for (p = collstart, repsize = 0; p < collend; ++p) {
2650			if (*p<10)
2651			    repsize += 2+1+1;
2652			else if (*p<100)
2653			    repsize += 2+2+1;
2654			else if (*p<1000)
2655			    repsize += 2+3+1;
2656			else if (*p<10000)
2657			    repsize += 2+4+1;
2658#ifndef Py_UNICODE_WIDE
2659			else
2660			    repsize += 2+5+1;
2661#else
2662			else if (*p<100000)
2663			    repsize += 2+5+1;
2664			else if (*p<1000000)
2665			    repsize += 2+6+1;
2666			else
2667			    repsize += 2+7+1;
2668#endif
2669		    }
2670		    requiredsize = respos+repsize+(endp-collend);
2671		    if (requiredsize > ressize) {
2672			if (requiredsize<2*ressize)
2673			    requiredsize = 2*ressize;
2674			if (_PyString_Resize(&res, requiredsize))
2675			    goto onError;
2676			str = PyString_AS_STRING(res) + respos;
2677			ressize = requiredsize;
2678		    }
2679		    /* generate replacement (temporarily (mis)uses p) */
2680		    for (p = collstart; p < collend; ++p) {
2681			str += sprintf(str, "&#%d;", (int)*p);
2682		    }
2683		    p = collend;
2684		    break;
2685		default:
2686		    repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2687			encoding, reason, startp, size, &exc,
2688			collstart-startp, collend-startp, &newpos);
2689		    if (repunicode == NULL)
2690			goto onError;
2691		    /* need more space? (at least enough for what we
2692		       have+the replacement+the rest of the string, so
2693		       we won't have to check space for encodable characters) */
2694		    respos = str-PyString_AS_STRING(res);
2695		    repsize = PyUnicode_GET_SIZE(repunicode);
2696		    requiredsize = respos+repsize+(endp-collend);
2697		    if (requiredsize > ressize) {
2698			if (requiredsize<2*ressize)
2699			    requiredsize = 2*ressize;
2700			if (_PyString_Resize(&res, requiredsize)) {
2701			    Py_DECREF(repunicode);
2702			    goto onError;
2703			}
2704			str = PyString_AS_STRING(res) + respos;
2705			ressize = requiredsize;
2706		    }
2707		    /* check if there is anything unencodable in the replacement
2708		       and copy it to the output */
2709		    for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2710			c = *uni2;
2711			if (c >= limit) {
2712			    raise_encode_exception(&exc, encoding, startp, size,
2713				unicodepos, unicodepos+1, reason);
2714			    Py_DECREF(repunicode);
2715			    goto onError;
2716			}
2717			*str = (char)c;
2718		    }
2719		    p = startp + newpos;
2720		    Py_DECREF(repunicode);
2721	    }
2722	}
2723    }
2724    /* Resize if we allocated to much */
2725    respos = str-PyString_AS_STRING(res);
2726    if (respos<ressize)
2727       /* If this falls res will be NULL */
2728	_PyString_Resize(&res, respos);
2729    Py_XDECREF(errorHandler);
2730    Py_XDECREF(exc);
2731    return res;
2732
2733    onError:
2734    Py_XDECREF(res);
2735    Py_XDECREF(errorHandler);
2736    Py_XDECREF(exc);
2737    return NULL;
2738}
2739
2740PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2741				 Py_ssize_t size,
2742				 const char *errors)
2743{
2744    return unicode_encode_ucs1(p, size, errors, 256);
2745}
2746
2747PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2748{
2749    if (!PyUnicode_Check(unicode)) {
2750	PyErr_BadArgument();
2751	return NULL;
2752    }
2753    return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2754				  PyUnicode_GET_SIZE(unicode),
2755				  NULL);
2756}
2757
2758/* --- 7-bit ASCII Codec -------------------------------------------------- */
2759
2760PyObject *PyUnicode_DecodeASCII(const char *s,
2761				Py_ssize_t size,
2762				const char *errors)
2763{
2764    const char *starts = s;
2765    PyUnicodeObject *v;
2766    Py_UNICODE *p;
2767    Py_ssize_t startinpos;
2768    Py_ssize_t endinpos;
2769    Py_ssize_t outpos;
2770    const char *e;
2771    PyObject *errorHandler = NULL;
2772    PyObject *exc = NULL;
2773
2774    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
2775    if (size == 1 && *(unsigned char*)s < 128) {
2776	Py_UNICODE r = *(unsigned char*)s;
2777	return PyUnicode_FromUnicode(&r, 1);
2778    }
2779
2780    v = _PyUnicode_New(size);
2781    if (v == NULL)
2782	goto onError;
2783    if (size == 0)
2784	return (PyObject *)v;
2785    p = PyUnicode_AS_UNICODE(v);
2786    e = s + size;
2787    while (s < e) {
2788	register unsigned char c = (unsigned char)*s;
2789	if (c < 128) {
2790	    *p++ = c;
2791	    ++s;
2792	}
2793	else {
2794	    startinpos = s-starts;
2795	    endinpos = startinpos + 1;
2796	    outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
2797	    if (unicode_decode_call_errorhandler(
2798		 errors, &errorHandler,
2799		 "ascii", "ordinal not in range(128)",
2800		 starts, size, &startinpos, &endinpos, &exc, &s,
2801		 (PyObject **)&v, &outpos, &p))
2802		goto onError;
2803	}
2804    }
2805    if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
2806	if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2807	    goto onError;
2808    Py_XDECREF(errorHandler);
2809    Py_XDECREF(exc);
2810    return (PyObject *)v;
2811
2812 onError:
2813    Py_XDECREF(v);
2814    Py_XDECREF(errorHandler);
2815    Py_XDECREF(exc);
2816    return NULL;
2817}
2818
2819PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2820				Py_ssize_t size,
2821				const char *errors)
2822{
2823    return unicode_encode_ucs1(p, size, errors, 128);
2824}
2825
2826PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2827{
2828    if (!PyUnicode_Check(unicode)) {
2829	PyErr_BadArgument();
2830	return NULL;
2831    }
2832    return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2833				 PyUnicode_GET_SIZE(unicode),
2834				 NULL);
2835}
2836
2837#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
2838
2839/* --- MBCS codecs for Windows -------------------------------------------- */
2840
2841#if SIZEOF_INT < SIZEOF_SSIZE_T
2842#define NEED_RETRY
2843#endif
2844
2845/* XXX This code is limited to "true" double-byte encodings, as
2846   a) it assumes an incomplete character consists of a single byte, and
2847   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
2848      encodings, see IsDBCSLeadByteEx documentation. */
2849
/* Decide whether the byte at s[offset] should be treated as a DBCS
   lead byte.

   Returns 0 when IsDBCSLeadByte() rejects that byte.  Otherwise the
   position preceding it is obtained with CharPrev(), and the result is
   1 if there is no preceding position (CharPrev() returned the same
   pointer), if the preceding byte is not itself a lead byte, or if the
   preceding position lies exactly two bytes back; 0 in all other
   cases. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
2860
2861/*
2862 * Decode MBCS string into unicode object. If 'final' is set, converts
2863 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
2864 */
2865static int decode_mbcs(PyUnicodeObject **v,
2866			const char *s, /* MBCS string */
2867			int size, /* sizeof MBCS string */
2868			int final)
2869{
2870    Py_UNICODE *p;
2871    Py_ssize_t n = 0;
2872    int usize = 0;
2873
2874    assert(size >= 0);
2875
2876    /* Skip trailing lead-byte unless 'final' is set */
2877    if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
2878	--size;
2879
2880    /* First get the size of the result */
2881    if (size > 0) {
2882	usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2883	if (usize == 0) {
2884	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
2885	    return -1;
2886	}
2887    }
2888
2889    if (*v == NULL) {
2890	/* Create unicode object */
2891	*v = _PyUnicode_New(usize);
2892	if (*v == NULL)
2893	    return -1;
2894    }
2895    else {
2896	/* Extend unicode object */
2897	n = PyUnicode_GET_SIZE(*v);
2898	if (_PyUnicode_Resize(v, n + usize) < 0)
2899	    return -1;
2900    }
2901
2902    /* Do the conversion */
2903    if (size > 0) {
2904	p = PyUnicode_AS_UNICODE(*v) + n;
2905	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2906	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
2907	    return -1;
2908	}
2909    }
2910
2911    return size;
2912}
2913
2914PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
2915					Py_ssize_t size,
2916					const char *errors,
2917					Py_ssize_t *consumed)
2918{
2919    PyUnicodeObject *v = NULL;
2920    int done;
2921
2922    if (consumed)
2923	*consumed = 0;
2924
2925#ifdef NEED_RETRY
2926  retry:
2927    if (size > INT_MAX)
2928	done = decode_mbcs(&v, s, INT_MAX, 0);
2929    else
2930#endif
2931	done = decode_mbcs(&v, s, (int)size, !consumed);
2932
2933    if (done < 0) {
2934        Py_XDECREF(v);
2935	return NULL;
2936    }
2937
2938    if (consumed)
2939	*consumed += done;
2940
2941#ifdef NEED_RETRY
2942    if (size > INT_MAX) {
2943	s += done;
2944	size -= done;
2945	goto retry;
2946    }
2947#endif
2948
2949    return (PyObject *)v;
2950}
2951
2952PyObject *PyUnicode_DecodeMBCS(const char *s,
2953				Py_ssize_t size,
2954				const char *errors)
2955{
2956    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
2957}
2958
2959/*
2960 * Convert unicode into string object (MBCS).
2961 * Returns 0 if succeed, -1 otherwise.
2962 */
2963static int encode_mbcs(PyObject **repr,
2964			const Py_UNICODE *p, /* unicode */
2965			int size) /* size of unicode */
2966{
2967    int mbcssize = 0;
2968    Py_ssize_t n = 0;
2969
2970    assert(size >= 0);
2971
2972    /* First get the size of the result */
2973    if (size > 0) {
2974	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
2975	if (mbcssize == 0) {
2976	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
2977	    return -1;
2978	}
2979    }
2980
2981    if (*repr == NULL) {
2982	/* Create string object */
2983	*repr = PyString_FromStringAndSize(NULL, mbcssize);
2984	if (*repr == NULL)
2985	    return -1;
2986    }
2987    else {
2988	/* Extend string object */
2989	n = PyString_Size(*repr);
2990	if (_PyString_Resize(repr, n + mbcssize) < 0)
2991	    return -1;
2992    }
2993
2994    /* Do the conversion */
2995    if (size > 0) {
2996	char *s = PyString_AS_STRING(*repr) + n;
2997	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2998	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
2999	    return -1;
3000	}
3001    }
3002
3003    return 0;
3004}
3005
3006PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
3007				Py_ssize_t size,
3008				const char *errors)
3009{
3010    PyObject *repr = NULL;
3011    int ret;
3012
3013#ifdef NEED_RETRY
3014 retry:
3015    if (size > INT_MAX)
3016	ret = encode_mbcs(&repr, p, INT_MAX);
3017    else
3018#endif
3019	ret = encode_mbcs(&repr, p, (int)size);
3020
3021    if (ret < 0) {
3022	Py_XDECREF(repr);
3023	return NULL;
3024    }
3025
3026#ifdef NEED_RETRY
3027    if (size > INT_MAX) {
3028	p += INT_MAX;
3029	size -= INT_MAX;
3030	goto retry;
3031    }
3032#endif
3033
3034    return repr;
3035}
3036
3037PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3038{
3039    if (!PyUnicode_Check(unicode)) {
3040        PyErr_BadArgument();
3041        return NULL;
3042    }
3043    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3044				PyUnicode_GET_SIZE(unicode),
3045				NULL);
3046}
3047
3048#undef NEED_RETRY
3049
3050#endif /* MS_WINDOWS */
3051
3052/* --- Character Mapping Codec -------------------------------------------- */
3053
3054PyObject *PyUnicode_DecodeCharmap(const char *s,
3055				  Py_ssize_t size,
3056				  PyObject *mapping,
3057				  const char *errors)
3058{
3059    const char *starts = s;
3060    Py_ssize_t startinpos;
3061    Py_ssize_t endinpos;
3062    Py_ssize_t outpos;
3063    const char *e;
3064    PyUnicodeObject *v;
3065    Py_UNICODE *p;
3066    Py_ssize_t extrachars = 0;
3067    PyObject *errorHandler = NULL;
3068    PyObject *exc = NULL;
3069    Py_UNICODE *mapstring = NULL;
3070    Py_ssize_t maplen = 0;
3071
3072    /* Default to Latin-1 */
3073    if (mapping == NULL)
3074	return PyUnicode_DecodeLatin1(s, size, errors);
3075
3076    v = _PyUnicode_New(size);
3077    if (v == NULL)
3078	goto onError;
3079    if (size == 0)
3080	return (PyObject *)v;
3081    p = PyUnicode_AS_UNICODE(v);
3082    e = s + size;
3083    if (PyUnicode_CheckExact(mapping)) {
3084	mapstring = PyUnicode_AS_UNICODE(mapping);
3085	maplen = PyUnicode_GET_SIZE(mapping);
3086	while (s < e) {
3087	    unsigned char ch = *s;
3088	    Py_UNICODE x = 0xfffe; /* illegal value */
3089
3090	    if (ch < maplen)
3091		x = mapstring[ch];
3092
3093	    if (x == 0xfffe) {
3094		/* undefined mapping */
3095		outpos = p-PyUnicode_AS_UNICODE(v);
3096		startinpos = s-starts;
3097		endinpos = startinpos+1;
3098		if (unicode_decode_call_errorhandler(
3099		     errors, &errorHandler,
3100		     "charmap", "character maps to <undefined>",
3101		     starts, size, &startinpos, &endinpos, &exc, &s,
3102		     (PyObject **)&v, &outpos, &p)) {
3103		    goto onError;
3104		}
3105		continue;
3106	    }
3107	    *p++ = x;
3108	    ++s;
3109	}
3110    }
3111    else {
3112	while (s < e) {
3113	    unsigned char ch = *s;
3114	    PyObject *w, *x;
3115
3116	    /* Get mapping (char ordinal -> integer, Unicode char or None) */
3117	    w = PyInt_FromLong((long)ch);
3118	    if (w == NULL)
3119		goto onError;
3120	    x = PyObject_GetItem(mapping, w);
3121	    Py_DECREF(w);
3122	    if (x == NULL) {
3123		if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3124		    /* No mapping found means: mapping is undefined. */
3125		    PyErr_Clear();
3126		    x = Py_None;
3127		    Py_INCREF(x);
3128		} else
3129		    goto onError;
3130	    }
3131
3132	    /* Apply mapping */
3133	    if (PyInt_Check(x)) {
3134		long value = PyInt_AS_LONG(x);
3135		if (value < 0 || value > 65535) {
3136		    PyErr_SetString(PyExc_TypeError,
3137				    "character mapping must be in range(65536)");
3138		    Py_DECREF(x);
3139		    goto onError;
3140		}
3141		*p++ = (Py_UNICODE)value;
3142	    }
3143	    else if (x == Py_None) {
3144		/* undefined mapping */
3145		outpos = p-PyUnicode_AS_UNICODE(v);
3146		startinpos = s-starts;
3147		endinpos = startinpos+1;
3148		if (unicode_decode_call_errorhandler(
3149		     errors, &errorHandler,
3150		     "charmap", "character maps to <undefined>",
3151		     starts, size, &startinpos, &endinpos, &exc, &s,
3152		     (PyObject **)&v, &outpos, &p)) {
3153		    Py_DECREF(x);
3154		    goto onError;
3155		}
3156		Py_DECREF(x);
3157		continue;
3158	    }
3159	    else if (PyUnicode_Check(x)) {
3160		Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
3161
3162		if (targetsize == 1)
3163		    /* 1-1 mapping */
3164		    *p++ = *PyUnicode_AS_UNICODE(x);
3165
3166		else if (targetsize > 1) {
3167		    /* 1-n mapping */
3168		    if (targetsize > extrachars) {
3169			/* resize first */
3170			Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3171			Py_ssize_t needed = (targetsize - extrachars) + \
3172				     (targetsize << 2);
3173			extrachars += needed;
3174			/* XXX overflow detection missing */
3175			if (_PyUnicode_Resize(&v,
3176					     PyUnicode_GET_SIZE(v) + needed) < 0) {
3177			    Py_DECREF(x);
3178			    goto onError;
3179			}
3180			p = PyUnicode_AS_UNICODE(v) + oldpos;
3181		    }
3182		    Py_UNICODE_COPY(p,
3183				    PyUnicode_AS_UNICODE(x),
3184				    targetsize);
3185		    p += targetsize;
3186		    extrachars -= targetsize;
3187		}
3188		/* 1-0 mapping: skip the character */
3189	    }
3190	    else {
3191		/* wrong return value */
3192		PyErr_SetString(PyExc_TypeError,
3193		      "character mapping must return integer, None or unicode");
3194		Py_DECREF(x);
3195		goto onError;
3196	    }
3197	    Py_DECREF(x);
3198	    ++s;
3199	}
3200    }
3201    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
3202	if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3203	    goto onError;
3204    Py_XDECREF(errorHandler);
3205    Py_XDECREF(exc);
3206    return (PyObject *)v;
3207
3208 onError:
3209    Py_XDECREF(errorHandler);
3210    Py_XDECREF(exc);
3211    Py_XDECREF(v);
3212    return NULL;
3213}
3214
3215/* Charmap encoding: the lookup table */
3216
3217struct encoding_map{
3218  PyObject_HEAD
3219  unsigned char level1[32];
3220  int count2, count3;
3221  unsigned char level23[1];
3222};
3223
3224static PyObject*
3225encoding_map_size(PyObject *obj, PyObject* args)
3226{
3227    struct encoding_map *map = (struct encoding_map*)obj;
3228    return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3229                          128*map->count3);
3230}
3231
3232static PyMethodDef encoding_map_methods[] = {
3233	{"size", encoding_map_size, METH_NOARGS,
3234         PyDoc_STR("Return the size (in bytes) of this object") },
3235        { 0 }
3236};
3237
3238static void
3239encoding_map_dealloc(PyObject* o)
3240{
3241	PyObject_FREE(o);
3242}
3243
/* Type object for the trie built by PyUnicode_BuildEncodingMap().
   Only tp_dealloc and tp_methods are filled in; tp_new is left empty. */
static PyTypeObject EncodingMapType = {
	PyObject_HEAD_INIT(NULL)
        0,                      /*ob_size*/
        "EncodingMap",          /*tp_name*/
        sizeof(struct encoding_map),   /*tp_basicsize*/
        0,                      /*tp_itemsize*/
        /* methods */
        encoding_map_dealloc,   /*tp_dealloc*/
        0,                      /*tp_print*/
        0,                      /*tp_getattr*/
        0,                      /*tp_setattr*/
        0,                      /*tp_compare*/
        0,                      /*tp_repr*/
        0,                      /*tp_as_number*/
        0,                      /*tp_as_sequence*/
        0,                      /*tp_as_mapping*/
        0,                      /*tp_hash*/
        0,                      /*tp_call*/
        0,                      /*tp_str*/
        0,                      /*tp_getattro*/
        0,                      /*tp_setattro*/
        0,                      /*tp_as_buffer*/
        Py_TPFLAGS_DEFAULT,     /*tp_flags*/
        0,                      /*tp_doc*/
        0,                      /*tp_traverse*/
        0,                      /*tp_clear*/
        0,                      /*tp_richcompare*/
        0,                      /*tp_weaklistoffset*/
        0,                      /*tp_iter*/
        0,                      /*tp_iternext*/
        encoding_map_methods,   /*tp_methods*/
        0,                      /*tp_members*/
        0,                      /*tp_getset*/
        0,                      /*tp_base*/
        0,                      /*tp_dict*/
        0,                      /*tp_descr_get*/
        0,                      /*tp_descr_set*/
        0,                      /*tp_dictoffset*/
        0,                      /*tp_init*/
        0,                      /*tp_alloc*/
        0,                      /*tp_new*/
        0,                      /*tp_free*/
        0,                      /*tp_is_gc*/
};
3288
3289PyObject*
3290PyUnicode_BuildEncodingMap(PyObject* string)
3291{
3292    Py_UNICODE *decode;
3293    PyObject *result;
3294    struct encoding_map *mresult;
3295    int i;
3296    int need_dict = 0;
3297    unsigned char level1[32];
3298    unsigned char level2[512];
3299    unsigned char *mlevel1, *mlevel2, *mlevel3;
3300    int count2 = 0, count3 = 0;
3301
3302    if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3303        PyErr_BadArgument();
3304        return NULL;
3305    }
3306    decode = PyUnicode_AS_UNICODE(string);
3307    memset(level1, 0xFF, sizeof level1);
3308    memset(level2, 0xFF, sizeof level2);
3309
3310    /* If there isn't a one-to-one mapping of NULL to \0,
3311       or if there are non-BMP characters, we need to use
3312       a mapping dictionary. */
3313    if (decode[0] != 0)
3314        need_dict = 1;
3315    for (i = 1; i < 256; i++) {
3316        int l1, l2;
3317        if (decode[i] == 0
3318            #ifdef Py_UNICODE_WIDE
3319            || decode[i] > 0xFFFF
3320            #endif
3321        ) {
3322            need_dict = 1;
3323            break;
3324        }
3325        if (decode[i] == 0xFFFE)
3326            /* unmapped character */
3327            continue;
3328        l1 = decode[i] >> 11;
3329        l2 = decode[i] >> 7;
3330        if (level1[l1] == 0xFF)
3331            level1[l1] = count2++;
3332        if (level2[l2] == 0xFF)
3333            level2[l2] = count3++;
3334    }
3335
3336    if (count2 >= 0xFF || count3 >= 0xFF)
3337        need_dict = 1;
3338
3339    if (need_dict) {
3340        PyObject *result = PyDict_New();
3341        PyObject *key, *value;
3342        if (!result)
3343            return NULL;
3344        for (i = 0; i < 256; i++) {
3345            key = value = NULL;
3346            key = PyInt_FromLong(decode[i]);
3347            value = PyInt_FromLong(i);
3348            if (!key || !value)
3349                goto failed1;
3350            if (PyDict_SetItem(result, key, value) == -1)
3351                goto failed1;
3352            Py_DECREF(key);
3353            Py_DECREF(value);
3354        }
3355        return result;
3356      failed1:
3357        Py_XDECREF(key);
3358        Py_XDECREF(value);
3359        Py_DECREF(result);
3360        return NULL;
3361    }
3362
3363    /* Create a three-level trie */
3364    result = PyObject_MALLOC(sizeof(struct encoding_map) +
3365                             16*count2 + 128*count3 - 1);
3366    if (!result)
3367        return PyErr_NoMemory();
3368    PyObject_Init(result, &EncodingMapType);
3369    mresult = (struct encoding_map*)result;
3370    mresult->count2 = count2;
3371    mresult->count3 = count3;
3372    mlevel1 = mresult->level1;
3373    mlevel2 = mresult->level23;
3374    mlevel3 = mresult->level23 + 16*count2;
3375    memcpy(mlevel1, level1, 32);
3376    memset(mlevel2, 0xFF, 16*count2);
3377    memset(mlevel3, 0, 128*count3);
3378    count3 = 0;
3379    for (i = 1; i < 256; i++) {
3380        int o1, o2, o3, i2, i3;
3381        if (decode[i] == 0xFFFE)
3382            /* unmapped character */
3383            continue;
3384        o1 = decode[i]>>11;
3385        o2 = (decode[i]>>7) & 0xF;
3386        i2 = 16*mlevel1[o1] + o2;
3387        if (mlevel2[i2] == 0xFF)
3388            mlevel2[i2] = count3++;
3389        o3 = decode[i] & 0x7F;
3390        i3 = 128*mlevel2[i2] + o3;
3391        mlevel3[i3] = i;
3392    }
3393    return result;
3394}
3395
3396static int
3397encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3398{
3399    struct encoding_map *map = (struct encoding_map*)mapping;
3400    int l1 = c>>11;
3401    int l2 = (c>>7) & 0xF;
3402    int l3 = c & 0x7F;
3403    int i;
3404
3405#ifdef Py_UNICODE_WIDE
3406    if (c > 0xFFFF) {
3407	return -1;
3408    }
3409#endif
3410    if (c == 0)
3411        return 0;
3412    /* level 1*/
3413    i = map->level1[l1];
3414    if (i == 0xFF) {
3415        return -1;
3416    }
3417    /* level 2*/
3418    i = map->level23[16*i+l2];
3419    if (i == 0xFF) {
3420        return -1;
3421    }
3422    /* level 3 */
3423    i = map->level23[16*map->count2 + 128*i + l3];
3424    if (i == 0) {
3425        return -1;
3426    }
3427    return i;
3428}
3429
3430/* Lookup the character ch in the mapping. If the character
3431   can't be found, Py_None is returned (or NULL, if another
3432   error occurred). */
3433static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
3434{
3435    PyObject *w = PyInt_FromLong((long)c);
3436    PyObject *x;
3437
3438    if (w == NULL)
3439	 return NULL;
3440    x = PyObject_GetItem(mapping, w);
3441    Py_DECREF(w);
3442    if (x == NULL) {
3443	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3444	    /* No mapping found means: mapping is undefined. */
3445	    PyErr_Clear();
3446	    x = Py_None;
3447	    Py_INCREF(x);
3448	    return x;
3449	} else
3450	    return NULL;
3451    }
3452    else if (x == Py_None)
3453	return x;
3454    else if (PyInt_Check(x)) {
3455	long value = PyInt_AS_LONG(x);
3456	if (value < 0 || value > 255) {
3457	    PyErr_SetString(PyExc_TypeError,
3458			     "character mapping must be in range(256)");
3459	    Py_DECREF(x);
3460	    return NULL;
3461	}
3462	return x;
3463    }
3464    else if (PyString_Check(x))
3465	return x;
3466    else {
3467	/* wrong return value */
3468	PyErr_SetString(PyExc_TypeError,
3469	      "character mapping must return integer, None or str");
3470	Py_DECREF(x);
3471	return NULL;
3472    }
3473}
3474
3475static int
3476charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
3477{
3478	Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3479	/* exponentially overallocate to minimize reallocations */
3480	if (requiredsize < 2*outsize)
3481	    requiredsize = 2*outsize;
3482	if (_PyString_Resize(outobj, requiredsize)) {
3483	    return 0;
3484	}
3485	return 1;
3486}
3487
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was written to the output */
    enc_FAILED,         /* the mapping has no entry for the character */
    enc_EXCEPTION       /* an error occurred */
} charmapencode_result;
3491/* lookup the character, put the result in the output string and adjust
3492   various state variables. Reallocate the output string if not enough
3493   space is available. Return a new reference to the object that
3494   was put in the output buffer, or Py_None, if the mapping was undefined
3495   (in which case no character was written) or NULL, if a
3496   reallocation error occurred. The caller must decref the result */
3497static
3498charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
3499    PyObject **outobj, Py_ssize_t *outpos)
3500{
3501    PyObject *rep;
3502    char *outstart;
3503    Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3504
3505    if (mapping->ob_type == &EncodingMapType) {
3506        int res = encoding_map_lookup(c, mapping);
3507	Py_ssize_t requiredsize = *outpos+1;
3508        if (res == -1)
3509            return enc_FAILED;
3510	if (outsize<requiredsize)
3511	    if (!charmapencode_resize(outobj, outpos, requiredsize))
3512		return enc_EXCEPTION;
3513        outstart = PyString_AS_STRING(*outobj);
3514	outstart[(*outpos)++] = (char)res;
3515	return enc_SUCCESS;
3516    }
3517
3518    rep = charmapencode_lookup(c, mapping);
3519    if (rep==NULL)
3520	return enc_EXCEPTION;
3521    else if (rep==Py_None) {
3522	Py_DECREF(rep);
3523	return enc_FAILED;
3524    } else {
3525	if (PyInt_Check(rep)) {
3526	    Py_ssize_t requiredsize = *outpos+1;
3527	    if (outsize<requiredsize)
3528		if (!charmapencode_resize(outobj, outpos, requiredsize)) {
3529		    Py_DECREF(rep);
3530		    return enc_EXCEPTION;
3531		}
3532            outstart = PyString_AS_STRING(*outobj);
3533	    outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3534	}
3535	else {
3536	    const char *repchars = PyString_AS_STRING(rep);
3537	    Py_ssize_t repsize = PyString_GET_SIZE(rep);
3538	    Py_ssize_t requiredsize = *outpos+repsize;
3539	    if (outsize<requiredsize)
3540		if (!charmapencode_resize(outobj, outpos, requiredsize)) {
3541		    Py_DECREF(rep);
3542		    return enc_EXCEPTION;
3543		}
3544            outstart = PyString_AS_STRING(*outobj);
3545	    memcpy(outstart + *outpos, repchars, repsize);
3546	    *outpos += repsize;
3547	}
3548    }
3549    Py_DECREF(rep);
3550    return enc_SUCCESS;
3551}
3552
3553/* handle an error in PyUnicode_EncodeCharmap
3554   Return 0 on success, -1 on error */
3555static
3556int charmap_encoding_error(
3557    const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
3558    PyObject **exceptionObject,
3559    int *known_errorHandler, PyObject **errorHandler, const char *errors,
3560    PyObject **res, Py_ssize_t *respos)
3561{
3562    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3563    Py_ssize_t repsize;
3564    Py_ssize_t newpos;
3565    Py_UNICODE *uni2;
3566    /* startpos for collecting unencodable chars */
3567    Py_ssize_t collstartpos = *inpos;
3568    Py_ssize_t collendpos = *inpos+1;
3569    Py_ssize_t collpos;
3570    char *encoding = "charmap";
3571    char *reason = "character maps to <undefined>";
3572    charmapencode_result x;
3573
3574    /* find all unencodable characters */
3575    while (collendpos < size) {
3576        PyObject *rep;
3577        if (mapping->ob_type == &EncodingMapType) {
3578	    int res = encoding_map_lookup(p[collendpos], mapping);
3579	    if (res != -1)
3580		break;
3581	    ++collendpos;
3582	    continue;
3583	}
3584
3585	rep = charmapencode_lookup(p[collendpos], mapping);
3586	if (rep==NULL)
3587	    return -1;
3588	else if (rep!=Py_None) {
3589	    Py_DECREF(rep);
3590	    break;
3591	}
3592	Py_DECREF(rep);
3593	++collendpos;
3594    }
3595    /* cache callback name lookup
3596     * (if not done yet, i.e. it's the first error) */
3597    if (*known_errorHandler==-1) {
3598	if ((errors==NULL) || (!strcmp(errors, "strict")))
3599	    *known_errorHandler = 1;
3600	else if (!strcmp(errors, "replace"))
3601	    *known_errorHandler = 2;
3602	else if (!strcmp(errors, "ignore"))
3603	    *known_errorHandler = 3;
3604	else if (!strcmp(errors, "xmlcharrefreplace"))
3605	    *known_errorHandler = 4;
3606	else
3607	    *known_errorHandler = 0;
3608    }
3609    switch (*known_errorHandler) {
3610	case 1: /* strict */
3611	    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3612	    return -1;
3613	case 2: /* replace */
3614	    for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3615		x = charmapencode_output('?', mapping, res, respos);
3616		if (x==enc_EXCEPTION) {
3617		    return -1;
3618		}
3619		else if (x==enc_FAILED) {
3620		    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3621		    return -1;
3622		}
3623	    }
3624	    /* fall through */
3625	case 3: /* ignore */
3626	    *inpos = collendpos;
3627	    break;
3628	case 4: /* xmlcharrefreplace */
3629	    /* generate replacement (temporarily (mis)uses p) */
3630	    for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3631		char buffer[2+29+1+1];
3632		char *cp;
3633		sprintf(buffer, "&#%d;", (int)p[collpos]);
3634		for (cp = buffer; *cp; ++cp) {
3635		    x = charmapencode_output(*cp, mapping, res, respos);
3636		    if (x==enc_EXCEPTION)
3637			return -1;
3638		    else if (x==enc_FAILED) {
3639			raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3640			return -1;
3641		    }
3642		}
3643	    }
3644	    *inpos = collendpos;
3645	    break;
3646	default:
3647	    repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
3648		encoding, reason, p, size, exceptionObject,
3649		collstartpos, collendpos, &newpos);
3650	    if (repunicode == NULL)
3651		return -1;
3652	    /* generate replacement  */
3653	    repsize = PyUnicode_GET_SIZE(repunicode);
3654	    for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3655		x = charmapencode_output(*uni2, mapping, res, respos);
3656		if (x==enc_EXCEPTION) {
3657		    return -1;
3658		}
3659		else if (x==enc_FAILED) {
3660		    Py_DECREF(repunicode);
3661		    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3662		    return -1;
3663		}
3664	    }
3665	    *inpos = newpos;
3666	    Py_DECREF(repunicode);
3667    }
3668    return 0;
3669}
3670
3671PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
3672				  Py_ssize_t size,
3673				  PyObject *mapping,
3674				  const char *errors)
3675{
3676    /* output object */
3677    PyObject *res = NULL;
3678    /* current input position */
3679    Py_ssize_t inpos = 0;
3680    /* current output position */
3681    Py_ssize_t respos = 0;
3682    PyObject *errorHandler = NULL;
3683    PyObject *exc = NULL;
3684    /* the following variable is used for caching string comparisons
3685     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3686     * 3=ignore, 4=xmlcharrefreplace */
3687    int known_errorHandler = -1;
3688
3689    /* Default to Latin-1 */
3690    if (mapping == NULL)
3691	return PyUnicode_EncodeLatin1(p, size, errors);
3692
3693    /* allocate enough for a simple encoding without
3694       replacements, if we need more, we'll resize */
3695    res = PyString_FromStringAndSize(NULL, size);
3696    if (res == NULL)
3697        goto onError;
3698    if (size == 0)
3699	return res;
3700
3701    while (inpos<size) {
3702	/* try to encode it */
3703	charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
3704	if (x==enc_EXCEPTION) /* error */
3705	    goto onError;
3706	if (x==enc_FAILED) { /* unencodable character */
3707	    if (charmap_encoding_error(p, size, &inpos, mapping,
3708		&exc,
3709		&known_errorHandler, &errorHandler, errors,
3710		&res, &respos)) {
3711		goto onError;
3712	    }
3713	}
3714	else
3715	    /* done with this character => adjust input position */
3716	    ++inpos;
3717    }
3718
3719    /* Resize if we allocated to much */
3720    if (respos<PyString_GET_SIZE(res)) {
3721	if (_PyString_Resize(&res, respos))
3722	    goto onError;
3723    }
3724    Py_XDECREF(exc);
3725    Py_XDECREF(errorHandler);
3726    return res;
3727
3728    onError:
3729    Py_XDECREF(res);
3730    Py_XDECREF(exc);
3731    Py_XDECREF(errorHandler);
3732    return NULL;
3733}
3734
3735PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3736				    PyObject *mapping)
3737{
3738    if (!PyUnicode_Check(unicode) || mapping == NULL) {
3739	PyErr_BadArgument();
3740	return NULL;
3741    }
3742    return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3743				   PyUnicode_GET_SIZE(unicode),
3744				   mapping,
3745				   NULL);
3746}
3747
3748/* create or adjust a UnicodeTranslateError */
3749static void make_translate_exception(PyObject **exceptionObject,
3750    const Py_UNICODE *unicode, Py_ssize_t size,
3751    Py_ssize_t startpos, Py_ssize_t endpos,
3752    const char *reason)
3753{
3754    if (*exceptionObject == NULL) {
3755    	*exceptionObject = PyUnicodeTranslateError_Create(
3756	    unicode, size, startpos, endpos, reason);
3757    }
3758    else {
3759	if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3760	    goto onError;
3761	if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3762	    goto onError;
3763	if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3764	    goto onError;
3765	return;
3766	onError:
3767	Py_DECREF(*exceptionObject);
3768	*exceptionObject = NULL;
3769    }
3770}
3771
3772/* raises a UnicodeTranslateError */
3773static void raise_translate_exception(PyObject **exceptionObject,
3774    const Py_UNICODE *unicode, Py_ssize_t size,
3775    Py_ssize_t startpos, Py_ssize_t endpos,
3776    const char *reason)
3777{
3778    make_translate_exception(exceptionObject,
3779	unicode, size, startpos, endpos, reason);
3780    if (*exceptionObject != NULL)
3781	PyCodec_StrictErrors(*exceptionObject);
3782}
3783
3784/* error handling callback helper:
3785   build arguments, call the callback and check the arguments,
3786   put the result into newpos and return the replacement string, which
3787   has to be freed by the caller */
3788static PyObject *unicode_translate_call_errorhandler(const char *errors,
3789    PyObject **errorHandler,
3790    const char *reason,
3791    const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3792    Py_ssize_t startpos, Py_ssize_t endpos,
3793    Py_ssize_t *newpos)
3794{
3795    static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
3796
3797    Py_ssize_t i_newpos;
3798    PyObject *restuple;
3799    PyObject *resunicode;
3800
3801    if (*errorHandler == NULL) {
3802	*errorHandler = PyCodec_LookupError(errors);
3803        if (*errorHandler == NULL)
3804	    return NULL;
3805    }
3806
3807    make_translate_exception(exceptionObject,
3808	unicode, size, startpos, endpos, reason);
3809    if (*exceptionObject == NULL)
3810	return NULL;
3811
3812    restuple = PyObject_CallFunctionObjArgs(
3813	*errorHandler, *exceptionObject, NULL);
3814    if (restuple == NULL)
3815	return NULL;
3816    if (!PyTuple_Check(restuple)) {
3817	PyErr_Format(PyExc_TypeError, &argparse[4]);
3818	Py_DECREF(restuple);
3819	return NULL;
3820    }
3821    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3822	&resunicode, &i_newpos)) {
3823	Py_DECREF(restuple);
3824	return NULL;
3825    }
3826    if (i_newpos<0)
3827	*newpos = size+i_newpos;
3828    else
3829        *newpos = i_newpos;
3830    if (*newpos<0 || *newpos>size) {
3831	PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3832	Py_DECREF(restuple);
3833	return NULL;
3834    }
3835    Py_INCREF(resunicode);
3836    Py_DECREF(restuple);
3837    return resunicode;
3838}
3839
3840/* Lookup the character ch in the mapping and put the result in result,
3841   which must be decrefed by the caller.
3842   Return 0 on success, -1 on error */
3843static
3844int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3845{
3846    PyObject *w = PyInt_FromLong((long)c);
3847    PyObject *x;
3848
3849    if (w == NULL)
3850	 return -1;
3851    x = PyObject_GetItem(mapping, w);
3852    Py_DECREF(w);
3853    if (x == NULL) {
3854	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3855	    /* No mapping found means: use 1:1 mapping. */
3856	    PyErr_Clear();
3857	    *result = NULL;
3858	    return 0;
3859	} else
3860	    return -1;
3861    }
3862    else if (x == Py_None) {
3863	*result = x;
3864	return 0;
3865    }
3866    else if (PyInt_Check(x)) {
3867	long value = PyInt_AS_LONG(x);
3868	long max = PyUnicode_GetMax();
3869	if (value < 0 || value > max) {
3870	    PyErr_Format(PyExc_TypeError,
3871			     "character mapping must be in range(0x%lx)", max+1);
3872	    Py_DECREF(x);
3873	    return -1;
3874	}
3875	*result = x;
3876	return 0;
3877    }
3878    else if (PyUnicode_Check(x)) {
3879	*result = x;
3880	return 0;
3881    }
3882    else {
3883	/* wrong return value */
3884	PyErr_SetString(PyExc_TypeError,
3885	      "character mapping must return integer, None or unicode");
3886	Py_DECREF(x);
3887	return -1;
3888    }
3889}
3890/* ensure that *outobj is at least requiredsize characters long,
3891if not reallocate and adjust various state variables.
3892Return 0 on success, -1 on error */
3893static
3894int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
3895    Py_ssize_t requiredsize)
3896{
3897    Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
3898    if (requiredsize > oldsize) {
3899	/* remember old output position */
3900	Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
3901	/* exponentially overallocate to minimize reallocations */
3902	if (requiredsize < 2 * oldsize)
3903	    requiredsize = 2 * oldsize;
3904	if (_PyUnicode_Resize(outobj, requiredsize) < 0)
3905	    return -1;
3906	*outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
3907    }
3908    return 0;
3909}
3910/* lookup the character, put the result in the output string and adjust
3911   various state variables. Return a new reference to the object that
3912   was put in the output buffer in *result, or Py_None, if the mapping was
3913   undefined (in which case no character was written).
3914   The called must decref result.
3915   Return 0 on success, -1 on error. */
3916static
3917int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
3918    Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
3919    PyObject **res)
3920{
3921    if (charmaptranslate_lookup(*curinp, mapping, res))
3922	return -1;
3923    if (*res==NULL) {
3924	/* not found => default to 1:1 mapping */
3925	*(*outp)++ = *curinp;
3926    }
3927    else if (*res==Py_None)
3928	;
3929    else if (PyInt_Check(*res)) {
3930	/* no overflow check, because we know that the space is enough */
3931	*(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3932    }
3933    else if (PyUnicode_Check(*res)) {
3934	Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
3935	if (repsize==1) {
3936	    /* no overflow check, because we know that the space is enough */
3937	    *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3938	}
3939	else if (repsize!=0) {
3940	    /* more than one character */
3941	    Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
3942		(insize - (curinp-startinp)) +
3943		repsize - 1;
3944	    if (charmaptranslate_makespace(outobj, outp, requiredsize))
3945		return -1;
3946	    memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3947	    *outp += repsize;
3948	}
3949    }
3950    else
3951	return -1;
3952    return 0;
3953}
3954
3955PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
3956				     Py_ssize_t size,
3957				     PyObject *mapping,
3958				     const char *errors)
3959{
3960    /* output object */
3961    PyObject *res = NULL;
3962    /* pointers to the beginning and end+1 of input */
3963    const Py_UNICODE *startp = p;
3964    const Py_UNICODE *endp = p + size;
3965    /* pointer into the output */
3966    Py_UNICODE *str;
3967    /* current output position */
3968    Py_ssize_t respos = 0;
3969    char *reason = "character maps to <undefined>";
3970    PyObject *errorHandler = NULL;
3971    PyObject *exc = NULL;
3972    /* the following variable is used for caching string comparisons
3973     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3974     * 3=ignore, 4=xmlcharrefreplace */
3975    int known_errorHandler = -1;
3976
3977    if (mapping == NULL) {
3978	PyErr_BadArgument();
3979	return NULL;
3980    }
3981
3982    /* allocate enough for a simple 1:1 translation without
3983       replacements, if we need more, we'll resize */
3984    res = PyUnicode_FromUnicode(NULL, size);
3985    if (res == NULL)
3986	goto onError;
3987    if (size == 0)
3988	return res;
3989    str = PyUnicode_AS_UNICODE(res);
3990
3991    while (p<endp) {
3992	/* try to encode it */
3993	PyObject *x = NULL;
3994	if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
3995	    Py_XDECREF(x);
3996	    goto onError;
3997	}
3998	Py_XDECREF(x);
3999	if (x!=Py_None) /* it worked => adjust input pointer */
4000	    ++p;
4001	else { /* untranslatable character */
4002	    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4003	    Py_ssize_t repsize;
4004	    Py_ssize_t newpos;
4005	    Py_UNICODE *uni2;
4006	    /* startpos for collecting untranslatable chars */
4007	    const Py_UNICODE *collstart = p;
4008	    const Py_UNICODE *collend = p+1;
4009	    const Py_UNICODE *coll;
4010
4011	    /* find all untranslatable characters */
4012	    while (collend < endp) {
4013		if (charmaptranslate_lookup(*collend, mapping, &x))
4014		    goto onError;
4015		Py_XDECREF(x);
4016		if (x!=Py_None)
4017		    break;
4018		++collend;
4019	    }
4020	    /* cache callback name lookup
4021	     * (if not done yet, i.e. it's the first error) */
4022	    if (known_errorHandler==-1) {
4023		if ((errors==NULL) || (!strcmp(errors, "strict")))
4024		    known_errorHandler = 1;
4025		else if (!strcmp(errors, "replace"))
4026		    known_errorHandler = 2;
4027		else if (!strcmp(errors, "ignore"))
4028		    known_errorHandler = 3;
4029		else if (!strcmp(errors, "xmlcharrefreplace"))
4030		    known_errorHandler = 4;
4031		else
4032		    known_errorHandler = 0;
4033	    }
4034	    switch (known_errorHandler) {
4035		case 1: /* strict */
4036		    raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4037		    goto onError;
4038		case 2: /* replace */
4039		    /* No need to check for space, this is a 1:1 replacement */
4040		    for (coll = collstart; coll<collend; ++coll)
4041			*str++ = '?';
4042		    /* fall through */
4043		case 3: /* ignore */
4044		    p = collend;
4045		    break;
4046		case 4: /* xmlcharrefreplace */
4047		    /* generate replacement (temporarily (mis)uses p) */
4048		    for (p = collstart; p < collend; ++p) {
4049			char buffer[2+29+1+1];
4050			char *cp;
4051			sprintf(buffer, "&#%d;", (int)*p);
4052			if (charmaptranslate_makespace(&res, &str,
4053			    (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4054			    goto onError;
4055			for (cp = buffer; *cp; ++cp)
4056			    *str++ = *cp;
4057		    }
4058		    p = collend;
4059		    break;
4060		default:
4061		    repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4062			reason, startp, size, &exc,
4063			collstart-startp, collend-startp, &newpos);
4064		    if (repunicode == NULL)
4065			goto onError;
4066		    /* generate replacement  */
4067		    repsize = PyUnicode_GET_SIZE(repunicode);
4068		    if (charmaptranslate_makespace(&res, &str,
4069			(str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4070			Py_DECREF(repunicode);
4071			goto onError;
4072		    }
4073		    for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4074			*str++ = *uni2;
4075		    p = startp + newpos;
4076		    Py_DECREF(repunicode);
4077	    }
4078	}
4079    }
4080    /* Resize if we allocated too much */
4081    respos = str-PyUnicode_AS_UNICODE(res);
4082    if (respos<PyUnicode_GET_SIZE(res)) {
4083	if (_PyUnicode_Resize(&res, respos) < 0)
4084	    goto onError;
4085    }
4086    Py_XDECREF(exc);
4087    Py_XDECREF(errorHandler);
4088    return res;
4089
4090    onError:
4091    Py_XDECREF(res);
4092    Py_XDECREF(exc);
4093    Py_XDECREF(errorHandler);
4094    return NULL;
4095}
4096
4097PyObject *PyUnicode_Translate(PyObject *str,
4098			      PyObject *mapping,
4099			      const char *errors)
4100{
4101    PyObject *result;
4102
4103    str = PyUnicode_FromObject(str);
4104    if (str == NULL)
4105	goto onError;
4106    result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4107					PyUnicode_GET_SIZE(str),
4108					mapping,
4109					errors);
4110    Py_DECREF(str);
4111    return result;
4112
4113 onError:
4114    Py_XDECREF(str);
4115    return NULL;
4116}
4117
4118/* --- Decimal Encoder ---------------------------------------------------- */
4119
4120int PyUnicode_EncodeDecimal(Py_UNICODE *s,
4121			    Py_ssize_t length,
4122			    char *output,
4123			    const char *errors)
4124{
4125    Py_UNICODE *p, *end;
4126    PyObject *errorHandler = NULL;
4127    PyObject *exc = NULL;
4128    const char *encoding = "decimal";
4129    const char *reason = "invalid decimal Unicode string";
4130    /* the following variable is used for caching string comparisons
4131     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4132    int known_errorHandler = -1;
4133
4134    if (output == NULL) {
4135	PyErr_BadArgument();
4136	return -1;
4137    }
4138
4139    p = s;
4140    end = s + length;
4141    while (p < end) {
4142	register Py_UNICODE ch = *p;
4143	int decimal;
4144	PyObject *repunicode;
4145	Py_ssize_t repsize;
4146	Py_ssize_t newpos;
4147	Py_UNICODE *uni2;
4148	Py_UNICODE *collstart;
4149	Py_UNICODE *collend;
4150
4151	if (Py_UNICODE_ISSPACE(ch)) {
4152	    *output++ = ' ';
4153	    ++p;
4154	    continue;
4155	}
4156	decimal = Py_UNICODE_TODECIMAL(ch);
4157	if (decimal >= 0) {
4158	    *output++ = '0' + decimal;
4159	    ++p;
4160	    continue;
4161	}
4162	if (0 < ch && ch < 256) {
4163	    *output++ = (char)ch;
4164	    ++p;
4165	    continue;
4166	}
4167	/* All other characters are considered unencodable */
4168	collstart = p;
4169	collend = p+1;
4170	while (collend < end) {
4171	    if ((0 < *collend && *collend < 256) ||
4172	        !Py_UNICODE_ISSPACE(*collend) ||
4173	        Py_UNICODE_TODECIMAL(*collend))
4174		break;
4175	}
4176	/* cache callback name lookup
4177	 * (if not done yet, i.e. it's the first error) */
4178	if (known_errorHandler==-1) {
4179	    if ((errors==NULL) || (!strcmp(errors, "strict")))
4180		known_errorHandler = 1;
4181	    else if (!strcmp(errors, "replace"))
4182		known_errorHandler = 2;
4183	    else if (!strcmp(errors, "ignore"))
4184		known_errorHandler = 3;
4185	    else if (!strcmp(errors, "xmlcharrefreplace"))
4186		known_errorHandler = 4;
4187	    else
4188		known_errorHandler = 0;
4189	}
4190	switch (known_errorHandler) {
4191	    case 1: /* strict */
4192		raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4193		goto onError;
4194	    case 2: /* replace */
4195		for (p = collstart; p < collend; ++p)
4196		    *output++ = '?';
4197		/* fall through */
4198	    case 3: /* ignore */
4199		p = collend;
4200		break;
4201	    case 4: /* xmlcharrefreplace */
4202		/* generate replacement (temporarily (mis)uses p) */
4203		for (p = collstart; p < collend; ++p)
4204		    output += sprintf(output, "&#%d;", (int)*p);
4205		p = collend;
4206		break;
4207	    default:
4208		repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4209		    encoding, reason, s, length, &exc,
4210		    collstart-s, collend-s, &newpos);
4211		if (repunicode == NULL)
4212		    goto onError;
4213		/* generate replacement  */
4214		repsize = PyUnicode_GET_SIZE(repunicode);
4215		for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4216		    Py_UNICODE ch = *uni2;
4217		    if (Py_UNICODE_ISSPACE(ch))
4218			*output++ = ' ';
4219		    else {
4220			decimal = Py_UNICODE_TODECIMAL(ch);
4221			if (decimal >= 0)
4222			    *output++ = '0' + decimal;
4223			else if (0 < ch && ch < 256)
4224			    *output++ = (char)ch;
4225			else {
4226			    Py_DECREF(repunicode);
4227			    raise_encode_exception(&exc, encoding,
4228				s, length, collstart-s, collend-s, reason);
4229			    goto onError;
4230			}
4231		    }
4232		}
4233		p = s + newpos;
4234		Py_DECREF(repunicode);
4235	}
4236    }
4237    /* 0-terminate the output string */
4238    *output++ = '\0';
4239    Py_XDECREF(exc);
4240    Py_XDECREF(errorHandler);
4241    return 0;
4242
4243 onError:
4244    Py_XDECREF(exc);
4245    Py_XDECREF(errorHandler);
4246    return -1;
4247}
4248
4249/* --- Helpers ------------------------------------------------------------ */
4250
4251#define STRINGLIB_CHAR Py_UNICODE
4252
4253#define STRINGLIB_LEN PyUnicode_GET_SIZE
4254#define STRINGLIB_NEW PyUnicode_FromUnicode
4255#define STRINGLIB_STR PyUnicode_AS_UNICODE
4256
4257Py_LOCAL_INLINE(int)
4258STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
4259{
4260    if (str[0] != other[0])
4261        return 1;
4262    return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
4263}
4264
4265#define STRINGLIB_EMPTY unicode_empty
4266
4267#include "stringlib/fastsearch.h"
4268
4269#include "stringlib/count.h"
4270#include "stringlib/find.h"
4271#include "stringlib/partition.h"
4272
/* Helper macro to fix up start/end slice values in place.

   Operates on the variables `start` and `end` of the calling scope,
   using (obj)->length as the sequence length:
     - a negative start is offset by the length, then clamped to 0;
     - an end beyond the length is clamped to the length;
     - a negative end is offset by the length, then clamped to 0.

   The body is wrapped in do { ... } while (0) so that the expansion is
   a single statement (safe inside an unbraced if/else); invoke it as
   `FIX_START_END(obj);` with a trailing semicolon. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
4285
4286Py_ssize_t PyUnicode_Count(PyObject *str,
4287                           PyObject *substr,
4288                           Py_ssize_t start,
4289                           Py_ssize_t end)
4290{
4291    Py_ssize_t result;
4292    PyUnicodeObject* str_obj;
4293    PyUnicodeObject* sub_obj;
4294
4295    str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4296    if (!str_obj)
4297	return -1;
4298    sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4299    if (!sub_obj) {
4300	Py_DECREF(str_obj);
4301	return -1;
4302    }
4303
4304    FIX_START_END(str_obj);
4305
4306    result = stringlib_count(
4307        str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4308        );
4309
4310    Py_DECREF(sub_obj);
4311    Py_DECREF(str_obj);
4312
4313    return result;
4314}
4315
4316Py_ssize_t PyUnicode_Find(PyObject *str,
4317                          PyObject *sub,
4318                          Py_ssize_t start,
4319                          Py_ssize_t end,
4320                          int direction)
4321{
4322    Py_ssize_t result;
4323
4324    str = PyUnicode_FromObject(str);
4325    if (!str)
4326	return -2;
4327    sub = PyUnicode_FromObject(sub);
4328    if (!sub) {
4329	Py_DECREF(str);
4330	return -2;
4331    }
4332
4333    if (direction > 0)
4334        result = stringlib_find_slice(
4335            PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4336            PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4337            start, end
4338            );
4339    else
4340        result = stringlib_rfind_slice(
4341            PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4342            PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4343            start, end
4344            );
4345
4346    Py_DECREF(str);
4347    Py_DECREF(sub);
4348
4349    return result;
4350}
4351
4352static
4353int tailmatch(PyUnicodeObject *self,
4354	      PyUnicodeObject *substring,
4355	      Py_ssize_t start,
4356	      Py_ssize_t end,
4357	      int direction)
4358{
4359    if (substring->length == 0)
4360        return 1;
4361
4362    FIX_START_END(self);
4363
4364    end -= substring->length;
4365    if (end < start)
4366	return 0;
4367
4368    if (direction > 0) {
4369	if (Py_UNICODE_MATCH(self, end, substring))
4370	    return 1;
4371    } else {
4372        if (Py_UNICODE_MATCH(self, start, substring))
4373	    return 1;
4374    }
4375
4376    return 0;
4377}
4378
4379Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
4380			PyObject *substr,
4381			Py_ssize_t start,
4382			Py_ssize_t end,
4383			int direction)
4384{
4385    Py_ssize_t result;
4386
4387    str = PyUnicode_FromObject(str);
4388    if (str == NULL)
4389	return -1;
4390    substr = PyUnicode_FromObject(substr);
4391    if (substr == NULL) {
4392	Py_DECREF(str);
4393	return -1;
4394    }
4395
4396    result = tailmatch((PyUnicodeObject *)str,
4397		       (PyUnicodeObject *)substr,
4398		       start, end, direction);
4399    Py_DECREF(str);
4400    Py_DECREF(substr);
4401    return result;
4402}
4403
4404/* Apply fixfct filter to the Unicode object self and return a
4405   reference to the modified object */
4406
4407static
4408PyObject *fixup(PyUnicodeObject *self,
4409		int (*fixfct)(PyUnicodeObject *s))
4410{
4411
4412    PyUnicodeObject *u;
4413
4414    u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
4415    if (u == NULL)
4416	return NULL;
4417
4418    Py_UNICODE_COPY(u->str, self->str, self->length);
4419
4420    if (!fixfct(u) && PyUnicode_CheckExact(self)) {
4421	/* fixfct should return TRUE if it modified the buffer. If
4422	   FALSE, return a reference to the original buffer instead
4423	   (to save space, not time) */
4424	Py_INCREF(self);
4425	Py_DECREF(u);
4426	return (PyObject*) self;
4427    }
4428    return (PyObject*) u;
4429}
4430
4431static
4432int fixupper(PyUnicodeObject *self)
4433{
4434    Py_ssize_t len = self->length;
4435    Py_UNICODE *s = self->str;
4436    int status = 0;
4437
4438    while (len-- > 0) {
4439	register Py_UNICODE ch;
4440
4441	ch = Py_UNICODE_TOUPPER(*s);
4442	if (ch != *s) {
4443            status = 1;
4444	    *s = ch;
4445	}
4446        s++;
4447    }
4448
4449    return status;
4450}
4451
4452static
4453int fixlower(PyUnicodeObject *self)
4454{
4455    Py_ssize_t len = self->length;
4456    Py_UNICODE *s = self->str;
4457    int status = 0;
4458
4459    while (len-- > 0) {
4460	register Py_UNICODE ch;
4461
4462	ch = Py_UNICODE_TOLOWER(*s);
4463	if (ch != *s) {
4464            status = 1;
4465	    *s = ch;
4466	}
4467        s++;
4468    }
4469
4470    return status;
4471}
4472
4473static
4474int fixswapcase(PyUnicodeObject *self)
4475{
4476    Py_ssize_t len = self->length;
4477    Py_UNICODE *s = self->str;
4478    int status = 0;
4479
4480    while (len-- > 0) {
4481        if (Py_UNICODE_ISUPPER(*s)) {
4482            *s = Py_UNICODE_TOLOWER(*s);
4483            status = 1;
4484        } else if (Py_UNICODE_ISLOWER(*s)) {
4485            *s = Py_UNICODE_TOUPPER(*s);
4486            status = 1;
4487        }
4488        s++;
4489    }
4490
4491    return status;
4492}
4493
4494static
4495int fixcapitalize(PyUnicodeObject *self)
4496{
4497    Py_ssize_t len = self->length;
4498    Py_UNICODE *s = self->str;
4499    int status = 0;
4500
4501    if (len == 0)
4502	return 0;
4503    if (Py_UNICODE_ISLOWER(*s)) {
4504	*s = Py_UNICODE_TOUPPER(*s);
4505	status = 1;
4506    }
4507    s++;
4508    while (--len > 0) {
4509        if (Py_UNICODE_ISUPPER(*s)) {
4510            *s = Py_UNICODE_TOLOWER(*s);
4511            status = 1;
4512        }
4513        s++;
4514    }
4515    return status;
4516}
4517
4518static
4519int fixtitle(PyUnicodeObject *self)
4520{
4521    register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4522    register Py_UNICODE *e;
4523    int previous_is_cased;
4524
4525    /* Shortcut for single character strings */
4526    if (PyUnicode_GET_SIZE(self) == 1) {
4527	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4528	if (*p != ch) {
4529	    *p = ch;
4530	    return 1;
4531	}
4532	else
4533	    return 0;
4534    }
4535
4536    e = p + PyUnicode_GET_SIZE(self);
4537    previous_is_cased = 0;
4538    for (; p < e; p++) {
4539	register const Py_UNICODE ch = *p;
4540
4541	if (previous_is_cased)
4542	    *p = Py_UNICODE_TOLOWER(ch);
4543	else
4544	    *p = Py_UNICODE_TOTITLE(ch);
4545
4546	if (Py_UNICODE_ISLOWER(ch) ||
4547	    Py_UNICODE_ISUPPER(ch) ||
4548	    Py_UNICODE_ISTITLE(ch))
4549	    previous_is_cased = 1;
4550	else
4551	    previous_is_cased = 0;
4552    }
4553    return 1;
4554}
4555
4556PyObject *
4557PyUnicode_Join(PyObject *separator, PyObject *seq)
4558{
4559    PyObject *internal_separator = NULL;
4560    const Py_UNICODE blank = ' ';
4561    const Py_UNICODE *sep = &blank;
4562    Py_ssize_t seplen = 1;
4563    PyUnicodeObject *res = NULL; /* the result */
4564    Py_ssize_t res_alloc = 100;  /* # allocated bytes for string in res */
4565    Py_ssize_t res_used;         /* # used bytes */
4566    Py_UNICODE *res_p;       /* pointer to free byte in res's string area */
4567    PyObject *fseq;          /* PySequence_Fast(seq) */
4568    Py_ssize_t seqlen;              /* len(fseq) -- number of items in sequence */
4569    PyObject *item;
4570    Py_ssize_t i;
4571
4572    fseq = PySequence_Fast(seq, "");
4573    if (fseq == NULL) {
4574    	return NULL;
4575    }
4576
4577    /* Grrrr.  A codec may be invoked to convert str objects to
4578     * Unicode, and so it's possible to call back into Python code
4579     * during PyUnicode_FromObject(), and so it's possible for a sick
4580     * codec to change the size of fseq (if seq is a list).  Therefore
4581     * we have to keep refetching the size -- can't assume seqlen
4582     * is invariant.
4583     */
4584    seqlen = PySequence_Fast_GET_SIZE(fseq);
4585    /* If empty sequence, return u"". */
4586    if (seqlen == 0) {
4587    	res = _PyUnicode_New(0);  /* empty sequence; return u"" */
4588    	goto Done;
4589    }
4590    /* If singleton sequence with an exact Unicode, return that. */
4591    if (seqlen == 1) {
4592	item = PySequence_Fast_GET_ITEM(fseq, 0);
4593	if (PyUnicode_CheckExact(item)) {
4594	    Py_INCREF(item);
4595	    res = (PyUnicodeObject *)item;
4596	    goto Done;
4597	}
4598    }
4599
4600    /* At least two items to join, or one that isn't exact Unicode. */
4601    if (seqlen > 1) {
4602        /* Set up sep and seplen -- they're needed. */
4603    	if (separator == NULL) {
4604	    sep = &blank;
4605	    seplen = 1;
4606        }
4607    	else {
4608	    internal_separator = PyUnicode_FromObject(separator);
4609	    if (internal_separator == NULL)
4610	        goto onError;
4611	    sep = PyUnicode_AS_UNICODE(internal_separator);
4612	    seplen = PyUnicode_GET_SIZE(internal_separator);
4613	    /* In case PyUnicode_FromObject() mutated seq. */
4614	    seqlen = PySequence_Fast_GET_SIZE(fseq);
4615        }
4616    }
4617
4618    /* Get space. */
4619    res = _PyUnicode_New(res_alloc);
4620    if (res == NULL)
4621        goto onError;
4622    res_p = PyUnicode_AS_UNICODE(res);
4623    res_used = 0;
4624
4625    for (i = 0; i < seqlen; ++i) {
4626	Py_ssize_t itemlen;
4627	Py_ssize_t new_res_used;
4628
4629	item = PySequence_Fast_GET_ITEM(fseq, i);
4630	/* Convert item to Unicode. */
4631	if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4632	    PyErr_Format(PyExc_TypeError,
4633			 "sequence item %zd: expected string or Unicode,"
4634			 " %.80s found",
4635			 i, item->ob_type->tp_name);
4636	    goto onError;
4637	}
4638	item = PyUnicode_FromObject(item);
4639	if (item == NULL)
4640	    goto onError;
4641	/* We own a reference to item from here on. */
4642
4643	/* In case PyUnicode_FromObject() mutated seq. */
4644	seqlen = PySequence_Fast_GET_SIZE(fseq);
4645
4646        /* Make sure we have enough space for the separator and the item. */
4647	itemlen = PyUnicode_GET_SIZE(item);
4648	new_res_used = res_used + itemlen;
4649	if (new_res_used < 0)
4650	    goto Overflow;
4651	if (i < seqlen - 1) {
4652	    new_res_used += seplen;
4653	    if (new_res_used < 0)
4654		goto Overflow;
4655	}
4656	if (new_res_used > res_alloc) {
4657	    /* double allocated size until it's big enough */
4658	    do {
4659	        res_alloc += res_alloc;
4660	        if (res_alloc <= 0)
4661	            goto Overflow;
4662	    } while (new_res_used > res_alloc);
4663	    if (_PyUnicode_Resize(&res, res_alloc) < 0) {
4664		Py_DECREF(item);
4665		goto onError;
4666	    }
4667            res_p = PyUnicode_AS_UNICODE(res) + res_used;
4668	}
4669
4670	/* Copy item, and maybe the separator. */
4671	Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
4672	res_p += itemlen;
4673	if (i < seqlen - 1) {
4674	    Py_UNICODE_COPY(res_p, sep, seplen);
4675	    res_p += seplen;
4676	}
4677	Py_DECREF(item);
4678	res_used = new_res_used;
4679    }
4680
4681    /* Shrink res to match the used area; this probably can't fail,
4682     * but it's cheap to check.
4683     */
4684    if (_PyUnicode_Resize(&res, res_used) < 0)
4685	goto onError;
4686
4687 Done:
4688    Py_XDECREF(internal_separator);
4689    Py_DECREF(fseq);
4690    return (PyObject *)res;
4691
4692 Overflow:
4693    PyErr_SetString(PyExc_OverflowError,
4694                    "join() result is too long for a Python string");
4695    Py_DECREF(item);
4696    /* fall through */
4697
4698 onError:
4699    Py_XDECREF(internal_separator);
4700    Py_DECREF(fseq);
4701    Py_XDECREF(res);
4702    return NULL;
4703}
4704
4705static
4706PyUnicodeObject *pad(PyUnicodeObject *self,
4707		     Py_ssize_t left,
4708		     Py_ssize_t right,
4709		     Py_UNICODE fill)
4710{
4711    PyUnicodeObject *u;
4712
4713    if (left < 0)
4714        left = 0;
4715    if (right < 0)
4716        right = 0;
4717
4718    if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
4719        Py_INCREF(self);
4720        return self;
4721    }
4722
4723    u = _PyUnicode_New(left + self->length + right);
4724    if (u) {
4725        if (left)
4726            Py_UNICODE_FILL(u->str, fill, left);
4727        Py_UNICODE_COPY(u->str + left, self->str, self->length);
4728        if (right)
4729            Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4730    }
4731
4732    return u;
4733}
4734
/* Append data[left:right] to `list` as a new Unicode object.

   Relies on the calling scope providing a `PyObject *str` scratch
   variable, the target `list`, and an `onError` label that is jumped
   to when the object cannot be created or appended.  The reference
   held in `str` is released in either case.

   Wrapped in do { ... } while (0) so that the expansion is a single
   statement; invoke it as `SPLIT_APPEND(...);`. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4745
4746static
4747PyObject *split_whitespace(PyUnicodeObject *self,
4748			   PyObject *list,
4749			   Py_ssize_t maxcount)
4750{
4751    register Py_ssize_t i;
4752    register Py_ssize_t j;
4753    Py_ssize_t len = self->length;
4754    PyObject *str;
4755
4756    for (i = j = 0; i < len; ) {
4757	/* find a token */
4758	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4759	    i++;
4760	j = i;
4761	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4762	    i++;
4763	if (j < i) {
4764	    if (maxcount-- <= 0)
4765		break;
4766	    SPLIT_APPEND(self->str, j, i);
4767	    while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4768		i++;
4769	    j = i;
4770	}
4771    }
4772    if (j < len) {
4773	SPLIT_APPEND(self->str, j, len);
4774    }
4775    return list;
4776
4777 onError:
4778    Py_DECREF(list);
4779    return NULL;
4780}
4781
4782PyObject *PyUnicode_Splitlines(PyObject *string,
4783			       int keepends)
4784{
4785    register Py_ssize_t i;
4786    register Py_ssize_t j;
4787    Py_ssize_t len;
4788    PyObject *list;
4789    PyObject *str;
4790    Py_UNICODE *data;
4791
4792    string = PyUnicode_FromObject(string);
4793    if (string == NULL)
4794	return NULL;
4795    data = PyUnicode_AS_UNICODE(string);
4796    len = PyUnicode_GET_SIZE(string);
4797
4798    list = PyList_New(0);
4799    if (!list)
4800        goto onError;
4801
4802    for (i = j = 0; i < len; ) {
4803	Py_ssize_t eol;
4804
4805	/* Find a line and append it */
4806	while (i < len && !BLOOM_LINEBREAK(data[i]))
4807	    i++;
4808
4809	/* Skip the line break reading CRLF as one line break */
4810	eol = i;
4811	if (i < len) {
4812	    if (data[i] == '\r' && i + 1 < len &&
4813		data[i+1] == '\n')
4814		i += 2;
4815	    else
4816		i++;
4817	    if (keepends)
4818		eol = i;
4819	}
4820	SPLIT_APPEND(data, j, eol);
4821	j = i;
4822    }
4823    if (j < len) {
4824	SPLIT_APPEND(data, j, len);
4825    }
4826
4827    Py_DECREF(string);
4828    return list;
4829
4830 onError:
4831    Py_XDECREF(list);
4832    Py_DECREF(string);
4833    return NULL;
4834}
4835
4836static
4837PyObject *split_char(PyUnicodeObject *self,
4838		     PyObject *list,
4839		     Py_UNICODE ch,
4840		     Py_ssize_t maxcount)
4841{
4842    register Py_ssize_t i;
4843    register Py_ssize_t j;
4844    Py_ssize_t len = self->length;
4845    PyObject *str;
4846
4847    for (i = j = 0; i < len; ) {
4848	if (self->str[i] == ch) {
4849	    if (maxcount-- <= 0)
4850		break;
4851	    SPLIT_APPEND(self->str, j, i);
4852	    i = j = i + 1;
4853	} else
4854	    i++;
4855    }
4856    if (j <= len) {
4857	SPLIT_APPEND(self->str, j, len);
4858    }
4859    return list;
4860
4861 onError:
4862    Py_DECREF(list);
4863    return NULL;
4864}
4865
4866static
4867PyObject *split_substring(PyUnicodeObject *self,
4868			  PyObject *list,
4869			  PyUnicodeObject *substring,
4870			  Py_ssize_t maxcount)
4871{
4872    register Py_ssize_t i;
4873    register Py_ssize_t j;
4874    Py_ssize_t len = self->length;
4875    Py_ssize_t sublen = substring->length;
4876    PyObject *str;
4877
4878    for (i = j = 0; i <= len - sublen; ) {
4879	if (Py_UNICODE_MATCH(self, i, substring)) {
4880	    if (maxcount-- <= 0)
4881		break;
4882	    SPLIT_APPEND(self->str, j, i);
4883	    i = j = i + sublen;
4884	} else
4885	    i++;
4886    }
4887    if (j <= len) {
4888	SPLIT_APPEND(self->str, j, len);
4889    }
4890    return list;
4891
4892 onError:
4893    Py_DECREF(list);
4894    return NULL;
4895}
4896
4897static
4898PyObject *rsplit_whitespace(PyUnicodeObject *self,
4899			    PyObject *list,
4900			    Py_ssize_t maxcount)
4901{
4902    register Py_ssize_t i;
4903    register Py_ssize_t j;
4904    Py_ssize_t len = self->length;
4905    PyObject *str;
4906
4907    for (i = j = len - 1; i >= 0; ) {
4908	/* find a token */
4909	while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4910	    i--;
4911	j = i;
4912	while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4913	    i--;
4914	if (j > i) {
4915	    if (maxcount-- <= 0)
4916		break;
4917	    SPLIT_APPEND(self->str, i + 1, j + 1);
4918	    while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4919		i--;
4920	    j = i;
4921	}
4922    }
4923    if (j >= 0) {
4924	SPLIT_APPEND(self->str, 0, j + 1);
4925    }
4926    if (PyList_Reverse(list) < 0)
4927        goto onError;
4928    return list;
4929
4930 onError:
4931    Py_DECREF(list);
4932    return NULL;
4933}
4934
4935static
4936PyObject *rsplit_char(PyUnicodeObject *self,
4937		      PyObject *list,
4938		      Py_UNICODE ch,
4939		      Py_ssize_t maxcount)
4940{
4941    register Py_ssize_t i;
4942    register Py_ssize_t j;
4943    Py_ssize_t len = self->length;
4944    PyObject *str;
4945
4946    for (i = j = len - 1; i >= 0; ) {
4947	if (self->str[i] == ch) {
4948	    if (maxcount-- <= 0)
4949		break;
4950	    SPLIT_APPEND(self->str, i + 1, j + 1);
4951	    j = i = i - 1;
4952	} else
4953	    i--;
4954    }
4955    if (j >= -1) {
4956	SPLIT_APPEND(self->str, 0, j + 1);
4957    }
4958    if (PyList_Reverse(list) < 0)
4959        goto onError;
4960    return list;
4961
4962 onError:
4963    Py_DECREF(list);
4964    return NULL;
4965}
4966
4967static
4968PyObject *rsplit_substring(PyUnicodeObject *self,
4969			   PyObject *list,
4970			   PyUnicodeObject *substring,
4971			   Py_ssize_t maxcount)
4972{
4973    register Py_ssize_t i;
4974    register Py_ssize_t j;
4975    Py_ssize_t len = self->length;
4976    Py_ssize_t sublen = substring->length;
4977    PyObject *str;
4978
4979    for (i = len - sublen, j = len; i >= 0; ) {
4980	if (Py_UNICODE_MATCH(self, i, substring)) {
4981	    if (maxcount-- <= 0)
4982		break;
4983	    SPLIT_APPEND(self->str, i + sublen, j);
4984	    j = i;
4985	    i -= sublen;
4986	} else
4987	    i--;
4988    }
4989    if (j >= 0) {
4990	SPLIT_APPEND(self->str, 0, j);
4991    }
4992    if (PyList_Reverse(list) < 0)
4993        goto onError;
4994    return list;
4995
4996 onError:
4997    Py_DECREF(list);
4998    return NULL;
4999}
5000
5001#undef SPLIT_APPEND
5002
5003static
5004PyObject *split(PyUnicodeObject *self,
5005		PyUnicodeObject *substring,
5006		Py_ssize_t maxcount)
5007{
5008    PyObject *list;
5009
5010    if (maxcount < 0)
5011        maxcount = PY_SSIZE_T_MAX;
5012
5013    list = PyList_New(0);
5014    if (!list)
5015        return NULL;
5016
5017    if (substring == NULL)
5018	return split_whitespace(self,list,maxcount);
5019
5020    else if (substring->length == 1)
5021	return split_char(self,list,substring->str[0],maxcount);
5022
5023    else if (substring->length == 0) {
5024	Py_DECREF(list);
5025	PyErr_SetString(PyExc_ValueError, "empty separator");
5026	return NULL;
5027    }
5028    else
5029	return split_substring(self,list,substring,maxcount);
5030}
5031
5032static
5033PyObject *rsplit(PyUnicodeObject *self,
5034		 PyUnicodeObject *substring,
5035		 Py_ssize_t maxcount)
5036{
5037    PyObject *list;
5038
5039    if (maxcount < 0)
5040        maxcount = PY_SSIZE_T_MAX;
5041
5042    list = PyList_New(0);
5043    if (!list)
5044        return NULL;
5045
5046    if (substring == NULL)
5047	return rsplit_whitespace(self,list,maxcount);
5048
5049    else if (substring->length == 1)
5050	return rsplit_char(self,list,substring->str[0],maxcount);
5051
5052    else if (substring->length == 0) {
5053	Py_DECREF(list);
5054	PyErr_SetString(PyExc_ValueError, "empty separator");
5055	return NULL;
5056    }
5057    else
5058	return rsplit_substring(self,list,substring,maxcount);
5059}
5060
5061static
5062PyObject *replace(PyUnicodeObject *self,
5063		  PyUnicodeObject *str1,
5064		  PyUnicodeObject *str2,
5065		  Py_ssize_t maxcount)
5066{
5067    PyUnicodeObject *u;
5068
5069    if (maxcount < 0)
5070	maxcount = PY_SSIZE_T_MAX;
5071
5072    if (str1->length == str2->length) {
5073        /* same length */
5074        Py_ssize_t i;
5075        if (str1->length == 1) {
5076            /* replace characters */
5077            Py_UNICODE u1, u2;
5078            if (!findchar(self->str, self->length, str1->str[0]))
5079                goto nothing;
5080            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5081            if (!u)
5082                return NULL;
5083            Py_UNICODE_COPY(u->str, self->str, self->length);
5084            u1 = str1->str[0];
5085            u2 = str2->str[0];
5086            for (i = 0; i < u->length; i++)
5087                if (u->str[i] == u1) {
5088                    if (--maxcount < 0)
5089                        break;
5090                    u->str[i] = u2;
5091                }
5092        } else {
5093            i = fastsearch(
5094                self->str, self->length, str1->str, str1->length, FAST_SEARCH
5095                );
5096            if (i < 0)
5097                goto nothing;
5098            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5099            if (!u)
5100                return NULL;
5101            Py_UNICODE_COPY(u->str, self->str, self->length);
5102            while (i <= self->length - str1->length)
5103                if (Py_UNICODE_MATCH(self, i, str1)) {
5104                    if (--maxcount < 0)
5105                        break;
5106                    Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5107                    i += str1->length;
5108                } else
5109                    i++;
5110        }
5111    } else {
5112
5113        Py_ssize_t n, i, j, e;
5114        Py_ssize_t product, new_size, delta;
5115        Py_UNICODE *p;
5116
5117        /* replace strings */
5118        n = stringlib_count(self->str, self->length, str1->str, str1->length);
5119        if (n > maxcount)
5120            n = maxcount;
5121        if (n == 0)
5122            goto nothing;
5123        /* new_size = self->length + n * (str2->length - str1->length)); */
5124        delta = (str2->length - str1->length);
5125        if (delta == 0) {
5126            new_size = self->length;
5127        } else {
5128            product = n * (str2->length - str1->length);
5129            if ((product / (str2->length - str1->length)) != n) {
5130                PyErr_SetString(PyExc_OverflowError,
5131                                "replace string is too long");
5132                return NULL;
5133            }
5134            new_size = self->length + product;
5135            if (new_size < 0) {
5136                PyErr_SetString(PyExc_OverflowError,
5137                                "replace string is too long");
5138                return NULL;
5139            }
5140        }
5141        u = _PyUnicode_New(new_size);
5142        if (!u)
5143            return NULL;
5144        i = 0;
5145        p = u->str;
5146        e = self->length - str1->length;
5147        if (str1->length > 0) {
5148            while (n-- > 0) {
5149                /* look for next match */
5150                j = i;
5151                while (j <= e) {
5152                    if (Py_UNICODE_MATCH(self, j, str1))
5153                        break;
5154                    j++;
5155                }
5156		if (j > i) {
5157                    if (j > e)
5158                        break;
5159                    /* copy unchanged part [i:j] */
5160                    Py_UNICODE_COPY(p, self->str+i, j-i);
5161                    p += j - i;
5162                }
5163                /* copy substitution string */
5164                if (str2->length > 0) {
5165                    Py_UNICODE_COPY(p, str2->str, str2->length);
5166                    p += str2->length;
5167                }
5168                i = j + str1->length;
5169            }
5170            if (i < self->length)
5171                /* copy tail [i:] */
5172                Py_UNICODE_COPY(p, self->str+i, self->length-i);
5173        } else {
5174            /* interleave */
5175            while (n > 0) {
5176                Py_UNICODE_COPY(p, str2->str, str2->length);
5177                p += str2->length;
5178                if (--n <= 0)
5179                    break;
5180                *p++ = self->str[i++];
5181            }
5182            Py_UNICODE_COPY(p, self->str+i, self->length-i);
5183        }
5184    }
5185    return (PyObject *) u;
5186
5187nothing:
5188    /* nothing to replace; return original string (when possible) */
5189    if (PyUnicode_CheckExact(self)) {
5190        Py_INCREF(self);
5191        return (PyObject *) self;
5192    }
5193    return PyUnicode_FromUnicode(self->str, self->length);
5194}
5195
5196/* --- Unicode Object Methods --------------------------------------------- */
5197
PyDoc_STRVAR(title__doc__,
"S.title() -> unicode\n\
\n\
Return a titlecased version of S, i.e. words start with title case\n\
characters, all remaining cased characters have lower case.");

/* Implements S.title(): hands self to fixup() together with the
   fixtitle callback and returns whatever fixup() returns (both helpers
   are defined elsewhere in this file). */
static PyObject*
unicode_title(PyUnicodeObject *self)
{
    return fixup(self, fixtitle);
}
5209
PyDoc_STRVAR(capitalize__doc__,
"S.capitalize() -> unicode\n\
\n\
Return a capitalized version of S, i.e. make the first character\n\
have upper case.");

/* Implements S.capitalize(): hands self to fixup() together with the
   fixcapitalize callback and returns whatever fixup() returns (both
   helpers are defined elsewhere in this file). */
static PyObject*
unicode_capitalize(PyUnicodeObject *self)
{
    return fixup(self, fixcapitalize);
}
5221
#if 0
/* NOTE: everything up to the matching #endif is compiled out. */
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self into words, capitalize each word in place inside the
   list, then join the words back together.  Returns a new reference,
   or NULL with an exception set if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *list;
    PyObject *item;
    Py_ssize_t i;

    /* Split into words */
    list = split(self, NULL, -1);
    if (!list)
        return NULL;

    /* Capitalize each word */
    for (i = 0; i < PyList_GET_SIZE(list); i++) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
		     fixcapitalize);
        if (item == NULL)
            goto onError;
        /* Release the old word before storing its replacement in the
           same slot. */
        Py_DECREF(PyList_GET_ITEM(list, i));
        PyList_SET_ITEM(list, i, item);
    }

    /* Join the words to form a new string */
    /* NOTE(review): the separator argument is NULL; presumably that
       selects a single-space separator, as the docstring implies --
       confirm against PyUnicode_Join. */
    item = PyUnicode_Join(NULL, list);

    /* Reached on success as well as on failure; item is NULL in the
       failure cases, so the return below reports the error. */
onError:
    Py_DECREF(list);
    return (PyObject *)item;
}
#endif
5259
5260/* Argument converter.  Coerces to a single unicode character */
5261
5262static int
5263convert_uc(PyObject *obj, void *addr)
5264{
5265	Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5266	PyObject *uniobj;
5267	Py_UNICODE *unistr;
5268
5269	uniobj = PyUnicode_FromObject(obj);
5270	if (uniobj == NULL) {
5271		PyErr_SetString(PyExc_TypeError,
5272			"The fill character cannot be converted to Unicode");
5273		return 0;
5274	}
5275	if (PyUnicode_GET_SIZE(uniobj) != 1) {
5276		PyErr_SetString(PyExc_TypeError,
5277			"The fill character must be exactly one character long");
5278		Py_DECREF(uniobj);
5279		return 0;
5280	}
5281	unistr = PyUnicode_AS_UNICODE(uniobj);
5282	*fillcharloc = unistr[0];
5283	Py_DECREF(uniobj);
5284	return 1;
5285}
5286
5287PyDoc_STRVAR(center__doc__,
5288"S.center(width[, fillchar]) -> unicode\n\
5289\n\
5290Return S centered in a Unicode string of length width. Padding is\n\
5291done using the specified fill character (default is a space)");
5292
5293static PyObject *
5294unicode_center(PyUnicodeObject *self, PyObject *args)
5295{
5296    Py_ssize_t marg, left;
5297    Py_ssize_t width;
5298    Py_UNICODE fillchar = ' ';
5299
5300    if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
5301        return NULL;
5302
5303    if (self->length >= width && PyUnicode_CheckExact(self)) {
5304        Py_INCREF(self);
5305        return (PyObject*) self;
5306    }
5307
5308    marg = width - self->length;
5309    left = marg / 2 + (marg & width & 1);
5310
5311    return (PyObject*) pad(self, left, marg - left, fillchar);
5312}
5313
5314#if 0
5315
/* This code should go into some future Unicode collation support
   module. The basic comparison should compare ordinals on a naive
   basis (this is what Java does and thus JPython too). */

/* speedy UTF-16 code point order comparison */
/* gleaned from: */
/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */

/* NOTE: this variant sits in the "#if 0" arm and is compiled out; the
   "#else" arm further down provides the active unicode_compare(). */

/* Adjustment table indexed by (code unit >> 11), i.e. one entry per
   2048-value slice.  Entry 27 (0xD800..0xDFFF) is shifted up by 0x2000
   and entries 28..31 (0xE000..0xFFFF) are shifted down by 0x800, so the
   0xD800..0xDFFF slice ends up ordered after 0xE000..0xFFFF.
   NOTE(review): 32 entries only cover values below 0x10000 -- this
   presumably assumes 16-bit code units; confirm before enabling. */
static short utf16Fixup[32] =
{
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
};

/* Compare two unicode objects element by element after applying the
   utf16Fixup adjustment.  Returns -1, 0 or 1. */
static int
unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
{
    Py_ssize_t len1, len2;

    Py_UNICODE *s1 = str1->str;
    Py_UNICODE *s2 = str2->str;

    len1 = str1->length;
    len2 = str2->length;

    while (len1 > 0 && len2 > 0) {
        Py_UNICODE c1, c2;

        c1 = *s1++;
        c2 = *s2++;

        /* (1<<11) * 26 == 0xD000: values at or below it need no
           adjustment, so the table lookup is skipped for them. */
	if (c1 > (1<<11) * 26)
	    c1 += utf16Fixup[c1>>11];
	if (c2 > (1<<11) * 26)
            c2 += utf16Fixup[c2>>11];
        /* now c1 and c2 are in UTF-32-compatible order */

        if (c1 != c2)
            return (c1 < c2) ? -1 : 1;

        len1--; len2--;
    }

    /* One string is a prefix of the other: the shorter one sorts
       first, equal lengths compare equal. */
    return (len1 < len2) ? -1 : (len1 != len2);
}
5363
5364#else
5365
5366static int
5367unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5368{
5369    register Py_ssize_t len1, len2;
5370
5371    Py_UNICODE *s1 = str1->str;
5372    Py_UNICODE *s2 = str2->str;
5373
5374    len1 = str1->length;
5375    len2 = str2->length;
5376
5377    while (len1 > 0 && len2 > 0) {
5378        Py_UNICODE c1, c2;
5379
5380        c1 = *s1++;
5381        c2 = *s2++;
5382
5383        if (c1 != c2)
5384            return (c1 < c2) ? -1 : 1;
5385
5386        len1--; len2--;
5387    }
5388
5389    return (len1 < len2) ? -1 : (len1 != len2);
5390}
5391
5392#endif
5393
5394int PyUnicode_Compare(PyObject *left,
5395		      PyObject *right)
5396{
5397    PyUnicodeObject *u = NULL, *v = NULL;
5398    int result;
5399
5400    /* Coerce the two arguments */
5401    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5402    if (u == NULL)
5403	goto onError;
5404    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5405    if (v == NULL)
5406	goto onError;
5407
5408    /* Shortcut for empty or interned objects */
5409    if (v == u) {
5410	Py_DECREF(u);
5411	Py_DECREF(v);
5412	return 0;
5413    }
5414
5415    result = unicode_compare(u, v);
5416
5417    Py_DECREF(u);
5418    Py_DECREF(v);
5419    return result;
5420
5421onError:
5422    Py_XDECREF(u);
5423    Py_XDECREF(v);
5424    return -1;
5425}
5426
5427PyObject *PyUnicode_RichCompare(PyObject *left,
5428                                PyObject *right,
5429                                int op)
5430{
5431    int result;
5432
5433    result = PyUnicode_Compare(left, right);
5434    if (result == -1 && PyErr_Occurred())
5435        goto onError;
5436
5437    /* Convert the return value to a Boolean */
5438    switch (op) {
5439    case Py_EQ:
5440        result = (result == 0);
5441        break;
5442    case Py_NE:
5443        result = (result != 0);
5444        break;
5445    case Py_LE:
5446        result = (result <= 0);
5447        break;
5448    case Py_GE:
5449        result = (result >= 0);
5450        break;
5451    case Py_LT:
5452        result = (result == -1);
5453        break;
5454    case Py_GT:
5455        result = (result == 1);
5456        break;
5457    }
5458    return PyBool_FromLong(result);
5459
5460 onError:
5461
5462    /* Standard case
5463
5464       Type errors mean that PyUnicode_FromObject() could not convert
5465       one of the arguments (usually the right hand side) to Unicode,
5466       ie. we can't handle the comparison request. However, it is
5467       possible that the other object knows a comparison method, which
5468       is why we return Py_NotImplemented to give the other object a
5469       chance.
5470
5471    */
5472    if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5473        PyErr_Clear();
5474        Py_INCREF(Py_NotImplemented);
5475        return Py_NotImplemented;
5476    }
5477    if (op != Py_EQ && op != Py_NE)
5478        return NULL;
5479
5480    /* Equality comparison.
5481
5482       This is a special case: we silence any PyExc_UnicodeDecodeError
5483       and instead turn it into a PyErr_UnicodeWarning.
5484
5485    */
5486    if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5487        return NULL;
5488    PyErr_Clear();
5489    if (PyErr_Warn(PyExc_UnicodeWarning,
5490                   (op == Py_EQ) ?
5491                   "Unicode equal comparison "
5492                   "failed to convert both arguments to Unicode - "
5493                   "interpreting them as being unequal" :
5494                   "Unicode unequal comparison "
5495                   "failed to convert both arguments to Unicode - "
5496                   "interpreting them as being unequal"
5497                   ) < 0)
5498        return NULL;
5499    result = (op == Py_NE);
5500    return PyBool_FromLong(result);
5501}
5502
5503int PyUnicode_Contains(PyObject *container,
5504		       PyObject *element)
5505{
5506    PyObject *str, *sub;
5507    int result;
5508
5509    /* Coerce the two arguments */
5510    sub = PyUnicode_FromObject(element);
5511    if (!sub) {
5512	PyErr_SetString(PyExc_TypeError,
5513	    "'in <string>' requires string as left operand");
5514        return -1;
5515    }
5516
5517    str = PyUnicode_FromObject(container);
5518    if (!str) {
5519        Py_DECREF(sub);
5520        return -1;
5521    }
5522
5523    result = stringlib_contains_obj(str, sub);
5524
5525    Py_DECREF(str);
5526    Py_DECREF(sub);
5527
5528    return result;
5529}
5530
5531/* Concat to string or Unicode object giving a new Unicode object. */
5532
5533PyObject *PyUnicode_Concat(PyObject *left,
5534			   PyObject *right)
5535{
5536    PyUnicodeObject *u = NULL, *v = NULL, *w;
5537
5538    if (PyBytes_Check(left) || PyBytes_Check(right))
5539        return PyBytes_Concat(left, right);
5540
5541    /* Coerce the two arguments */
5542    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5543    if (u == NULL)
5544	goto onError;
5545    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5546    if (v == NULL)
5547	goto onError;
5548
5549    /* Shortcuts */
5550    if (v == unicode_empty) {
5551	Py_DECREF(v);
5552	return (PyObject *)u;
5553    }
5554    if (u == unicode_empty) {
5555	Py_DECREF(u);
5556	return (PyObject *)v;
5557    }
5558
5559    /* Concat the two Unicode strings */
5560    w = _PyUnicode_New(u->length + v->length);
5561    if (w == NULL)
5562	goto onError;
5563    Py_UNICODE_COPY(w->str, u->str, u->length);
5564    Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5565
5566    Py_DECREF(u);
5567    Py_DECREF(v);
5568    return (PyObject *)w;
5569
5570onError:
5571    Py_XDECREF(u);
5572    Py_XDECREF(v);
5573    return NULL;
5574}
5575
5576PyDoc_STRVAR(count__doc__,
5577"S.count(sub[, start[, end]]) -> int\n\
5578\n\
5579Return the number of non-overlapping occurrences of substring sub in\n\
5580Unicode string S[start:end].  Optional arguments start and end are\n\
5581interpreted as in slice notation.");
5582
5583static PyObject *
5584unicode_count(PyUnicodeObject *self, PyObject *args)
5585{
5586    PyUnicodeObject *substring;
5587    Py_ssize_t start = 0;
5588    Py_ssize_t end = PY_SSIZE_T_MAX;
5589    PyObject *result;
5590
5591    if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5592		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5593        return NULL;
5594
5595    substring = (PyUnicodeObject *)PyUnicode_FromObject(
5596        (PyObject *)substring);
5597    if (substring == NULL)
5598	return NULL;
5599
5600    FIX_START_END(self);
5601
5602    result = PyInt_FromSsize_t(
5603        stringlib_count(self->str + start, end - start,
5604                        substring->str, substring->length)
5605        );
5606
5607    Py_DECREF(substring);
5608
5609    return result;
5610}
5611
5612PyDoc_STRVAR(encode__doc__,
5613"S.encode([encoding[,errors]]) -> string or unicode\n\
5614\n\
5615Encodes S using the codec registered for encoding. encoding defaults\n\
5616to the default encoding. errors may be given to set a different error\n\
5617handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5618a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5619'xmlcharrefreplace' as well as any other name registered with\n\
5620codecs.register_error that can handle UnicodeEncodeErrors.");
5621
5622static PyObject *
5623unicode_encode(PyUnicodeObject *self, PyObject *args)
5624{
5625    char *encoding = NULL;
5626    char *errors = NULL;
5627    PyObject *v;
5628
5629    if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5630        return NULL;
5631    v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
5632    if (v == NULL)
5633        goto onError;
5634    if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5635        PyErr_Format(PyExc_TypeError,
5636                     "encoder did not return a string/unicode object "
5637                     "(type=%.400s)",
5638                     v->ob_type->tp_name);
5639        Py_DECREF(v);
5640        return NULL;
5641    }
5642    return v;
5643
5644 onError:
5645    return NULL;
5646}
5647
5648PyDoc_STRVAR(decode__doc__,
5649"S.decode([encoding[,errors]]) -> string or unicode\n\
5650\n\
5651Decodes S using the codec registered for encoding. encoding defaults\n\
5652to the default encoding. errors may be given to set a different error\n\
5653handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5654a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5655as well as any other name registerd with codecs.register_error that is\n\
5656able to handle UnicodeDecodeErrors.");
5657
5658static PyObject *
5659unicode_decode(PyUnicodeObject *self, PyObject *args)
5660{
5661    char *encoding = NULL;
5662    char *errors = NULL;
5663    PyObject *v;
5664
5665    if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5666        return NULL;
5667    v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
5668    if (v == NULL)
5669        goto onError;
5670    if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5671        PyErr_Format(PyExc_TypeError,
5672                     "decoder did not return a string/unicode object "
5673                     "(type=%.400s)",
5674                     v->ob_type->tp_name);
5675        Py_DECREF(v);
5676        return NULL;
5677    }
5678    return v;
5679
5680 onError:
5681    return NULL;
5682}
5683
5684PyDoc_STRVAR(expandtabs__doc__,
5685"S.expandtabs([tabsize]) -> unicode\n\
5686\n\
5687Return a copy of S where all tab characters are expanded using spaces.\n\
5688If tabsize is not given, a tab size of 8 characters is assumed.");
5689
5690static PyObject*
5691unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5692{
5693    Py_UNICODE *e;
5694    Py_UNICODE *p;
5695    Py_UNICODE *q;
5696    Py_ssize_t i, j;
5697    PyUnicodeObject *u;
5698    int tabsize = 8;
5699
5700    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5701	return NULL;
5702
5703    /* First pass: determine size of output string */
5704    i = j = 0;
5705    e = self->str + self->length;
5706    for (p = self->str; p < e; p++)
5707        if (*p == '\t') {
5708	    if (tabsize > 0)
5709		j += tabsize - (j % tabsize);
5710	}
5711        else {
5712            j++;
5713            if (*p == '\n' || *p == '\r') {
5714                i += j;
5715                j = 0;
5716            }
5717        }
5718
5719    /* Second pass: create output string and fill it */
5720    u = _PyUnicode_New(i + j);
5721    if (!u)
5722        return NULL;
5723
5724    j = 0;
5725    q = u->str;
5726
5727    for (p = self->str; p < e; p++)
5728        if (*p == '\t') {
5729	    if (tabsize > 0) {
5730		i = tabsize - (j % tabsize);
5731		j += i;
5732		while (i--)
5733		    *q++ = ' ';
5734	    }
5735	}
5736	else {
5737            j++;
5738	    *q++ = *p;
5739            if (*p == '\n' || *p == '\r')
5740                j = 0;
5741        }
5742
5743    return (PyObject*) u;
5744}
5745
5746PyDoc_STRVAR(find__doc__,
5747"S.find(sub [,start [,end]]) -> int\n\
5748\n\
5749Return the lowest index in S where substring sub is found,\n\
5750such that sub is contained within s[start,end].  Optional\n\
5751arguments start and end are interpreted as in slice notation.\n\
5752\n\
5753Return -1 on failure.");
5754
5755static PyObject *
5756unicode_find(PyUnicodeObject *self, PyObject *args)
5757{
5758    PyObject *substring;
5759    Py_ssize_t start = 0;
5760    Py_ssize_t end = PY_SSIZE_T_MAX;
5761    Py_ssize_t result;
5762
5763    if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5764		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5765        return NULL;
5766    substring = PyUnicode_FromObject(substring);
5767    if (!substring)
5768	return NULL;
5769
5770    result = stringlib_find_slice(
5771        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5772        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5773        start, end
5774        );
5775
5776    Py_DECREF(substring);
5777
5778    return PyInt_FromSsize_t(result);
5779}
5780
5781static PyObject *
5782unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
5783{
5784    if (index < 0 || index >= self->length) {
5785        PyErr_SetString(PyExc_IndexError, "string index out of range");
5786        return NULL;
5787    }
5788
5789    return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5790}
5791
5792static long
5793unicode_hash(PyUnicodeObject *self)
5794{
5795    /* Since Unicode objects compare equal to their ASCII string
5796       counterparts, they should use the individual character values
5797       as basis for their hash value.  This is needed to assure that
5798       strings and Unicode objects behave in the same way as
5799       dictionary keys. */
5800
5801    register Py_ssize_t len;
5802    register Py_UNICODE *p;
5803    register long x;
5804
5805    if (self->hash != -1)
5806	return self->hash;
5807    len = PyUnicode_GET_SIZE(self);
5808    p = PyUnicode_AS_UNICODE(self);
5809    x = *p << 7;
5810    while (--len >= 0)
5811	x = (1000003*x) ^ *p++;
5812    x ^= PyUnicode_GET_SIZE(self);
5813    if (x == -1)
5814	x = -2;
5815    self->hash = x;
5816    return x;
5817}
5818
5819PyDoc_STRVAR(index__doc__,
5820"S.index(sub [,start [,end]]) -> int\n\
5821\n\
5822Like S.find() but raise ValueError when the substring is not found.");
5823
5824static PyObject *
5825unicode_index(PyUnicodeObject *self, PyObject *args)
5826{
5827    Py_ssize_t result;
5828    PyObject *substring;
5829    Py_ssize_t start = 0;
5830    Py_ssize_t end = PY_SSIZE_T_MAX;
5831
5832    if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5833		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5834        return NULL;
5835    substring = PyUnicode_FromObject(substring);
5836    if (!substring)
5837	return NULL;
5838
5839    result = stringlib_find_slice(
5840        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5841        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5842        start, end
5843        );
5844
5845    Py_DECREF(substring);
5846
5847    if (result < 0) {
5848        PyErr_SetString(PyExc_ValueError, "substring not found");
5849        return NULL;
5850    }
5851
5852    return PyInt_FromSsize_t(result);
5853}
5854
5855PyDoc_STRVAR(islower__doc__,
5856"S.islower() -> bool\n\
5857\n\
5858Return True if all cased characters in S are lowercase and there is\n\
5859at least one cased character in S, False otherwise.");
5860
5861static PyObject*
5862unicode_islower(PyUnicodeObject *self)
5863{
5864    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5865    register const Py_UNICODE *e;
5866    int cased;
5867
5868    /* Shortcut for single character strings */
5869    if (PyUnicode_GET_SIZE(self) == 1)
5870	return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
5871
5872    /* Special case for empty strings */
5873    if (PyUnicode_GET_SIZE(self) == 0)
5874	return PyBool_FromLong(0);
5875
5876    e = p + PyUnicode_GET_SIZE(self);
5877    cased = 0;
5878    for (; p < e; p++) {
5879	register const Py_UNICODE ch = *p;
5880
5881	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
5882	    return PyBool_FromLong(0);
5883	else if (!cased && Py_UNICODE_ISLOWER(ch))
5884	    cased = 1;
5885    }
5886    return PyBool_FromLong(cased);
5887}
5888
5889PyDoc_STRVAR(isupper__doc__,
5890"S.isupper() -> bool\n\
5891\n\
5892Return True if all cased characters in S are uppercase and there is\n\
5893at least one cased character in S, False otherwise.");
5894
5895static PyObject*
5896unicode_isupper(PyUnicodeObject *self)
5897{
5898    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5899    register const Py_UNICODE *e;
5900    int cased;
5901
5902    /* Shortcut for single character strings */
5903    if (PyUnicode_GET_SIZE(self) == 1)
5904	return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
5905
5906    /* Special case for empty strings */
5907    if (PyUnicode_GET_SIZE(self) == 0)
5908	return PyBool_FromLong(0);
5909
5910    e = p + PyUnicode_GET_SIZE(self);
5911    cased = 0;
5912    for (; p < e; p++) {
5913	register const Py_UNICODE ch = *p;
5914
5915	if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
5916	    return PyBool_FromLong(0);
5917	else if (!cased && Py_UNICODE_ISUPPER(ch))
5918	    cased = 1;
5919    }
5920    return PyBool_FromLong(cased);
5921}
5922
5923PyDoc_STRVAR(istitle__doc__,
5924"S.istitle() -> bool\n\
5925\n\
5926Return True if S is a titlecased string and there is at least one\n\
5927character in S, i.e. upper- and titlecase characters may only\n\
5928follow uncased characters and lowercase characters only cased ones.\n\
5929Return False otherwise.");
5930
5931static PyObject*
5932unicode_istitle(PyUnicodeObject *self)
5933{
5934    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5935    register const Py_UNICODE *e;
5936    int cased, previous_is_cased;
5937
5938    /* Shortcut for single character strings */
5939    if (PyUnicode_GET_SIZE(self) == 1)
5940	return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5941			       (Py_UNICODE_ISUPPER(*p) != 0));
5942
5943    /* Special case for empty strings */
5944    if (PyUnicode_GET_SIZE(self) == 0)
5945	return PyBool_FromLong(0);
5946
5947    e = p + PyUnicode_GET_SIZE(self);
5948    cased = 0;
5949    previous_is_cased = 0;
5950    for (; p < e; p++) {
5951	register const Py_UNICODE ch = *p;
5952
5953	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5954	    if (previous_is_cased)
5955		return PyBool_FromLong(0);
5956	    previous_is_cased = 1;
5957	    cased = 1;
5958	}
5959	else if (Py_UNICODE_ISLOWER(ch)) {
5960	    if (!previous_is_cased)
5961		return PyBool_FromLong(0);
5962	    previous_is_cased = 1;
5963	    cased = 1;
5964	}
5965	else
5966	    previous_is_cased = 0;
5967    }
5968    return PyBool_FromLong(cased);
5969}
5970
5971PyDoc_STRVAR(isspace__doc__,
5972"S.isspace() -> bool\n\
5973\n\
5974Return True if all characters in S are whitespace\n\
5975and there is at least one character in S, False otherwise.");
5976
5977static PyObject*
5978unicode_isspace(PyUnicodeObject *self)
5979{
5980    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5981    register const Py_UNICODE *e;
5982
5983    /* Shortcut for single character strings */
5984    if (PyUnicode_GET_SIZE(self) == 1 &&
5985	Py_UNICODE_ISSPACE(*p))
5986	return PyBool_FromLong(1);
5987
5988    /* Special case for empty strings */
5989    if (PyUnicode_GET_SIZE(self) == 0)
5990	return PyBool_FromLong(0);
5991
5992    e = p + PyUnicode_GET_SIZE(self);
5993    for (; p < e; p++) {
5994	if (!Py_UNICODE_ISSPACE(*p))
5995	    return PyBool_FromLong(0);
5996    }
5997    return PyBool_FromLong(1);
5998}
5999
6000PyDoc_STRVAR(isalpha__doc__,
6001"S.isalpha() -> bool\n\
6002\n\
6003Return True if all characters in S are alphabetic\n\
6004and there is at least one character in S, False otherwise.");
6005
6006static PyObject*
6007unicode_isalpha(PyUnicodeObject *self)
6008{
6009    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6010    register const Py_UNICODE *e;
6011
6012    /* Shortcut for single character strings */
6013    if (PyUnicode_GET_SIZE(self) == 1 &&
6014	Py_UNICODE_ISALPHA(*p))
6015	return PyBool_FromLong(1);
6016
6017    /* Special case for empty strings */
6018    if (PyUnicode_GET_SIZE(self) == 0)
6019	return PyBool_FromLong(0);
6020
6021    e = p + PyUnicode_GET_SIZE(self);
6022    for (; p < e; p++) {
6023	if (!Py_UNICODE_ISALPHA(*p))
6024	    return PyBool_FromLong(0);
6025    }
6026    return PyBool_FromLong(1);
6027}
6028
6029PyDoc_STRVAR(isalnum__doc__,
6030"S.isalnum() -> bool\n\
6031\n\
6032Return True if all characters in S are alphanumeric\n\
6033and there is at least one character in S, False otherwise.");
6034
6035static PyObject*
6036unicode_isalnum(PyUnicodeObject *self)
6037{
6038    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6039    register const Py_UNICODE *e;
6040
6041    /* Shortcut for single character strings */
6042    if (PyUnicode_GET_SIZE(self) == 1 &&
6043	Py_UNICODE_ISALNUM(*p))
6044	return PyBool_FromLong(1);
6045
6046    /* Special case for empty strings */
6047    if (PyUnicode_GET_SIZE(self) == 0)
6048	return PyBool_FromLong(0);
6049
6050    e = p + PyUnicode_GET_SIZE(self);
6051    for (; p < e; p++) {
6052	if (!Py_UNICODE_ISALNUM(*p))
6053	    return PyBool_FromLong(0);
6054    }
6055    return PyBool_FromLong(1);
6056}
6057
6058PyDoc_STRVAR(isdecimal__doc__,
6059"S.isdecimal() -> bool\n\
6060\n\
6061Return True if there are only decimal characters in S,\n\
6062False otherwise.");
6063
6064static PyObject*
6065unicode_isdecimal(PyUnicodeObject *self)
6066{
6067    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6068    register const Py_UNICODE *e;
6069
6070    /* Shortcut for single character strings */
6071    if (PyUnicode_GET_SIZE(self) == 1 &&
6072	Py_UNICODE_ISDECIMAL(*p))
6073	return PyBool_FromLong(1);
6074
6075    /* Special case for empty strings */
6076    if (PyUnicode_GET_SIZE(self) == 0)
6077	return PyBool_FromLong(0);
6078
6079    e = p + PyUnicode_GET_SIZE(self);
6080    for (; p < e; p++) {
6081	if (!Py_UNICODE_ISDECIMAL(*p))
6082	    return PyBool_FromLong(0);
6083    }
6084    return PyBool_FromLong(1);
6085}
6086
6087PyDoc_STRVAR(isdigit__doc__,
6088"S.isdigit() -> bool\n\
6089\n\
6090Return True if all characters in S are digits\n\
6091and there is at least one character in S, False otherwise.");
6092
6093static PyObject*
6094unicode_isdigit(PyUnicodeObject *self)
6095{
6096    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6097    register const Py_UNICODE *e;
6098
6099    /* Shortcut for single character strings */
6100    if (PyUnicode_GET_SIZE(self) == 1 &&
6101	Py_UNICODE_ISDIGIT(*p))
6102	return PyBool_FromLong(1);
6103
6104    /* Special case for empty strings */
6105    if (PyUnicode_GET_SIZE(self) == 0)
6106	return PyBool_FromLong(0);
6107
6108    e = p + PyUnicode_GET_SIZE(self);
6109    for (; p < e; p++) {
6110	if (!Py_UNICODE_ISDIGIT(*p))
6111	    return PyBool_FromLong(0);
6112    }
6113    return PyBool_FromLong(1);
6114}
6115
6116PyDoc_STRVAR(isnumeric__doc__,
6117"S.isnumeric() -> bool\n\
6118\n\
6119Return True if there are only numeric characters in S,\n\
6120False otherwise.");
6121
6122static PyObject*
6123unicode_isnumeric(PyUnicodeObject *self)
6124{
6125    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6126    register const Py_UNICODE *e;
6127
6128    /* Shortcut for single character strings */
6129    if (PyUnicode_GET_SIZE(self) == 1 &&
6130	Py_UNICODE_ISNUMERIC(*p))
6131	return PyBool_FromLong(1);
6132
6133    /* Special case for empty strings */
6134    if (PyUnicode_GET_SIZE(self) == 0)
6135	return PyBool_FromLong(0);
6136
6137    e = p + PyUnicode_GET_SIZE(self);
6138    for (; p < e; p++) {
6139	if (!Py_UNICODE_ISNUMERIC(*p))
6140	    return PyBool_FromLong(0);
6141    }
6142    return PyBool_FromLong(1);
6143}
6144
PyDoc_STRVAR(join__doc__,
"S.join(sequence) -> unicode\n\
\n\
Return a string which is the concatenation of the strings in the\n\
sequence.  The separator between elements is S.");

/* Implements S.join(sequence): forwards self (the separator) and data
   (the sequence) unchanged to PyUnicode_Join() and returns its
   result. */
static PyObject*
unicode_join(PyObject *self, PyObject *data)
{
    return PyUnicode_Join(self, data);
}
6156
/* Return the value stored in the object's length field. */
static Py_ssize_t
unicode_length(PyUnicodeObject *self)
{
    return self->length;
}
6162
6163PyDoc_STRVAR(ljust__doc__,
6164"S.ljust(width[, fillchar]) -> int\n\
6165\n\
6166Return S left justified in a Unicode string of length width. Padding is\n\
6167done using the specified fill character (default is a space).");
6168
6169static PyObject *
6170unicode_ljust(PyUnicodeObject *self, PyObject *args)
6171{
6172    Py_ssize_t width;
6173    Py_UNICODE fillchar = ' ';
6174
6175    if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
6176        return NULL;
6177
6178    if (self->length >= width && PyUnicode_CheckExact(self)) {
6179        Py_INCREF(self);
6180        return (PyObject*) self;
6181    }
6182
6183    return (PyObject*) pad(self, 0, width - self->length, fillchar);
6184}
6185
PyDoc_STRVAR(lower__doc__,
"S.lower() -> unicode\n\
\n\
Return a copy of the string S converted to lowercase.");

/* Implements S.lower(): hands self to fixup() together with the
   fixlower callback and returns whatever fixup() returns (both helpers
   are defined elsewhere in this file). */
static PyObject*
unicode_lower(PyUnicodeObject *self)
{
    return fixup(self, fixlower);
}
6196
/* Strip modes.  The values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
/* PyArg_ParseTuple format strings, one per strip mode: a single
   optional object argument followed by the method name used in error
   messages. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: skips the three-character "|O:" prefix
   of the matching format string. */
#define STRIPNAME(i) (stripformat[i]+3)
6205
6206/* externally visible for str.strip(unicode) */
6207PyObject *
6208_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6209{
6210	Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6211	Py_ssize_t len = PyUnicode_GET_SIZE(self);
6212	Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
6213	Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6214	Py_ssize_t i, j;
6215
6216        BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6217
6218	i = 0;
6219	if (striptype != RIGHTSTRIP) {
6220            while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6221                i++;
6222            }
6223	}
6224
6225	j = len;
6226	if (striptype != LEFTSTRIP) {
6227            do {
6228                j--;
6229            } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6230            j++;
6231	}
6232
6233	if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6234            Py_INCREF(self);
6235            return (PyObject*)self;
6236	}
6237	else
6238            return PyUnicode_FromUnicode(s+i, j-i);
6239}
6240
6241
6242static PyObject *
6243do_strip(PyUnicodeObject *self, int striptype)
6244{
6245	Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6246	Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
6247
6248	i = 0;
6249	if (striptype != RIGHTSTRIP) {
6250		while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6251			i++;
6252		}
6253	}
6254
6255	j = len;
6256	if (striptype != LEFTSTRIP) {
6257		do {
6258			j--;
6259		} while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6260		j++;
6261	}
6262
6263	if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6264		Py_INCREF(self);
6265		return (PyObject*)self;
6266	}
6267	else
6268		return PyUnicode_FromUnicode(s+i, j-i);
6269}
6270
6271
6272static PyObject *
6273do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6274{
6275	PyObject *sep = NULL;
6276
6277	if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6278		return NULL;
6279
6280	if (sep != NULL && sep != Py_None) {
6281		if (PyUnicode_Check(sep))
6282			return _PyUnicode_XStrip(self, striptype, sep);
6283		else if (PyString_Check(sep)) {
6284			PyObject *res;
6285			sep = PyUnicode_FromObject(sep);
6286			if (sep==NULL)
6287				return NULL;
6288			res = _PyUnicode_XStrip(self, striptype, sep);
6289			Py_DECREF(sep);
6290			return res;
6291		}
6292		else {
6293			PyErr_Format(PyExc_TypeError,
6294				     "%s arg must be None, unicode or str",
6295				     STRIPNAME(striptype));
6296			return NULL;
6297		}
6298	}
6299
6300	return do_strip(self, striptype);
6301}
6302
6303
6304PyDoc_STRVAR(strip__doc__,
6305"S.strip([chars]) -> unicode\n\
6306\n\
6307Return a copy of the string S with leading and trailing\n\
6308whitespace removed.\n\
6309If chars is given and not None, remove characters in chars instead.\n\
6310If chars is a str, it will be converted to unicode before stripping");
6311
6312static PyObject *
6313unicode_strip(PyUnicodeObject *self, PyObject *args)
6314{
6315	if (PyTuple_GET_SIZE(args) == 0)
6316		return do_strip(self, BOTHSTRIP); /* Common case */
6317	else
6318		return do_argstrip(self, BOTHSTRIP, args);
6319}
6320
6321
6322PyDoc_STRVAR(lstrip__doc__,
6323"S.lstrip([chars]) -> unicode\n\
6324\n\
6325Return a copy of the string S with leading whitespace removed.\n\
6326If chars is given and not None, remove characters in chars instead.\n\
6327If chars is a str, it will be converted to unicode before stripping");
6328
6329static PyObject *
6330unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6331{
6332	if (PyTuple_GET_SIZE(args) == 0)
6333		return do_strip(self, LEFTSTRIP); /* Common case */
6334	else
6335		return do_argstrip(self, LEFTSTRIP, args);
6336}
6337
6338
6339PyDoc_STRVAR(rstrip__doc__,
6340"S.rstrip([chars]) -> unicode\n\
6341\n\
6342Return a copy of the string S with trailing whitespace removed.\n\
6343If chars is given and not None, remove characters in chars instead.\n\
6344If chars is a str, it will be converted to unicode before stripping");
6345
6346static PyObject *
6347unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6348{
6349	if (PyTuple_GET_SIZE(args) == 0)
6350		return do_strip(self, RIGHTSTRIP); /* Common case */
6351	else
6352		return do_argstrip(self, RIGHTSTRIP, args);
6353}
6354
6355
6356static PyObject*
6357unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
6358{
6359    PyUnicodeObject *u;
6360    Py_UNICODE *p;
6361    Py_ssize_t nchars;
6362    size_t nbytes;
6363
6364    if (len < 0)
6365        len = 0;
6366
6367    if (len == 1 && PyUnicode_CheckExact(str)) {
6368        /* no repeat, return original string */
6369        Py_INCREF(str);
6370        return (PyObject*) str;
6371    }
6372
6373    /* ensure # of chars needed doesn't overflow int and # of bytes
6374     * needed doesn't overflow size_t
6375     */
6376    nchars = len * str->length;
6377    if (len && nchars / len != str->length) {
6378        PyErr_SetString(PyExc_OverflowError,
6379                        "repeated string is too long");
6380        return NULL;
6381    }
6382    nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6383    if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6384        PyErr_SetString(PyExc_OverflowError,
6385                        "repeated string is too long");
6386        return NULL;
6387    }
6388    u = _PyUnicode_New(nchars);
6389    if (!u)
6390        return NULL;
6391
6392    p = u->str;
6393
6394    if (str->length == 1 && len > 0) {
6395        Py_UNICODE_FILL(p, str->str[0], len);
6396    } else {
6397	Py_ssize_t done = 0; /* number of characters copied this far */
6398	if (done < nchars) {
6399            Py_UNICODE_COPY(p, str->str, str->length);
6400            done = str->length;
6401	}
6402	while (done < nchars) {
6403            int n = (done <= nchars-done) ? done : nchars-done;
6404            Py_UNICODE_COPY(p+done, p, n);
6405            done += n;
6406	}
6407    }
6408
6409    return (PyObject*) u;
6410}
6411
6412PyObject *PyUnicode_Replace(PyObject *obj,
6413			    PyObject *subobj,
6414			    PyObject *replobj,
6415			    Py_ssize_t maxcount)
6416{
6417    PyObject *self;
6418    PyObject *str1;
6419    PyObject *str2;
6420    PyObject *result;
6421
6422    self = PyUnicode_FromObject(obj);
6423    if (self == NULL)
6424	return NULL;
6425    str1 = PyUnicode_FromObject(subobj);
6426    if (str1 == NULL) {
6427	Py_DECREF(self);
6428	return NULL;
6429    }
6430    str2 = PyUnicode_FromObject(replobj);
6431    if (str2 == NULL) {
6432	Py_DECREF(self);
6433	Py_DECREF(str1);
6434	return NULL;
6435    }
6436    result = replace((PyUnicodeObject *)self,
6437		     (PyUnicodeObject *)str1,
6438		     (PyUnicodeObject *)str2,
6439		     maxcount);
6440    Py_DECREF(self);
6441    Py_DECREF(str1);
6442    Py_DECREF(str2);
6443    return result;
6444}
6445
6446PyDoc_STRVAR(replace__doc__,
6447"S.replace (old, new[, maxsplit]) -> unicode\n\
6448\n\
6449Return a copy of S with all occurrences of substring\n\
6450old replaced by new.  If the optional argument maxsplit is\n\
6451given, only the first maxsplit occurrences are replaced.");
6452
6453static PyObject*
6454unicode_replace(PyUnicodeObject *self, PyObject *args)
6455{
6456    PyUnicodeObject *str1;
6457    PyUnicodeObject *str2;
6458    Py_ssize_t maxcount = -1;
6459    PyObject *result;
6460
6461    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
6462        return NULL;
6463    str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6464    if (str1 == NULL)
6465	return NULL;
6466    str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
6467    if (str2 == NULL) {
6468	Py_DECREF(str1);
6469	return NULL;
6470    }
6471
6472    result = replace(self, str1, str2, maxcount);
6473
6474    Py_DECREF(str1);
6475    Py_DECREF(str2);
6476    return result;
6477}
6478
6479static
6480PyObject *unicode_repr(PyObject *unicode)
6481{
6482    return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
6483				PyUnicode_GET_SIZE(unicode),
6484				1);
6485}
6486
6487PyDoc_STRVAR(rfind__doc__,
6488"S.rfind(sub [,start [,end]]) -> int\n\
6489\n\
6490Return the highest index in S where substring sub is found,\n\
6491such that sub is contained within s[start,end].  Optional\n\
6492arguments start and end are interpreted as in slice notation.\n\
6493\n\
6494Return -1 on failure.");
6495
6496static PyObject *
6497unicode_rfind(PyUnicodeObject *self, PyObject *args)
6498{
6499    PyObject *substring;
6500    Py_ssize_t start = 0;
6501    Py_ssize_t end = PY_SSIZE_T_MAX;
6502    Py_ssize_t result;
6503
6504    if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6505		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6506        return NULL;
6507    substring = PyUnicode_FromObject(substring);
6508    if (!substring)
6509	return NULL;
6510
6511    result = stringlib_rfind_slice(
6512        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6513        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6514        start, end
6515        );
6516
6517    Py_DECREF(substring);
6518
6519    return PyInt_FromSsize_t(result);
6520}
6521
6522PyDoc_STRVAR(rindex__doc__,
6523"S.rindex(sub [,start [,end]]) -> int\n\
6524\n\
6525Like S.rfind() but raise ValueError when the substring is not found.");
6526
6527static PyObject *
6528unicode_rindex(PyUnicodeObject *self, PyObject *args)
6529{
6530    PyObject *substring;
6531    Py_ssize_t start = 0;
6532    Py_ssize_t end = PY_SSIZE_T_MAX;
6533    Py_ssize_t result;
6534
6535    if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6536		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6537        return NULL;
6538    substring = PyUnicode_FromObject(substring);
6539    if (!substring)
6540	return NULL;
6541
6542    result = stringlib_rfind_slice(
6543        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6544        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6545        start, end
6546        );
6547
6548    Py_DECREF(substring);
6549
6550    if (result < 0) {
6551        PyErr_SetString(PyExc_ValueError, "substring not found");
6552        return NULL;
6553    }
6554    return PyInt_FromSsize_t(result);
6555}
6556
6557PyDoc_STRVAR(rjust__doc__,
6558"S.rjust(width[, fillchar]) -> unicode\n\
6559\n\
6560Return S right justified in a Unicode string of length width. Padding is\n\
6561done using the specified fill character (default is a space).");
6562
6563static PyObject *
6564unicode_rjust(PyUnicodeObject *self, PyObject *args)
6565{
6566    Py_ssize_t width;
6567    Py_UNICODE fillchar = ' ';
6568
6569    if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
6570        return NULL;
6571
6572    if (self->length >= width && PyUnicode_CheckExact(self)) {
6573        Py_INCREF(self);
6574        return (PyObject*) self;
6575    }
6576
6577    return (PyObject*) pad(self, width - self->length, 0, fillchar);
6578}
6579
6580static PyObject*
6581unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
6582{
6583    /* standard clamping */
6584    if (start < 0)
6585        start = 0;
6586    if (end < 0)
6587        end = 0;
6588    if (end > self->length)
6589        end = self->length;
6590    if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
6591        /* full slice, return original string */
6592        Py_INCREF(self);
6593        return (PyObject*) self;
6594    }
6595    if (start > end)
6596        start = end;
6597    /* copy slice */
6598    return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6599					     end - start);
6600}
6601
6602PyObject *PyUnicode_Split(PyObject *s,
6603			  PyObject *sep,
6604			  Py_ssize_t maxsplit)
6605{
6606    PyObject *result;
6607
6608    s = PyUnicode_FromObject(s);
6609    if (s == NULL)
6610	return NULL;
6611    if (sep != NULL) {
6612	sep = PyUnicode_FromObject(sep);
6613	if (sep == NULL) {
6614	    Py_DECREF(s);
6615	    return NULL;
6616	}
6617    }
6618
6619    result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6620
6621    Py_DECREF(s);
6622    Py_XDECREF(sep);
6623    return result;
6624}
6625
6626PyDoc_STRVAR(split__doc__,
6627"S.split([sep [,maxsplit]]) -> list of strings\n\
6628\n\
6629Return a list of the words in S, using sep as the\n\
6630delimiter string.  If maxsplit is given, at most maxsplit\n\
6631splits are done. If sep is not specified or is None,\n\
6632any whitespace string is a separator.");
6633
6634static PyObject*
6635unicode_split(PyUnicodeObject *self, PyObject *args)
6636{
6637    PyObject *substring = Py_None;
6638    Py_ssize_t maxcount = -1;
6639
6640    if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
6641        return NULL;
6642
6643    if (substring == Py_None)
6644	return split(self, NULL, maxcount);
6645    else if (PyUnicode_Check(substring))
6646	return split(self, (PyUnicodeObject *)substring, maxcount);
6647    else
6648	return PyUnicode_Split((PyObject *)self, substring, maxcount);
6649}
6650
6651PyObject *
6652PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
6653{
6654    PyObject* str_obj;
6655    PyObject* sep_obj;
6656    PyObject* out;
6657
6658    str_obj = PyUnicode_FromObject(str_in);
6659    if (!str_obj)
6660	return NULL;
6661    sep_obj = PyUnicode_FromObject(sep_in);
6662    if (!sep_obj) {
6663        Py_DECREF(str_obj);
6664        return NULL;
6665    }
6666
6667    out = stringlib_partition(
6668        str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6669        sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6670        );
6671
6672    Py_DECREF(sep_obj);
6673    Py_DECREF(str_obj);
6674
6675    return out;
6676}
6677
6678
6679PyObject *
6680PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
6681{
6682    PyObject* str_obj;
6683    PyObject* sep_obj;
6684    PyObject* out;
6685
6686    str_obj = PyUnicode_FromObject(str_in);
6687    if (!str_obj)
6688	return NULL;
6689    sep_obj = PyUnicode_FromObject(sep_in);
6690    if (!sep_obj) {
6691        Py_DECREF(str_obj);
6692        return NULL;
6693    }
6694
6695    out = stringlib_rpartition(
6696        str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6697        sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6698        );
6699
6700    Py_DECREF(sep_obj);
6701    Py_DECREF(str_obj);
6702
6703    return out;
6704}
6705
6706PyDoc_STRVAR(partition__doc__,
6707"S.partition(sep) -> (head, sep, tail)\n\
6708\n\
6709Searches for the separator sep in S, and returns the part before it,\n\
6710the separator itself, and the part after it.  If the separator is not\n\
6711found, returns S and two empty strings.");
6712
6713static PyObject*
6714unicode_partition(PyUnicodeObject *self, PyObject *separator)
6715{
6716    return PyUnicode_Partition((PyObject *)self, separator);
6717}
6718
6719PyDoc_STRVAR(rpartition__doc__,
6720"S.rpartition(sep) -> (tail, sep, head)\n\
6721\n\
6722Searches for the separator sep in S, starting at the end of S, and returns\n\
6723the part before it, the separator itself, and the part after it.  If the\n\
6724separator is not found, returns two empty strings and S.");
6725
6726static PyObject*
6727unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
6728{
6729    return PyUnicode_RPartition((PyObject *)self, separator);
6730}
6731
6732PyObject *PyUnicode_RSplit(PyObject *s,
6733			   PyObject *sep,
6734			   Py_ssize_t maxsplit)
6735{
6736    PyObject *result;
6737
6738    s = PyUnicode_FromObject(s);
6739    if (s == NULL)
6740	return NULL;
6741    if (sep != NULL) {
6742	sep = PyUnicode_FromObject(sep);
6743	if (sep == NULL) {
6744	    Py_DECREF(s);
6745	    return NULL;
6746	}
6747    }
6748
6749    result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6750
6751    Py_DECREF(s);
6752    Py_XDECREF(sep);
6753    return result;
6754}
6755
6756PyDoc_STRVAR(rsplit__doc__,
6757"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6758\n\
6759Return a list of the words in S, using sep as the\n\
6760delimiter string, starting at the end of the string and\n\
6761working to the front.  If maxsplit is given, at most maxsplit\n\
6762splits are done. If sep is not specified, any whitespace string\n\
6763is a separator.");
6764
6765static PyObject*
6766unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6767{
6768    PyObject *substring = Py_None;
6769    Py_ssize_t maxcount = -1;
6770
6771    if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
6772        return NULL;
6773
6774    if (substring == Py_None)
6775	return rsplit(self, NULL, maxcount);
6776    else if (PyUnicode_Check(substring))
6777	return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6778    else
6779	return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6780}
6781
6782PyDoc_STRVAR(splitlines__doc__,
6783"S.splitlines([keepends]]) -> list of strings\n\
6784\n\
6785Return a list of the lines in S, breaking at line boundaries.\n\
6786Line breaks are not included in the resulting list unless keepends\n\
6787is given and true.");
6788
6789static PyObject*
6790unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6791{
6792    int keepends = 0;
6793
6794    if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
6795        return NULL;
6796
6797    return PyUnicode_Splitlines((PyObject *)self, keepends);
6798}
6799
6800static
6801PyObject *unicode_str(PyUnicodeObject *self)
6802{
6803    return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
6804}
6805
6806PyDoc_STRVAR(swapcase__doc__,
6807"S.swapcase() -> unicode\n\
6808\n\
6809Return a copy of S with uppercase characters converted to lowercase\n\
6810and vice versa.");
6811
6812static PyObject*
6813unicode_swapcase(PyUnicodeObject *self)
6814{
6815    return fixup(self, fixswapcase);
6816}
6817
6818PyDoc_STRVAR(translate__doc__,
6819"S.translate(table) -> unicode\n\
6820\n\
6821Return a copy of the string S, where all characters have been mapped\n\
6822through the given translation table, which must be a mapping of\n\
6823Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6824Unmapped characters are left untouched. Characters mapped to None\n\
6825are deleted.");
6826
6827static PyObject*
6828unicode_translate(PyUnicodeObject *self, PyObject *table)
6829{
6830    return PyUnicode_TranslateCharmap(self->str,
6831				      self->length,
6832				      table,
6833				      "ignore");
6834}
6835
6836PyDoc_STRVAR(upper__doc__,
6837"S.upper() -> unicode\n\
6838\n\
6839Return a copy of S converted to uppercase.");
6840
6841static PyObject*
6842unicode_upper(PyUnicodeObject *self)
6843{
6844    return fixup(self, fixupper);
6845}
6846
6847PyDoc_STRVAR(zfill__doc__,
6848"S.zfill(width) -> unicode\n\
6849\n\
6850Pad a numeric string x with zeros on the left, to fill a field\n\
6851of the specified width. The string x is never truncated.");
6852
6853static PyObject *
6854unicode_zfill(PyUnicodeObject *self, PyObject *args)
6855{
6856    Py_ssize_t fill;
6857    PyUnicodeObject *u;
6858
6859    Py_ssize_t width;
6860    if (!PyArg_ParseTuple(args, "n:zfill", &width))
6861        return NULL;
6862
6863    if (self->length >= width) {
6864        if (PyUnicode_CheckExact(self)) {
6865            Py_INCREF(self);
6866            return (PyObject*) self;
6867        }
6868        else
6869            return PyUnicode_FromUnicode(
6870                PyUnicode_AS_UNICODE(self),
6871                PyUnicode_GET_SIZE(self)
6872            );
6873    }
6874
6875    fill = width - self->length;
6876
6877    u = pad(self, fill, 0, '0');
6878
6879    if (u == NULL)
6880        return NULL;
6881
6882    if (u->str[fill] == '+' || u->str[fill] == '-') {
6883        /* move sign to beginning of string */
6884        u->str[0] = u->str[fill];
6885        u->str[fill] = '0';
6886    }
6887
6888    return (PyObject*) u;
6889}
6890
#if 0
/* Disabled (compiled out) debugging helper: returns the current value of
   unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyInt_FromLong(unicode_freelist_size);
}
#endif
6898
6899PyDoc_STRVAR(startswith__doc__,
6900"S.startswith(prefix[, start[, end]]) -> bool\n\
6901\n\
6902Return True if S starts with the specified prefix, False otherwise.\n\
6903With optional start, test S beginning at that position.\n\
6904With optional end, stop comparing S at that position.\n\
6905prefix can also be a tuple of strings to try.");
6906
6907static PyObject *
6908unicode_startswith(PyUnicodeObject *self,
6909		   PyObject *args)
6910{
6911    PyObject *subobj;
6912    PyUnicodeObject *substring;
6913    Py_ssize_t start = 0;
6914    Py_ssize_t end = PY_SSIZE_T_MAX;
6915    int result;
6916
6917    if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
6918		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6919	return NULL;
6920    if (PyTuple_Check(subobj)) {
6921        Py_ssize_t i;
6922        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
6923            substring = (PyUnicodeObject *)PyUnicode_FromObject(
6924                            PyTuple_GET_ITEM(subobj, i));
6925            if (substring == NULL)
6926                return NULL;
6927            result = tailmatch(self, substring, start, end, -1);
6928            Py_DECREF(substring);
6929            if (result) {
6930                Py_RETURN_TRUE;
6931            }
6932        }
6933        /* nothing matched */
6934        Py_RETURN_FALSE;
6935    }
6936    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
6937    if (substring == NULL)
6938         return NULL;
6939    result = tailmatch(self, substring, start, end, -1);
6940    Py_DECREF(substring);
6941    return PyBool_FromLong(result);
6942}
6943
6944
6945PyDoc_STRVAR(endswith__doc__,
6946"S.endswith(suffix[, start[, end]]) -> bool\n\
6947\n\
6948Return True if S ends with the specified suffix, False otherwise.\n\
6949With optional start, test S beginning at that position.\n\
6950With optional end, stop comparing S at that position.\n\
6951suffix can also be a tuple of strings to try.");
6952
6953static PyObject *
6954unicode_endswith(PyUnicodeObject *self,
6955		 PyObject *args)
6956{
6957    PyObject *subobj;
6958    PyUnicodeObject *substring;
6959    Py_ssize_t start = 0;
6960    Py_ssize_t end = PY_SSIZE_T_MAX;
6961    int result;
6962
6963    if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
6964        _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6965	return NULL;
6966    if (PyTuple_Check(subobj)) {
6967        Py_ssize_t i;
6968        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
6969            substring = (PyUnicodeObject *)PyUnicode_FromObject(
6970                            PyTuple_GET_ITEM(subobj, i));
6971            if (substring == NULL)
6972            return NULL;
6973            result = tailmatch(self, substring, start, end, +1);
6974            Py_DECREF(substring);
6975            if (result) {
6976                Py_RETURN_TRUE;
6977            }
6978        }
6979        Py_RETURN_FALSE;
6980    }
6981    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
6982    if (substring == NULL)
6983    return NULL;
6984
6985    result = tailmatch(self, substring, start, end, +1);
6986    Py_DECREF(substring);
6987    return PyBool_FromLong(result);
6988}
6989
6990
6991
6992static PyObject *
6993unicode_getnewargs(PyUnicodeObject *v)
6994{
6995	return Py_BuildValue("(u#)", v->str, v->length);
6996}
6997
6998
/* Method table for the unicode type.  Each entry gives the Python-visible
   name, the C implementation, its calling convention flag (METH_VARARGS,
   METH_O or METH_NOARGS) and, for most entries, the docstring.  The table
   is terminated by the {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
#endif

    /* No docstring is supplied for __getnewargs__. */
    {"__getnewargs__",	(PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
7056
7057static PyObject *
7058unicode_mod(PyObject *v, PyObject *w)
7059{
7060       if (!PyUnicode_Check(v)) {
7061               Py_INCREF(Py_NotImplemented);
7062               return Py_NotImplemented;
7063       }
7064       return PyUnicode_Format(v, w);
7065}
7066
/* Number-protocol slots.  Only nb_remainder is filled in (with
   unicode_mod); the slots after it are not listed in the initializer. */
static PyNumberMethods unicode_as_number = {
	0,				/*nb_add*/
	0,				/*nb_subtract*/
	0,				/*nb_multiply*/
	unicode_mod,			/*nb_remainder*/
};
7073
/* Sequence-protocol slots.  The two assignment slots are 0, i.e. no
   item or slice assignment handler is installed. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length, 		/* sq_length */
    PyUnicode_Concat,		 	/* sq_concat */
    (ssizeargfunc) unicode_repeat, 	/* sq_repeat */
    (ssizeargfunc) unicode_getitem, 	/* sq_item */
    (ssizessizeargfunc) unicode_slice, 	/* sq_slice */
    0, 					/* sq_ass_item */
    0, 					/* sq_ass_slice */
    PyUnicode_Contains, 		/* sq_contains */
};
7084
7085static PyObject*
7086unicode_subscript(PyUnicodeObject* self, PyObject* item)
7087{
7088    if (PyIndex_Check(item)) {
7089        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
7090        if (i == -1 && PyErr_Occurred())
7091            return NULL;
7092        if (i < 0)
7093            i += PyUnicode_GET_SIZE(self);
7094        return unicode_getitem(self, i);
7095    } else if (PySlice_Check(item)) {
7096        Py_ssize_t start, stop, step, slicelength, cur, i;
7097        Py_UNICODE* source_buf;
7098        Py_UNICODE* result_buf;
7099        PyObject* result;
7100
7101        if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
7102				 &start, &stop, &step, &slicelength) < 0) {
7103            return NULL;
7104        }
7105
7106        if (slicelength <= 0) {
7107            return PyUnicode_FromUnicode(NULL, 0);
7108        } else {
7109            source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
7110            result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7111                                                    sizeof(Py_UNICODE));
7112
7113	    if (result_buf == NULL)
7114		    return PyErr_NoMemory();
7115
7116            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7117                result_buf[i] = source_buf[cur];
7118            }
7119
7120            result = PyUnicode_FromUnicode(result_buf, slicelength);
7121            PyMem_FREE(result_buf);
7122            return result;
7123        }
7124    } else {
7125        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7126        return NULL;
7127    }
7128}
7129
/* Mapping-protocol slots: length and subscript are provided, subscript
   assignment is 0 (no handler installed). */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,		/* mp_length */
    (binaryfunc)unicode_subscript,	/* mp_subscript */
    (objobjargproc)0,			/* mp_ass_subscript */
};
7135
7136static Py_ssize_t
7137unicode_buffer_getreadbuf(PyUnicodeObject *self,
7138			  Py_ssize_t index,
7139			  const void **ptr)
7140{
7141    if (index != 0) {
7142        PyErr_SetString(PyExc_SystemError,
7143			"accessing non-existent unicode segment");
7144        return -1;
7145    }
7146    *ptr = (void *) self->str;
7147    return PyUnicode_GET_DATA_SIZE(self);
7148}
7149
7150static Py_ssize_t
7151unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
7152			   const void **ptr)
7153{
7154    PyErr_SetString(PyExc_TypeError,
7155		    "cannot use unicode as modifiable buffer");
7156    return -1;
7157}
7158
7159static int
7160unicode_buffer_getsegcount(PyUnicodeObject *self,
7161			   Py_ssize_t *lenp)
7162{
7163    if (lenp)
7164        *lenp = PyUnicode_GET_DATA_SIZE(self);
7165    return 1;
7166}
7167
7168static Py_ssize_t
7169unicode_buffer_getcharbuf(PyUnicodeObject *self,
7170			  Py_ssize_t index,
7171			  const void **ptr)
7172{
7173    PyObject *str;
7174
7175    if (index != 0) {
7176        PyErr_SetString(PyExc_SystemError,
7177			"accessing non-existent unicode segment");
7178        return -1;
7179    }
7180    str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
7181    if (str == NULL)
7182	return -1;
7183    *ptr = (void *) PyString_AS_STRING(str);
7184    return PyString_GET_SIZE(str);
7185}
7186
7187/* Helpers for PyUnicode_Format() */
7188
7189static PyObject *
7190getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
7191{
7192    Py_ssize_t argidx = *p_argidx;
7193    if (argidx < arglen) {
7194	(*p_argidx)++;
7195	if (arglen < 0)
7196	    return args;
7197	else
7198	    return PyTuple_GetItem(args, argidx);
7199    }
7200    PyErr_SetString(PyExc_TypeError,
7201		    "not enough arguments for format string");
7202    return NULL;
7203}
7204
/* Bit flags describing format-specifier options; they are combined in the
   'flags' argument of the format helpers (formatfloat() tests F_ALT to
   decide whether to emit '#').
   NOTE(review): the remaining names presumably correspond to the '-', '+',
   ' ' and '0' conversion flags -- confirm against PyUnicode_Format(). */
#define F_LJUST (1<<0)
#define F_SIGN	(1<<1)
#define F_BLANK (1<<2)
#define F_ALT	(1<<3)
#define F_ZERO	(1<<4)
7210
7211static Py_ssize_t
7212strtounicode(Py_UNICODE *buffer, const char *charbuffer)
7213{
7214    register Py_ssize_t i;
7215    Py_ssize_t len = strlen(charbuffer);
7216    for (i = len - 1; i >= 0; i--)
7217	buffer[i] = (Py_UNICODE) charbuffer[i];
7218
7219    return len;
7220}
7221
7222static int
7223doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7224{
7225    Py_ssize_t result;
7226
7227    PyOS_ascii_formatd((char *)buffer, len, format, x);
7228    result = strtounicode(buffer, (char *)buffer);
7229    return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
7230}
7231
7232static int
7233longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7234{
7235    Py_ssize_t result;
7236
7237    PyOS_snprintf((char *)buffer, len, format, x);
7238    result = strtounicode(buffer, (char *)buffer);
7239    return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
7240}
7241
7242/* XXX To save some code duplication, formatfloat/long/int could have been
7243   shared with stringobject.c, converting from 8-bit to Unicode after the
7244   formatting is done. */
7245
7246static int
7247formatfloat(Py_UNICODE *buf,
7248	    size_t buflen,
7249	    int flags,
7250	    int prec,
7251	    int type,
7252	    PyObject *v)
7253{
7254    /* fmt = '%#.' + `prec` + `type`
7255       worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
7256    char fmt[20];
7257    double x;
7258
7259    x = PyFloat_AsDouble(v);
7260    if (x == -1.0 && PyErr_Occurred())
7261	return -1;
7262    if (prec < 0)
7263	prec = 6;
7264    if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7265	type = 'g';
7266    /* Worst case length calc to ensure no buffer overrun:
7267
7268       'g' formats:
7269	 fmt = %#.<prec>g
7270	 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7271	    for any double rep.)
7272	 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7273
7274       'f' formats:
7275	 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7276	 len = 1 + 50 + 1 + prec = 52 + prec
7277
7278       If prec=0 the effective precision is 1 (the leading digit is
7279       always given), therefore increase the length by one.
7280
7281    */
7282    if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
7283	(type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
7284	PyErr_SetString(PyExc_OverflowError,
7285			"formatted float is too long (precision too large?)");
7286	return -1;
7287    }
7288    PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7289		  (flags&F_ALT) ? "#" : "",
7290		  prec, type);
7291    return doubletounicode(buf, buflen, fmt, x);
7292}
7293
7294static PyObject*
7295formatlong(PyObject *val, int flags, int prec, int type)
7296{
7297	char *buf;
7298	int i, len;
7299	PyObject *str; /* temporary string object. */
7300	PyUnicodeObject *result;
7301
7302	str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7303	if (!str)
7304		return NULL;
7305	result = _PyUnicode_New(len);
7306	if (!result) {
7307		Py_DECREF(str);
7308		return NULL;
7309	}
7310	for (i = 0; i < len; i++)
7311		result->str[i] = buf[i];
7312	result->str[len] = 0;
7313	Py_DECREF(str);
7314	return (PyObject*)result;
7315}
7316
7317static int
7318formatint(Py_UNICODE *buf,
7319	  size_t buflen,
7320	  int flags,
7321	  int prec,
7322	  int type,
7323	  PyObject *v)
7324{
7325    /* fmt = '%#.' + `prec` + 'l' + `type`
7326     * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7327     *                     + 1 + 1
7328     *                   = 24
7329     */
7330    char fmt[64]; /* plenty big enough! */
7331    char *sign;
7332    long x;
7333
7334    x = PyInt_AsLong(v);
7335    if (x == -1 && PyErr_Occurred())
7336        return -1;
7337    if (x < 0 && type == 'u') {
7338        type = 'd';
7339    }
7340    if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7341        sign = "-";
7342    else
7343        sign = "";
7344    if (prec < 0)
7345        prec = 1;
7346
7347    /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7348     * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
7349     */
7350    if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
7351        PyErr_SetString(PyExc_OverflowError,
7352    	        "formatted integer is too long (precision too large?)");
7353        return -1;
7354    }
7355
7356    if ((flags & F_ALT) &&
7357        (type == 'x' || type == 'X')) {
7358        /* When converting under %#x or %#X, there are a number
7359         * of issues that cause pain:
7360         * - when 0 is being converted, the C standard leaves off
7361         *   the '0x' or '0X', which is inconsistent with other
7362         *   %#x/%#X conversions and inconsistent with Python's
7363         *   hex() function
7364         * - there are platforms that violate the standard and
7365         *   convert 0 with the '0x' or '0X'
7366         *   (Metrowerks, Compaq Tru64)
7367         * - there are platforms that give '0x' when converting
7368         *   under %#X, but convert 0 in accordance with the
7369         *   standard (OS/2 EMX)
7370         *
7371         * We can achieve the desired consistency by inserting our
7372         * own '0x' or '0X' prefix, and substituting %x/%X in place
7373         * of %#x/%#X.
7374         *
7375         * Note that this is the same approach as used in
7376         * formatint() in stringobject.c
7377         */
7378        PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7379                      sign, type, prec, type);
7380    }
7381    else {
7382        PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7383                      sign, (flags&F_ALT) ? "#" : "",
7384                      prec, type);
7385    }
7386    if (sign[0])
7387        return longtounicode(buf, buflen, fmt, -x);
7388    else
7389        return longtounicode(buf, buflen, fmt, x);
7390}
7391
7392static int
7393formatchar(Py_UNICODE *buf,
7394           size_t buflen,
7395           PyObject *v)
7396{
7397    /* presume that the buffer is at least 2 characters long */
7398    if (PyUnicode_Check(v)) {
7399	if (PyUnicode_GET_SIZE(v) != 1)
7400	    goto onError;
7401	buf[0] = PyUnicode_AS_UNICODE(v)[0];
7402    }
7403
7404    else if (PyString_Check(v)) {
7405	if (PyString_GET_SIZE(v) != 1)
7406	    goto onError;
7407	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7408    }
7409
7410    else {
7411	/* Integer input truncated to a character */
7412        long x;
7413	x = PyInt_AsLong(v);
7414	if (x == -1 && PyErr_Occurred())
7415	    goto onError;
7416#ifdef Py_UNICODE_WIDE
7417	if (x < 0 || x > 0x10ffff) {
7418	    PyErr_SetString(PyExc_OverflowError,
7419			    "%c arg not in range(0x110000) "
7420			    "(wide Python build)");
7421	    return -1;
7422	}
7423#else
7424	if (x < 0 || x > 0xffff) {
7425	    PyErr_SetString(PyExc_OverflowError,
7426			    "%c arg not in range(0x10000) "
7427			    "(narrow Python build)");
7428	    return -1;
7429	}
7430#endif
7431	buf[0] = (Py_UNICODE) x;
7432    }
7433    buf[1] = '\0';
7434    return 1;
7435
7436 onError:
7437    PyErr_SetString(PyExc_TypeError,
7438		    "%c requires int or char");
7439    return -1;
7440}
7441
7442/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
7443
7444   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
7445   chars are formatted. XXX This is a magic number. Each formatting
7446   routine does bounds checking to ensure no overflow, but a better
7447   solution may be to malloc a buffer of appropriate size for each
7448   format. For now, the current solution is sufficient.
7449*/
7450#define FORMATBUFLEN (size_t)120
7451
7452PyObject *PyUnicode_Format(PyObject *format,
7453			   PyObject *args)
7454{
7455    Py_UNICODE *fmt, *res;
7456    Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
7457    int args_owned = 0;
7458    PyUnicodeObject *result = NULL;
7459    PyObject *dict = NULL;
7460    PyObject *uformat;
7461
7462    if (format == NULL || args == NULL) {
7463	PyErr_BadInternalCall();
7464	return NULL;
7465    }
7466    uformat = PyUnicode_FromObject(format);
7467    if (uformat == NULL)
7468	return NULL;
7469    fmt = PyUnicode_AS_UNICODE(uformat);
7470    fmtcnt = PyUnicode_GET_SIZE(uformat);
7471
7472    reslen = rescnt = fmtcnt + 100;
7473    result = _PyUnicode_New(reslen);
7474    if (result == NULL)
7475	goto onError;
7476    res = PyUnicode_AS_UNICODE(result);
7477
7478    if (PyTuple_Check(args)) {
7479	arglen = PyTuple_Size(args);
7480	argidx = 0;
7481    }
7482    else {
7483	arglen = -1;
7484	argidx = -2;
7485    }
7486    if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
7487        !PyObject_TypeCheck(args, &PyBaseString_Type))
7488	dict = args;
7489
7490    while (--fmtcnt >= 0) {
7491	if (*fmt != '%') {
7492	    if (--rescnt < 0) {
7493		rescnt = fmtcnt + 100;
7494		reslen += rescnt;
7495		if (_PyUnicode_Resize(&result, reslen) < 0)
7496		    goto onError;
7497		res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7498		--rescnt;
7499	    }
7500	    *res++ = *fmt++;
7501	}
7502	else {
7503	    /* Got a format specifier */
7504	    int flags = 0;
7505	    Py_ssize_t width = -1;
7506	    int prec = -1;
7507	    Py_UNICODE c = '\0';
7508	    Py_UNICODE fill;
7509	    PyObject *v = NULL;
7510	    PyObject *temp = NULL;
7511	    Py_UNICODE *pbuf;
7512	    Py_UNICODE sign;
7513	    Py_ssize_t len;
7514	    Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
7515
7516	    fmt++;
7517	    if (*fmt == '(') {
7518		Py_UNICODE *keystart;
7519		Py_ssize_t keylen;
7520		PyObject *key;
7521		int pcount = 1;
7522
7523		if (dict == NULL) {
7524		    PyErr_SetString(PyExc_TypeError,
7525				    "format requires a mapping");
7526		    goto onError;
7527		}
7528		++fmt;
7529		--fmtcnt;
7530		keystart = fmt;
7531		/* Skip over balanced parentheses */
7532		while (pcount > 0 && --fmtcnt >= 0) {
7533		    if (*fmt == ')')
7534			--pcount;
7535		    else if (*fmt == '(')
7536			++pcount;
7537		    fmt++;
7538		}
7539		keylen = fmt - keystart - 1;
7540		if (fmtcnt < 0 || pcount > 0) {
7541		    PyErr_SetString(PyExc_ValueError,
7542				    "incomplete format key");
7543		    goto onError;
7544		}
7545#if 0
7546		/* keys are converted to strings using UTF-8 and
7547		   then looked up since Python uses strings to hold
7548		   variables names etc. in its namespaces and we
7549		   wouldn't want to break common idioms. */
7550		key = PyUnicode_EncodeUTF8(keystart,
7551					   keylen,
7552					   NULL);
7553#else
7554		key = PyUnicode_FromUnicode(keystart, keylen);
7555#endif
7556		if (key == NULL)
7557		    goto onError;
7558		if (args_owned) {
7559		    Py_DECREF(args);
7560		    args_owned = 0;
7561		}
7562		args = PyObject_GetItem(dict, key);
7563		Py_DECREF(key);
7564		if (args == NULL) {
7565		    goto onError;
7566		}
7567		args_owned = 1;
7568		arglen = -1;
7569		argidx = -2;
7570	    }
7571	    while (--fmtcnt >= 0) {
7572		switch (c = *fmt++) {
7573		case '-': flags |= F_LJUST; continue;
7574		case '+': flags |= F_SIGN; continue;
7575		case ' ': flags |= F_BLANK; continue;
7576		case '#': flags |= F_ALT; continue;
7577		case '0': flags |= F_ZERO; continue;
7578		}
7579		break;
7580	    }
7581	    if (c == '*') {
7582		v = getnextarg(args, arglen, &argidx);
7583		if (v == NULL)
7584		    goto onError;
7585		if (!PyInt_Check(v)) {
7586		    PyErr_SetString(PyExc_TypeError,
7587				    "* wants int");
7588		    goto onError;
7589		}
7590		width = PyInt_AsLong(v);
7591		if (width == -1 && PyErr_Occurred())
7592			goto onError;
7593		if (width < 0) {
7594		    flags |= F_LJUST;
7595		    width = -width;
7596		}
7597		if (--fmtcnt >= 0)
7598		    c = *fmt++;
7599	    }
7600	    else if (c >= '0' && c <= '9') {
7601		width = c - '0';
7602		while (--fmtcnt >= 0) {
7603		    c = *fmt++;
7604		    if (c < '0' || c > '9')
7605			break;
7606		    if ((width*10) / 10 != width) {
7607			PyErr_SetString(PyExc_ValueError,
7608					"width too big");
7609			goto onError;
7610		    }
7611		    width = width*10 + (c - '0');
7612		}
7613	    }
7614	    if (c == '.') {
7615		prec = 0;
7616		if (--fmtcnt >= 0)
7617		    c = *fmt++;
7618		if (c == '*') {
7619		    v = getnextarg(args, arglen, &argidx);
7620		    if (v == NULL)
7621			goto onError;
7622		    if (!PyInt_Check(v)) {
7623			PyErr_SetString(PyExc_TypeError,
7624					"* wants int");
7625			goto onError;
7626		    }
7627		    prec = PyInt_AsLong(v);
7628		    if (prec == -1 && PyErr_Occurred())
7629			goto onError;
7630		    if (prec < 0)
7631			prec = 0;
7632		    if (--fmtcnt >= 0)
7633			c = *fmt++;
7634		}
7635		else if (c >= '0' && c <= '9') {
7636		    prec = c - '0';
7637		    while (--fmtcnt >= 0) {
7638			c = Py_CHARMASK(*fmt++);
7639			if (c < '0' || c > '9')
7640			    break;
7641			if ((prec*10) / 10 != prec) {
7642			    PyErr_SetString(PyExc_ValueError,
7643					    "prec too big");
7644			    goto onError;
7645			}
7646			prec = prec*10 + (c - '0');
7647		    }
7648		}
7649	    } /* prec */
7650	    if (fmtcnt >= 0) {
7651		if (c == 'h' || c == 'l' || c == 'L') {
7652		    if (--fmtcnt >= 0)
7653			c = *fmt++;
7654		}
7655	    }
7656	    if (fmtcnt < 0) {
7657		PyErr_SetString(PyExc_ValueError,
7658				"incomplete format");
7659		goto onError;
7660	    }
7661	    if (c != '%') {
7662		v = getnextarg(args, arglen, &argidx);
7663		if (v == NULL)
7664		    goto onError;
7665	    }
7666	    sign = 0;
7667	    fill = ' ';
7668	    switch (c) {
7669
7670	    case '%':
7671		pbuf = formatbuf;
7672		/* presume that buffer length is at least 1 */
7673		pbuf[0] = '%';
7674		len = 1;
7675		break;
7676
7677	    case 's':
7678	    case 'r':
7679		if (PyUnicode_Check(v) && c == 's') {
7680		    temp = v;
7681		    Py_INCREF(temp);
7682		}
7683		else {
7684		    PyObject *unicode;
7685		    if (c == 's')
7686			temp = PyObject_Unicode(v);
7687		    else
7688			temp = PyObject_Repr(v);
7689		    if (temp == NULL)
7690			goto onError;
7691                    if (PyUnicode_Check(temp))
7692                        /* nothing to do */;
7693                    else if (PyString_Check(temp)) {
7694                        /* convert to string to Unicode */
7695		        unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
7696						   PyString_GET_SIZE(temp),
7697						   NULL,
7698						   "strict");
7699		        Py_DECREF(temp);
7700		        temp = unicode;
7701		        if (temp == NULL)
7702			    goto onError;
7703		    }
7704		    else {
7705			Py_DECREF(temp);
7706			PyErr_SetString(PyExc_TypeError,
7707					"%s argument has non-string str()");
7708			goto onError;
7709		    }
7710		}
7711		pbuf = PyUnicode_AS_UNICODE(temp);
7712		len = PyUnicode_GET_SIZE(temp);
7713		if (prec >= 0 && len > prec)
7714		    len = prec;
7715		break;
7716
7717	    case 'i':
7718	    case 'd':
7719	    case 'u':
7720	    case 'o':
7721	    case 'x':
7722	    case 'X':
7723		if (c == 'i')
7724		    c = 'd';
7725		if (PyLong_Check(v)) {
7726		    temp = formatlong(v, flags, prec, c);
7727		    if (!temp)
7728			goto onError;
7729		    pbuf = PyUnicode_AS_UNICODE(temp);
7730		    len = PyUnicode_GET_SIZE(temp);
7731		    sign = 1;
7732		}
7733		else {
7734		    pbuf = formatbuf;
7735		    len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7736				    flags, prec, c, v);
7737		    if (len < 0)
7738			goto onError;
7739		    sign = 1;
7740		}
7741		if (flags & F_ZERO)
7742		    fill = '0';
7743		break;
7744
7745	    case 'e':
7746	    case 'E':
7747	    case 'f':
7748	    case 'F':
7749	    case 'g':
7750	    case 'G':
7751		if (c == 'F')
7752			c = 'f';
7753		pbuf = formatbuf;
7754		len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7755			flags, prec, c, v);
7756		if (len < 0)
7757		    goto onError;
7758		sign = 1;
7759		if (flags & F_ZERO)
7760		    fill = '0';
7761		break;
7762
7763	    case 'c':
7764		pbuf = formatbuf;
7765		len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
7766		if (len < 0)
7767		    goto onError;
7768		break;
7769
7770	    default:
7771		PyErr_Format(PyExc_ValueError,
7772			     "unsupported format character '%c' (0x%x) "
7773			     "at index %zd",
7774			     (31<=c && c<=126) ? (char)c : '?',
7775                             (int)c,
7776			     (Py_ssize_t)(fmt - 1 -
7777					  PyUnicode_AS_UNICODE(uformat)));
7778		goto onError;
7779	    }
7780	    if (sign) {
7781		if (*pbuf == '-' || *pbuf == '+') {
7782		    sign = *pbuf++;
7783		    len--;
7784		}
7785		else if (flags & F_SIGN)
7786		    sign = '+';
7787		else if (flags & F_BLANK)
7788		    sign = ' ';
7789		else
7790		    sign = 0;
7791	    }
7792	    if (width < len)
7793		width = len;
7794	    if (rescnt - (sign != 0) < width) {
7795		reslen -= rescnt;
7796		rescnt = width + fmtcnt + 100;
7797		reslen += rescnt;
7798		if (reslen < 0) {
7799		    Py_XDECREF(temp);
7800		    PyErr_NoMemory();
7801		    goto onError;
7802		}
7803		if (_PyUnicode_Resize(&result, reslen) < 0) {
7804		    Py_XDECREF(temp);
7805		    goto onError;
7806		}
7807		res = PyUnicode_AS_UNICODE(result)
7808		    + reslen - rescnt;
7809	    }
7810	    if (sign) {
7811		if (fill != ' ')
7812		    *res++ = sign;
7813		rescnt--;
7814		if (width > len)
7815		    width--;
7816	    }
7817	    if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7818		assert(pbuf[0] == '0');
7819		assert(pbuf[1] == c);
7820		if (fill != ' ') {
7821		    *res++ = *pbuf++;
7822		    *res++ = *pbuf++;
7823		}
7824		rescnt -= 2;
7825		width -= 2;
7826		if (width < 0)
7827		    width = 0;
7828		len -= 2;
7829	    }
7830	    if (width > len && !(flags & F_LJUST)) {
7831		do {
7832		    --rescnt;
7833		    *res++ = fill;
7834		} while (--width > len);
7835	    }
7836	    if (fill == ' ') {
7837		if (sign)
7838		    *res++ = sign;
7839		if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7840		    assert(pbuf[0] == '0');
7841		    assert(pbuf[1] == c);
7842		    *res++ = *pbuf++;
7843		    *res++ = *pbuf++;
7844		}
7845	    }
7846	    Py_UNICODE_COPY(res, pbuf, len);
7847	    res += len;
7848	    rescnt -= len;
7849	    while (--width >= len) {
7850		--rescnt;
7851		*res++ = ' ';
7852	    }
7853	    if (dict && (argidx < arglen) && c != '%') {
7854		PyErr_SetString(PyExc_TypeError,
7855				"not all arguments converted during string formatting");
7856                Py_XDECREF(temp);
7857		goto onError;
7858	    }
7859	    Py_XDECREF(temp);
7860	} /* '%' */
7861    } /* until end */
7862    if (argidx < arglen && !dict) {
7863	PyErr_SetString(PyExc_TypeError,
7864			"not all arguments converted during string formatting");
7865	goto onError;
7866    }
7867
7868    if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
7869	goto onError;
7870    if (args_owned) {
7871	Py_DECREF(args);
7872    }
7873    Py_DECREF(uformat);
7874    return (PyObject *)result;
7875
7876 onError:
7877    Py_XDECREF(result);
7878    Py_DECREF(uformat);
7879    if (args_owned) {
7880	Py_DECREF(args);
7881    }
7882    return NULL;
7883}
7884
7885static PyBufferProcs unicode_as_buffer = {
7886    (readbufferproc) unicode_buffer_getreadbuf,
7887    (writebufferproc) unicode_buffer_getwritebuf,
7888    (segcountproc) unicode_buffer_getsegcount,
7889    (charbufferproc) unicode_buffer_getcharbuf,
7890};
7891
7892static PyObject *
7893unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7894
7895static PyObject *
7896unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7897{
7898        PyObject *x = NULL;
7899	static char *kwlist[] = {"string", "encoding", "errors", 0};
7900	char *encoding = NULL;
7901	char *errors = NULL;
7902
7903	if (type != &PyUnicode_Type)
7904		return unicode_subtype_new(type, args, kwds);
7905	if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7906					  kwlist, &x, &encoding, &errors))
7907	    return NULL;
7908	if (x == NULL)
7909		return (PyObject *)_PyUnicode_New(0);
7910	if (encoding == NULL && errors == NULL)
7911	    return PyObject_Unicode(x);
7912	else
7913	return PyUnicode_FromEncodedObject(x, encoding, errors);
7914}
7915
7916static PyObject *
7917unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7918{
7919	PyUnicodeObject *tmp, *pnew;
7920	Py_ssize_t n;
7921
7922	assert(PyType_IsSubtype(type, &PyUnicode_Type));
7923	tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7924	if (tmp == NULL)
7925		return NULL;
7926	assert(PyUnicode_Check(tmp));
7927	pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
7928	if (pnew == NULL) {
7929		Py_DECREF(tmp);
7930		return NULL;
7931	}
7932	pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7933	if (pnew->str == NULL) {
7934		_Py_ForgetReference((PyObject *)pnew);
7935		PyObject_Del(pnew);
7936		Py_DECREF(tmp);
7937		return PyErr_NoMemory();
7938	}
7939	Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7940	pnew->length = n;
7941	pnew->hash = tmp->hash;
7942	Py_DECREF(tmp);
7943	return (PyObject *)pnew;
7944}
7945
7946PyDoc_STRVAR(unicode_doc,
7947"unicode(string [, encoding[, errors]]) -> object\n\
7948\n\
7949Create a new Unicode object from the given encoded string.\n\
7950encoding defaults to the current default string encoding.\n\
7951errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
7952
7953static PyObject *unicode_iter(PyObject *seq);
7954
7955PyTypeObject PyUnicode_Type = {
7956    PyObject_HEAD_INIT(&PyType_Type)
7957    0, 					/* ob_size */
7958    "unicode", 				/* tp_name */
7959    sizeof(PyUnicodeObject), 		/* tp_size */
7960    0, 					/* tp_itemsize */
7961    /* Slots */
7962    (destructor)unicode_dealloc, 	/* tp_dealloc */
7963    0, 					/* tp_print */
7964    0,				 	/* tp_getattr */
7965    0, 					/* tp_setattr */
7966    0, 					/* tp_compare */
7967    unicode_repr, 			/* tp_repr */
7968    &unicode_as_number, 		/* tp_as_number */
7969    &unicode_as_sequence, 		/* tp_as_sequence */
7970    &unicode_as_mapping, 		/* tp_as_mapping */
7971    (hashfunc) unicode_hash, 		/* tp_hash*/
7972    0, 					/* tp_call*/
7973    (reprfunc) unicode_str,	 	/* tp_str */
7974    PyObject_GenericGetAttr, 		/* tp_getattro */
7975    0,			 		/* tp_setattro */
7976    &unicode_as_buffer,			/* tp_as_buffer */
7977    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
7978        Py_TPFLAGS_UNICODE_SUBCLASS,	/* tp_flags */
7979    unicode_doc,			/* tp_doc */
7980    0,					/* tp_traverse */
7981    0,					/* tp_clear */
7982    PyUnicode_RichCompare,		/* tp_richcompare */
7983    0,					/* tp_weaklistoffset */
7984    unicode_iter,			/* tp_iter */
7985    0,					/* tp_iternext */
7986    unicode_methods,			/* tp_methods */
7987    0,					/* tp_members */
7988    0,					/* tp_getset */
7989    &PyBaseString_Type,			/* tp_base */
7990    0,					/* tp_dict */
7991    0,					/* tp_descr_get */
7992    0,					/* tp_descr_set */
7993    0,					/* tp_dictoffset */
7994    0,					/* tp_init */
7995    0,					/* tp_alloc */
7996    unicode_new,			/* tp_new */
7997    PyObject_Del,      		/* tp_free */
7998};
7999
8000/* Initialize the Unicode implementation */
8001
8002void _PyUnicode_Init(void)
8003{
8004    int i;
8005
8006    /* XXX - move this array to unicodectype.c ? */
8007    Py_UNICODE linebreak[] = {
8008        0x000A, /* LINE FEED */
8009        0x000D, /* CARRIAGE RETURN */
8010        0x001C, /* FILE SEPARATOR */
8011        0x001D, /* GROUP SEPARATOR */
8012        0x001E, /* RECORD SEPARATOR */
8013        0x0085, /* NEXT LINE */
8014        0x2028, /* LINE SEPARATOR */
8015        0x2029, /* PARAGRAPH SEPARATOR */
8016    };
8017
8018    /* Init the implementation */
8019    unicode_freelist = NULL;
8020    unicode_freelist_size = 0;
8021    unicode_empty = _PyUnicode_New(0);
8022    if (!unicode_empty)
8023	return;
8024
8025    strcpy(unicode_default_encoding, "ascii");
8026    for (i = 0; i < 256; i++)
8027	unicode_latin1[i] = NULL;
8028    if (PyType_Ready(&PyUnicode_Type) < 0)
8029	Py_FatalError("Can't initialize 'unicode'");
8030
8031    /* initialize the linebreak bloom filter */
8032    bloom_linebreak = make_bloom_mask(
8033        linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8034        );
8035
8036    PyType_Ready(&EncodingMapType);
8037}
8038
8039/* Finalize the Unicode implementation */
8040
8041void
8042_PyUnicode_Fini(void)
8043{
8044    PyUnicodeObject *u;
8045    int i;
8046
8047    Py_XDECREF(unicode_empty);
8048    unicode_empty = NULL;
8049
8050    for (i = 0; i < 256; i++) {
8051	if (unicode_latin1[i]) {
8052	    Py_DECREF(unicode_latin1[i]);
8053	    unicode_latin1[i] = NULL;
8054	}
8055    }
8056
8057    for (u = unicode_freelist; u != NULL;) {
8058	PyUnicodeObject *v = u;
8059	u = *(PyUnicodeObject **)u;
8060	if (v->str)
8061	    PyMem_DEL(v->str);
8062	Py_XDECREF(v->defenc);
8063	PyObject_Del(v);
8064    }
8065    unicode_freelist = NULL;
8066    unicode_freelist_size = 0;
8067}
8068
8069
8070
8071/********************* Unicode Iterator **************************/
8072
8073typedef struct {
8074	PyObject_HEAD
8075	Py_ssize_t it_index;
8076	PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
8077} unicodeiterobject;
8078
8079static void
8080unicodeiter_dealloc(unicodeiterobject *it)
8081{
8082	_PyObject_GC_UNTRACK(it);
8083	Py_XDECREF(it->it_seq);
8084	PyObject_GC_Del(it);
8085}
8086
8087static int
8088unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
8089{
8090	Py_VISIT(it->it_seq);
8091	return 0;
8092}
8093
8094static PyObject *
8095unicodeiter_next(unicodeiterobject *it)
8096{
8097	PyUnicodeObject *seq;
8098	PyObject *item;
8099
8100	assert(it != NULL);
8101	seq = it->it_seq;
8102	if (seq == NULL)
8103		return NULL;
8104	assert(PyUnicode_Check(seq));
8105
8106	if (it->it_index < PyUnicode_GET_SIZE(seq)) {
8107		item = PyUnicode_FromUnicode(
8108                    PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
8109		if (item != NULL)
8110			++it->it_index;
8111		return item;
8112	}
8113
8114	Py_DECREF(seq);
8115	it->it_seq = NULL;
8116	return NULL;
8117}
8118
8119static PyObject *
8120unicodeiter_len(unicodeiterobject *it)
8121{
8122	Py_ssize_t len = 0;
8123	if (it->it_seq)
8124		len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
8125	return PyInt_FromSsize_t(len);
8126}
8127
8128PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
8129
8130static PyMethodDef unicodeiter_methods[] = {
8131	{"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
8132         length_hint_doc},
8133 	{NULL,		NULL}		/* sentinel */
8134};
8135
8136PyTypeObject PyUnicodeIter_Type = {
8137	PyObject_HEAD_INIT(&PyType_Type)
8138	0,					/* ob_size */
8139	"unicodeiterator",			/* tp_name */
8140	sizeof(unicodeiterobject),		/* tp_basicsize */
8141	0,					/* tp_itemsize */
8142	/* methods */
8143	(destructor)unicodeiter_dealloc,	/* tp_dealloc */
8144	0,					/* tp_print */
8145	0,					/* tp_getattr */
8146	0,					/* tp_setattr */
8147	0,					/* tp_compare */
8148	0,					/* tp_repr */
8149	0,					/* tp_as_number */
8150	0,					/* tp_as_sequence */
8151	0,					/* tp_as_mapping */
8152	0,					/* tp_hash */
8153	0,					/* tp_call */
8154	0,					/* tp_str */
8155	PyObject_GenericGetAttr,		/* tp_getattro */
8156	0,					/* tp_setattro */
8157	0,					/* tp_as_buffer */
8158	Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
8159	0,					/* tp_doc */
8160	(traverseproc)unicodeiter_traverse,	/* tp_traverse */
8161	0,					/* tp_clear */
8162	0,					/* tp_richcompare */
8163	0,					/* tp_weaklistoffset */
8164	PyObject_SelfIter,			/* tp_iter */
8165	(iternextfunc)unicodeiter_next,		/* tp_iternext */
8166	unicodeiter_methods,			/* tp_methods */
8167	0,
8168};
8169
8170static PyObject *
8171unicode_iter(PyObject *seq)
8172{
8173	unicodeiterobject *it;
8174
8175	if (!PyUnicode_Check(seq)) {
8176		PyErr_BadInternalCall();
8177		return NULL;
8178	}
8179	it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
8180	if (it == NULL)
8181		return NULL;
8182	it->it_index = 0;
8183	Py_INCREF(seq);
8184	it->it_seq = (PyUnicodeObject *)seq;
8185	_PyObject_GC_TRACK(it);
8186	return (PyObject *)it;
8187}
8188
8189#ifdef __cplusplus
8190}
8191#endif
8192
8193
8194/*
8195Local variables:
8196c-basic-offset: 4
8197indent-tabs-mode: nil
8198End:
8199*/
8200