unicodeobject.c revision 0e3f591aeeef9ed715f8770320f4c4c7332a8794
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5Unicode Integration Proposal (see file Misc/unicode.txt).
6
7Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
10Copyright (c) Corporation for National Research Initiatives.
11
12--------------------------------------------------------------------
13The original string type implementation is:
14
15    Copyright (c) 1999 by Secret Labs AB
16    Copyright (c) 1999 by Fredrik Lundh
17
18By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
41
42#define PY_SSIZE_T_CLEAN
43#include "Python.h"
44
45#include "unicodeobject.h"
46#include "ucnhash.h"
47
48#ifdef MS_WINDOWS
49#include <windows.h>
50#endif
51
52/* Limit for the Unicode object free list */
53
54#define MAX_UNICODE_FREELIST_SIZE       1024
55
56/* Limit for the Unicode object free list stay alive optimization.
57
58   The implementation will keep allocated Unicode memory intact for
59   all objects on the free list having a size less than this
60   limit. This reduces malloc() overhead for small Unicode objects.
61
   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT * sizeof(Py_UNICODE) +
   malloc()-overhead) bytes of unused garbage.
65
66   Setting the limit to 0 effectively turns the feature off.
67
68   Note: This is an experimental feature ! If you get core dumps when
69   using Unicode objects, turn this feature off.
70
71*/
72
73#define KEEPALIVE_SIZE_LIMIT       9
74
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
83/* --- Globals ------------------------------------------------------------
84
85   The globals are initialized by the _PyUnicode_Init() API and should
86   not be used before calling that API.
87
88*/
89
90
91#ifdef __cplusplus
92extern "C" {
93#endif
94
/* Free list for Unicode objects: a singly linked list of deallocated
   PyUnicodeObject structs. The link to the next entry is stored in the
   first pointer-sized slot of each struct (see unicode_dealloc()).
   unicode_freelist_size counts the entries currently on the list. */
static PyUnicodeObject *unicode_freelist;
static int unicode_freelist_size;

/* The empty Unicode object is shared to improve performance. */
static PyUnicodeObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well. Entries are created on first use by
   PyUnicode_FromUnicode() and indexed by the character's ordinal. */
static PyUnicodeObject *unicode_latin1[256];

/* Default encoding to use and assume when NULL is passed as encoding
   parameter; it is initialized by _PyUnicode_Init().

   Always use the PyUnicode_SetDefaultEncoding() and
   PyUnicode_GetDefaultEncoding() APIs to access this global.

*/
static char unicode_default_encoding[100];
114
115Py_UNICODE
116PyUnicode_GetMax(void)
117{
118#ifdef Py_UNICODE_WIDE
119	return 0x10FFFF;
120#else
121	/* This is actually an illegal character, so it should
122	   not be passed to unichr. */
123	return 0xFFFF;
124#endif
125}
126
127/* --- Bloom Filters ----------------------------------------------------- */
128
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, a single bitmask is used; the 5 least
   significant bits of each Unicode character select the bit index. */

/* the linebreak mask is set up by Unicode_Init below */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero if the bit selected by ch is set in mask.  The shift is done
   on an unsigned long (1UL): shifting a plain int by 31 would overflow
   a 32-bit int, which is undefined behaviour, and the resulting negative
   value would be sign-extended when widened to BLOOM_MASK. */
#define BLOOM(mask, ch) (((mask) & (1UL << ((ch) & 0x1F))))

/* Cheap pre-filter via the bloom mask, then the exact line break test. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
143
144Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
145{
146    /* calculate simple bloom-style bitmask for a given unicode string */
147
148    long mask;
149    Py_ssize_t i;
150
151    mask = 0;
152    for (i = 0; i < len; i++)
153        mask |= (1 << (ptr[i] & 0x1F));
154
155    return mask;
156}
157
158Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
159{
160    Py_ssize_t i;
161
162    for (i = 0; i < setlen; i++)
163        if (set[i] == chr)
164            return 1;
165
166    return 0;
167}
168
/* True if chr is a member of set: the bloom mask rejects most
   non-members cheaply before the exact linear search runs.  The whole
   expansion is parenthesized so the macro can be negated or combined
   with other operators without changing its meaning. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
171
172/* --- Unicode Object ----------------------------------------------------- */
173
174static
175int unicode_resize(register PyUnicodeObject *unicode,
176                      Py_ssize_t length)
177{
178    void *oldstr;
179
180    /* Shortcut if there's nothing much to do. */
181    if (unicode->length == length)
182	goto reset;
183
184    /* Resizing shared object (unicode_empty or single character
185       objects) in-place is not allowed. Use PyUnicode_Resize()
186       instead ! */
187
188    if (unicode == unicode_empty ||
189	(unicode->length == 1 &&
190	 unicode->str[0] < 256U &&
191	 unicode_latin1[unicode->str[0]] == unicode)) {
192        PyErr_SetString(PyExc_SystemError,
193                        "can't resize shared unicode objects");
194        return -1;
195    }
196
197    /* We allocate one more byte to make sure the string is Ux0000 terminated.
198       The overallocation is also used by fastsearch, which assumes that it's
199       safe to look at str[length] (without making any assumptions about what
200       it contains). */
201
202    oldstr = unicode->str;
203    PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
204    if (!unicode->str) {
205	unicode->str = (Py_UNICODE *)oldstr;
206        PyErr_NoMemory();
207        return -1;
208    }
209    unicode->str[length] = 0;
210    unicode->length = length;
211
212 reset:
213    /* Reset the object caches */
214    if (unicode->defenc) {
215        Py_DECREF(unicode->defenc);
216        unicode->defenc = NULL;
217    }
218    unicode->hash = -1;
219
220    return 0;
221}
222
223/* We allocate one more byte to make sure the string is
224   Ux0000 terminated -- XXX is this needed ?
225
226   XXX This allocator could further be enhanced by assuring that the
227       free list never reduces its size below 1.
228
229*/
230
231static
232PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
233{
234    register PyUnicodeObject *unicode;
235
236    /* Optimization for empty strings */
237    if (length == 0 && unicode_empty != NULL) {
238        Py_INCREF(unicode_empty);
239        return unicode_empty;
240    }
241
242    /* Unicode freelist & memory allocation */
243    if (unicode_freelist) {
244        unicode = unicode_freelist;
245        unicode_freelist = *(PyUnicodeObject **)unicode;
246        unicode_freelist_size--;
247	if (unicode->str) {
248	    /* Keep-Alive optimization: we only upsize the buffer,
249	       never downsize it. */
250	    if ((unicode->length < length) &&
251                unicode_resize(unicode, length) < 0) {
252		PyMem_DEL(unicode->str);
253		goto onError;
254	    }
255	}
256        else {
257	    unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
258        }
259        PyObject_INIT(unicode, &PyUnicode_Type);
260    }
261    else {
262        unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
263        if (unicode == NULL)
264            return NULL;
265	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
266    }
267
268    if (!unicode->str) {
269	PyErr_NoMemory();
270	goto onError;
271    }
272    /* Initialize the first element to guard against cases where
273     * the caller fails before initializing str -- unicode_resize()
274     * reads str[0], and the Keep-Alive optimization can keep memory
275     * allocated for str alive across a call to unicode_dealloc(unicode).
276     * We don't want unicode_resize to read uninitialized memory in
277     * that case.
278     */
279    unicode->str[0] = 0;
280    unicode->str[length] = 0;
281    unicode->length = length;
282    unicode->hash = -1;
283    unicode->defenc = NULL;
284    return unicode;
285
286 onError:
287    _Py_ForgetReference((PyObject *)unicode);
288    PyObject_Del(unicode);
289    return NULL;
290}
291
292static
293void unicode_dealloc(register PyUnicodeObject *unicode)
294{
295    if (PyUnicode_CheckExact(unicode) &&
296	unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
297        /* Keep-Alive optimization */
298	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
299	    PyMem_DEL(unicode->str);
300	    unicode->str = NULL;
301	    unicode->length = 0;
302	}
303	if (unicode->defenc) {
304	    Py_DECREF(unicode->defenc);
305	    unicode->defenc = NULL;
306	}
307	/* Add to free list */
308        *(PyUnicodeObject **)unicode = unicode_freelist;
309        unicode_freelist = unicode;
310        unicode_freelist_size++;
311    }
312    else {
313	PyMem_DEL(unicode->str);
314	Py_XDECREF(unicode->defenc);
315	unicode->ob_type->tp_free((PyObject *)unicode);
316    }
317}
318
319int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
320{
321    register PyUnicodeObject *v;
322
323    /* Argument checks */
324    if (unicode == NULL) {
325	PyErr_BadInternalCall();
326	return -1;
327    }
328    v = (PyUnicodeObject *)*unicode;
329    if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
330	PyErr_BadInternalCall();
331	return -1;
332    }
333
334    /* Resizing unicode_empty and single character objects is not
335       possible since these are being shared. We simply return a fresh
336       copy with the same Unicode content. */
337    if (v->length != length &&
338	(v == unicode_empty || v->length == 1)) {
339	PyUnicodeObject *w = _PyUnicode_New(length);
340	if (w == NULL)
341	    return -1;
342	Py_UNICODE_COPY(w->str, v->str,
343			length < v->length ? length : v->length);
344	Py_DECREF(*unicode);
345	*unicode = (PyObject *)w;
346	return 0;
347    }
348
349    /* Note that we don't have to modify *unicode for unshared Unicode
350       objects, since we can modify them in-place. */
351    return unicode_resize(v, length);
352}
353
354/* Internal API for use in unicodeobject.c only ! */
355#define _PyUnicode_Resize(unicodevar, length) \
356        PyUnicode_Resize(((PyObject **)(unicodevar)), length)
357
358PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
359				Py_ssize_t size)
360{
361    PyUnicodeObject *unicode;
362
363    /* If the Unicode data is known at construction time, we can apply
364       some optimizations which share commonly used objects. */
365    if (u != NULL) {
366
367	/* Optimization for empty strings */
368	if (size == 0 && unicode_empty != NULL) {
369	    Py_INCREF(unicode_empty);
370	    return (PyObject *)unicode_empty;
371	}
372
373	/* Single character Unicode objects in the Latin-1 range are
374	   shared when using this constructor */
375	if (size == 1 && *u < 256) {
376	    unicode = unicode_latin1[*u];
377	    if (!unicode) {
378		unicode = _PyUnicode_New(1);
379		if (!unicode)
380		    return NULL;
381		unicode->str[0] = *u;
382		unicode_latin1[*u] = unicode;
383	    }
384	    Py_INCREF(unicode);
385	    return (PyObject *)unicode;
386	}
387    }
388
389    unicode = _PyUnicode_New(size);
390    if (!unicode)
391        return NULL;
392
393    /* Copy the Unicode data into the new object */
394    if (u != NULL)
395	Py_UNICODE_COPY(unicode->str, u, size);
396
397    return (PyObject *)unicode;
398}
399
400#ifdef HAVE_WCHAR_H
401
402PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
403				 Py_ssize_t size)
404{
405    PyUnicodeObject *unicode;
406
407    if (w == NULL) {
408	PyErr_BadInternalCall();
409	return NULL;
410    }
411
412    unicode = _PyUnicode_New(size);
413    if (!unicode)
414        return NULL;
415
416    /* Copy the wchar_t data into the new object */
417#ifdef HAVE_USABLE_WCHAR_T
418    memcpy(unicode->str, w, size * sizeof(wchar_t));
419#else
420    {
421	register Py_UNICODE *u;
422	register Py_ssize_t i;
423	u = PyUnicode_AS_UNICODE(unicode);
424	for (i = size; i > 0; i--)
425	    *u++ = *w++;
426    }
427#endif
428
429    return (PyObject *)unicode;
430}
431
432Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
433				wchar_t *w,
434				Py_ssize_t size)
435{
436    if (unicode == NULL) {
437	PyErr_BadInternalCall();
438	return -1;
439    }
440
441    /* If possible, try to copy the 0-termination as well */
442    if (size > PyUnicode_GET_SIZE(unicode))
443	size = PyUnicode_GET_SIZE(unicode) + 1;
444
445#ifdef HAVE_USABLE_WCHAR_T
446    memcpy(w, unicode->str, size * sizeof(wchar_t));
447#else
448    {
449	register Py_UNICODE *u;
450	register Py_ssize_t i;
451	u = PyUnicode_AS_UNICODE(unicode);
452	for (i = size; i > 0; i--)
453	    *w++ = *u++;
454    }
455#endif
456
457    if (size > PyUnicode_GET_SIZE(unicode))
458        return PyUnicode_GET_SIZE(unicode);
459    else
460    return size;
461}
462
463#endif
464
465PyObject *PyUnicode_FromOrdinal(int ordinal)
466{
467    Py_UNICODE s[1];
468
469#ifdef Py_UNICODE_WIDE
470    if (ordinal < 0 || ordinal > 0x10ffff) {
471	PyErr_SetString(PyExc_ValueError,
472			"unichr() arg not in range(0x110000) "
473			"(wide Python build)");
474	return NULL;
475    }
476#else
477    if (ordinal < 0 || ordinal > 0xffff) {
478	PyErr_SetString(PyExc_ValueError,
479			"unichr() arg not in range(0x10000) "
480			"(narrow Python build)");
481	return NULL;
482    }
483#endif
484
485    s[0] = (Py_UNICODE)ordinal;
486    return PyUnicode_FromUnicode(s, 1);
487}
488
489PyObject *PyUnicode_FromObject(register PyObject *obj)
490{
491    /* XXX Perhaps we should make this API an alias of
492           PyObject_Unicode() instead ?! */
493    if (PyUnicode_CheckExact(obj)) {
494	Py_INCREF(obj);
495	return obj;
496    }
497    if (PyUnicode_Check(obj)) {
498	/* For a Unicode subtype that's not a Unicode object,
499	   return a true Unicode object with the same data. */
500	return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
501				     PyUnicode_GET_SIZE(obj));
502    }
503    return PyUnicode_FromEncodedObject(obj, NULL, "strict");
504}
505
506PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
507				      const char *encoding,
508				      const char *errors)
509{
510    const char *s = NULL;
511    Py_ssize_t len;
512    PyObject *v;
513
514    if (obj == NULL) {
515	PyErr_BadInternalCall();
516	return NULL;
517    }
518
519#if 0
520    /* For b/w compatibility we also accept Unicode objects provided
521       that no encodings is given and then redirect to
522       PyObject_Unicode() which then applies the additional logic for
523       Unicode subclasses.
524
525       NOTE: This API should really only be used for object which
526             represent *encoded* Unicode !
527
528    */
529	if (PyUnicode_Check(obj)) {
530	    if (encoding) {
531		PyErr_SetString(PyExc_TypeError,
532				"decoding Unicode is not supported");
533	    return NULL;
534	    }
535	return PyObject_Unicode(obj);
536	    }
537#else
538    if (PyUnicode_Check(obj)) {
539	PyErr_SetString(PyExc_TypeError,
540			"decoding Unicode is not supported");
541	return NULL;
542	}
543#endif
544
545    /* Coerce object */
546    if (PyString_Check(obj)) {
547	    s = PyString_AS_STRING(obj);
548	    len = PyString_GET_SIZE(obj);
549	    }
550    else if (PyObject_AsCharBuffer(obj, &s, &len)) {
551	/* Overwrite the error message with something more useful in
552	   case of a TypeError. */
553	if (PyErr_ExceptionMatches(PyExc_TypeError))
554	PyErr_Format(PyExc_TypeError,
555			 "coercing to Unicode: need string or buffer, "
556			 "%.80s found",
557		     obj->ob_type->tp_name);
558	goto onError;
559    }
560
561    /* Convert to Unicode */
562    if (len == 0) {
563	Py_INCREF(unicode_empty);
564	v = (PyObject *)unicode_empty;
565    }
566    else
567	v = PyUnicode_Decode(s, len, encoding, errors);
568
569    return v;
570
571 onError:
572    return NULL;
573}
574
575PyObject *PyUnicode_Decode(const char *s,
576			   Py_ssize_t size,
577			   const char *encoding,
578			   const char *errors)
579{
580    PyObject *buffer = NULL, *unicode;
581
582    if (encoding == NULL)
583	encoding = PyUnicode_GetDefaultEncoding();
584
585    /* Shortcuts for common default encodings */
586    if (strcmp(encoding, "utf-8") == 0)
587        return PyUnicode_DecodeUTF8(s, size, errors);
588    else if (strcmp(encoding, "latin-1") == 0)
589        return PyUnicode_DecodeLatin1(s, size, errors);
590#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
591    else if (strcmp(encoding, "mbcs") == 0)
592        return PyUnicode_DecodeMBCS(s, size, errors);
593#endif
594    else if (strcmp(encoding, "ascii") == 0)
595        return PyUnicode_DecodeASCII(s, size, errors);
596
597    /* Decode via the codec registry */
598    buffer = PyBuffer_FromMemory((void *)s, size);
599    if (buffer == NULL)
600        goto onError;
601    unicode = PyCodec_Decode(buffer, encoding, errors);
602    if (unicode == NULL)
603        goto onError;
604    if (!PyUnicode_Check(unicode)) {
605        PyErr_Format(PyExc_TypeError,
606                     "decoder did not return an unicode object (type=%.400s)",
607                     unicode->ob_type->tp_name);
608        Py_DECREF(unicode);
609        goto onError;
610    }
611    Py_DECREF(buffer);
612    return unicode;
613
614 onError:
615    Py_XDECREF(buffer);
616    return NULL;
617}
618
619PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
620                                    const char *encoding,
621                                    const char *errors)
622{
623    PyObject *v;
624
625    if (!PyUnicode_Check(unicode)) {
626        PyErr_BadArgument();
627        goto onError;
628    }
629
630    if (encoding == NULL)
631	encoding = PyUnicode_GetDefaultEncoding();
632
633    /* Decode via the codec registry */
634    v = PyCodec_Decode(unicode, encoding, errors);
635    if (v == NULL)
636        goto onError;
637    return v;
638
639 onError:
640    return NULL;
641}
642
643PyObject *PyUnicode_Encode(const Py_UNICODE *s,
644			   Py_ssize_t size,
645			   const char *encoding,
646			   const char *errors)
647{
648    PyObject *v, *unicode;
649
650    unicode = PyUnicode_FromUnicode(s, size);
651    if (unicode == NULL)
652	return NULL;
653    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
654    Py_DECREF(unicode);
655    return v;
656}
657
658PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
659                                    const char *encoding,
660                                    const char *errors)
661{
662    PyObject *v;
663
664    if (!PyUnicode_Check(unicode)) {
665        PyErr_BadArgument();
666        goto onError;
667    }
668
669    if (encoding == NULL)
670	encoding = PyUnicode_GetDefaultEncoding();
671
672    /* Encode via the codec registry */
673    v = PyCodec_Encode(unicode, encoding, errors);
674    if (v == NULL)
675        goto onError;
676    return v;
677
678 onError:
679    return NULL;
680}
681
682PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
683                                    const char *encoding,
684                                    const char *errors)
685{
686    PyObject *v;
687
688    if (!PyUnicode_Check(unicode)) {
689        PyErr_BadArgument();
690        goto onError;
691    }
692
693    if (encoding == NULL)
694	encoding = PyUnicode_GetDefaultEncoding();
695
696    /* Shortcuts for common default encodings */
697    if (errors == NULL) {
698	if (strcmp(encoding, "utf-8") == 0)
699	    return PyUnicode_AsUTF8String(unicode);
700	else if (strcmp(encoding, "latin-1") == 0)
701	    return PyUnicode_AsLatin1String(unicode);
702#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
703	else if (strcmp(encoding, "mbcs") == 0)
704	    return PyUnicode_AsMBCSString(unicode);
705#endif
706	else if (strcmp(encoding, "ascii") == 0)
707	    return PyUnicode_AsASCIIString(unicode);
708    }
709
710    /* Encode via the codec registry */
711    v = PyCodec_Encode(unicode, encoding, errors);
712    if (v == NULL)
713        goto onError;
714    if (!PyString_Check(v)) {
715        PyErr_Format(PyExc_TypeError,
716                     "encoder did not return a string object (type=%.400s)",
717                     v->ob_type->tp_name);
718        Py_DECREF(v);
719        goto onError;
720    }
721    return v;
722
723 onError:
724    return NULL;
725}
726
727PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
728					    const char *errors)
729{
730    PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
731
732    if (v)
733        return v;
734    v = PyUnicode_AsEncodedString(unicode, NULL, errors);
735    if (v && errors == NULL)
736        ((PyUnicodeObject *)unicode)->defenc = v;
737    return v;
738}
739
740Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
741{
742    if (!PyUnicode_Check(unicode)) {
743        PyErr_BadArgument();
744        goto onError;
745    }
746    return PyUnicode_AS_UNICODE(unicode);
747
748 onError:
749    return NULL;
750}
751
752Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
753{
754    if (!PyUnicode_Check(unicode)) {
755        PyErr_BadArgument();
756        goto onError;
757    }
758    return PyUnicode_GET_SIZE(unicode);
759
760 onError:
761    return -1;
762}
763
764const char *PyUnicode_GetDefaultEncoding(void)
765{
766    return unicode_default_encoding;
767}
768
769int PyUnicode_SetDefaultEncoding(const char *encoding)
770{
771    PyObject *v;
772
773    /* Make sure the encoding is valid. As side effect, this also
774       loads the encoding into the codec registry cache. */
775    v = _PyCodec_Lookup(encoding);
776    if (v == NULL)
777	goto onError;
778    Py_DECREF(v);
779    strncpy(unicode_default_encoding,
780	    encoding,
781	    sizeof(unicode_default_encoding));
782    return 0;
783
784 onError:
785    return -1;
786}
787
788/* error handling callback helper:
789   build arguments, call the callback and check the arguments,
790   if no exception occurred, copy the replacement to the output
791   and adjust various state variables.
792   return 0 on success, -1 on error
793*/
794
795static
796int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
797                 const char *encoding, const char *reason,
798                 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
799                 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
800{
801    static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
802
803    PyObject *restuple = NULL;
804    PyObject *repunicode = NULL;
805    Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
806    Py_ssize_t requiredsize;
807    Py_ssize_t newpos;
808    Py_UNICODE *repptr;
809    Py_ssize_t repsize;
810    int res = -1;
811
812    if (*errorHandler == NULL) {
813	*errorHandler = PyCodec_LookupError(errors);
814	if (*errorHandler == NULL)
815	   goto onError;
816    }
817
818    if (*exceptionObject == NULL) {
819    	*exceptionObject = PyUnicodeDecodeError_Create(
820	    encoding, input, insize, *startinpos, *endinpos, reason);
821	if (*exceptionObject == NULL)
822	   goto onError;
823    }
824    else {
825	if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
826	    goto onError;
827	if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
828	    goto onError;
829	if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
830	    goto onError;
831    }
832
833    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
834    if (restuple == NULL)
835	goto onError;
836    if (!PyTuple_Check(restuple)) {
837	PyErr_Format(PyExc_TypeError, &argparse[4]);
838	goto onError;
839    }
840    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
841	goto onError;
842    if (newpos<0)
843	newpos = insize+newpos;
844    if (newpos<0 || newpos>insize) {
845	PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
846	goto onError;
847    }
848
849    /* need more space? (at least enough for what we
850       have+the replacement+the rest of the string (starting
851       at the new input position), so we won't have to check space
852       when there are no errors in the rest of the string) */
853    repptr = PyUnicode_AS_UNICODE(repunicode);
854    repsize = PyUnicode_GET_SIZE(repunicode);
855    requiredsize = *outpos + repsize + insize-newpos;
856    if (requiredsize > outsize) {
857	if (requiredsize<2*outsize)
858	    requiredsize = 2*outsize;
859	if (PyUnicode_Resize(output, requiredsize) < 0)
860	    goto onError;
861	*outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
862    }
863    *endinpos = newpos;
864    *inptr = input + newpos;
865    Py_UNICODE_COPY(*outptr, repptr, repsize);
866    *outptr += repsize;
867    *outpos += repsize;
868    /* we made it! */
869    res = 0;
870
871    onError:
872    Py_XDECREF(restuple);
873    return res;
874}
875
876/* --- UTF-7 Codec -------------------------------------------------------- */
877
878/* see RFC2152 for details */
879
/* Classification of the 128 ASCII characters for UTF-7, i.e. whether a
   character is special and cannot be directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)
   Indexed by character code; one row per 16 characters. */
static char utf7_special[128] = {
    /* 0x00 - 0x0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
898
/* SPECIAL(c, encodeO, encodeWS): true if character c cannot be written
   directly in UTF-7.  Characters outside ASCII and characters classified
   as 1 in utf7_special are always special; whitespace (class 2) and
   RFC2152 Set O (class 3) are special only if encodeWS / encodeO is set.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true  */

#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 digit for the low 6 bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true if c is a base64 digit.  Explicit ASCII ranges are
   used instead of isalnum(): isalnum() is locale dependent (it may
   accept non-ASCII letters, which UB64 cannot map) and its behaviour is
   undefined for values not representable as unsigned char, which a
   Py_UNICODE can easily be. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') ||              \
     ((c) >= 'a' && (c) <= 'z') ||              \
     ((c) >= '0' && (c) <= '9') ||              \
     (c) == '+' || (c) == '/')

/* UB64(c): 6-bit value of the base64 digit c (c must satisfy B64CHAR). */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): emit base64 digits to out while at least 6
   bits are pending in ch; bits is decremented accordingly. */
#define ENCODE(out, ch, bits)                   \
    while (bits >= 6) {                         \
        *out++ = B64(ch >> (bits-6));           \
        bits -= 6;                              \
    }

/* DECODE(out, ch, bits, surrogate): emit 16-bit characters to out while
   at least 16 bits are pending in ch.  Relies on the expanding function
   providing an `errmsg` variable and a `utf7Error` label. */
#define DECODE(out, ch, bits, surrogate)                                \
    while (bits >= 16) {                                                \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
        bits -= 16;                                                     \
        if (surrogate) {                                                \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0;                                              \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */                              \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        } else {                                                        \
            *out++ = outCh;                                             \
        }                                                               \
    }
941
942PyObject *PyUnicode_DecodeUTF7(const char *s,
943			       Py_ssize_t size,
944			       const char *errors)
945{
946    const char *starts = s;
947    Py_ssize_t startinpos;
948    Py_ssize_t endinpos;
949    Py_ssize_t outpos;
950    const char *e;
951    PyUnicodeObject *unicode;
952    Py_UNICODE *p;
953    const char *errmsg = "";
954    int inShift = 0;
955    unsigned int bitsleft = 0;
956    unsigned long charsleft = 0;
957    int surrogate = 0;
958    PyObject *errorHandler = NULL;
959    PyObject *exc = NULL;
960
961    unicode = _PyUnicode_New(size);
962    if (!unicode)
963        return NULL;
964    if (size == 0)
965        return (PyObject *)unicode;
966
967    p = unicode->str;
968    e = s + size;
969
970    while (s < e) {
971        Py_UNICODE ch;
972        restart:
973        ch = *s;
974
975        if (inShift) {
976            if ((ch == '-') || !B64CHAR(ch)) {
977                inShift = 0;
978                s++;
979
980                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
981                if (bitsleft >= 6) {
982                    /* The shift sequence has a partial character in it. If
983                       bitsleft < 6 then we could just classify it as padding
984                       but that is not the case here */
985
986                    errmsg = "partial character in shift sequence";
987                    goto utf7Error;
988                }
989                /* According to RFC2152 the remaining bits should be zero. We
990                   choose to signal an error/insert a replacement character
991                   here so indicate the potential of a misencoded character. */
992
993                /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
994                if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
995                    errmsg = "non-zero padding bits in shift sequence";
996                    goto utf7Error;
997                }
998
999                if (ch == '-') {
1000                    if ((s < e) && (*(s) == '-')) {
1001                        *p++ = '-';
1002                        inShift = 1;
1003                    }
1004                } else if (SPECIAL(ch,0,0)) {
1005                    errmsg = "unexpected special character";
1006	                goto utf7Error;
1007                } else  {
1008                    *p++ = ch;
1009                }
1010            } else {
1011                charsleft = (charsleft << 6) | UB64(ch);
1012                bitsleft += 6;
1013                s++;
1014                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1015            }
1016        }
1017        else if ( ch == '+' ) {
1018            startinpos = s-starts;
1019            s++;
1020            if (s < e && *s == '-') {
1021                s++;
1022                *p++ = '+';
1023            } else
1024            {
1025                inShift = 1;
1026                bitsleft = 0;
1027            }
1028        }
1029        else if (SPECIAL(ch,0,0)) {
1030            errmsg = "unexpected special character";
1031            s++;
1032	        goto utf7Error;
1033        }
1034        else {
1035            *p++ = ch;
1036            s++;
1037        }
1038        continue;
1039    utf7Error:
1040        outpos = p-PyUnicode_AS_UNICODE(unicode);
1041        endinpos = s-starts;
1042        if (unicode_decode_call_errorhandler(
1043             errors, &errorHandler,
1044             "utf7", errmsg,
1045             starts, size, &startinpos, &endinpos, &exc, &s,
1046             (PyObject **)&unicode, &outpos, &p))
1047        goto onError;
1048    }
1049
1050    if (inShift) {
1051        outpos = p-PyUnicode_AS_UNICODE(unicode);
1052        endinpos = size;
1053        if (unicode_decode_call_errorhandler(
1054             errors, &errorHandler,
1055             "utf7", "unterminated shift sequence",
1056             starts, size, &startinpos, &endinpos, &exc, &s,
1057             (PyObject **)&unicode, &outpos, &p))
1058            goto onError;
1059        if (s < e)
1060           goto restart;
1061    }
1062
1063    if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
1064        goto onError;
1065
1066    Py_XDECREF(errorHandler);
1067    Py_XDECREF(exc);
1068    return (PyObject *)unicode;
1069
1070onError:
1071    Py_XDECREF(errorHandler);
1072    Py_XDECREF(exc);
1073    Py_DECREF(unicode);
1074    return NULL;
1075}
1076
1077
1078PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1079                   Py_ssize_t size,
1080                   int encodeSetO,
1081                   int encodeWhiteSpace,
1082                   const char *errors)
1083{
1084    PyObject *v;
1085    /* It might be possible to tighten this worst case */
1086    Py_ssize_t cbAllocated = 5 * size;
1087    int inShift = 0;
1088    Py_ssize_t i = 0;
1089    unsigned int bitsleft = 0;
1090    unsigned long charsleft = 0;
1091    char * out;
1092    char * start;
1093
1094    if (size == 0)
1095		return PyString_FromStringAndSize(NULL, 0);
1096
1097    v = PyString_FromStringAndSize(NULL, cbAllocated);
1098    if (v == NULL)
1099        return NULL;
1100
1101    start = out = PyString_AS_STRING(v);
1102    for (;i < size; ++i) {
1103        Py_UNICODE ch = s[i];
1104
1105        if (!inShift) {
1106            if (ch == '+') {
1107                *out++ = '+';
1108                *out++ = '-';
1109            } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1110                charsleft = ch;
1111                bitsleft = 16;
1112                *out++ = '+';
1113                /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1114                inShift = bitsleft > 0;
1115            } else {
1116                *out++ = (char) ch;
1117            }
1118        } else {
1119            if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1120                *out++ = B64(charsleft << (6-bitsleft));
1121                charsleft = 0;
1122                bitsleft = 0;
1123                /* Characters not in the BASE64 set implicitly unshift the sequence
1124                   so no '-' is required, except if the character is itself a '-' */
1125                if (B64CHAR(ch) || ch == '-') {
1126                    *out++ = '-';
1127                }
1128                inShift = 0;
1129                *out++ = (char) ch;
1130            } else {
1131                bitsleft += 16;
1132                charsleft = (charsleft << 16) | ch;
1133                /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1134
1135                /* If the next character is special then we dont' need to terminate
1136                   the shift sequence. If the next character is not a BASE64 character
1137                   or '-' then the shift sequence will be terminated implicitly and we
1138                   don't have to insert a '-'. */
1139
1140                if (bitsleft == 0) {
1141                    if (i + 1 < size) {
1142                        Py_UNICODE ch2 = s[i+1];
1143
1144                        if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
1145
1146                        } else if (B64CHAR(ch2) || ch2 == '-') {
1147                            *out++ = '-';
1148                            inShift = 0;
1149                        } else {
1150                            inShift = 0;
1151                        }
1152
1153                    }
1154                    else {
1155                        *out++ = '-';
1156                        inShift = 0;
1157                    }
1158                }
1159            }
1160        }
1161    }
1162    if (bitsleft) {
1163        *out++= B64(charsleft << (6-bitsleft) );
1164        *out++ = '-';
1165    }
1166
1167    _PyString_Resize(&v, out - start);
1168    return v;
1169}
1170
1171#undef SPECIAL
1172#undef B64
1173#undef B64CHAR
1174#undef UB64
1175#undef ENCODE
1176#undef DECODE
1177
1178/* --- UTF-8 Codec -------------------------------------------------------- */
1179
/* Map a UTF-8 lead byte to the length of the sequence it introduces.
   A zero entry marks a byte that is not a legal prefix.  See RFC 2279
   for details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x00 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x10 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x20 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x30 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x40 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x50 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x60 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x70 */
    /* 0x80 - 0xBF: not valid as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x80 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x90 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0xA0 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0xB0 */
    /* 0xC0 - 0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   /* 0xC0 */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   /* 0xD0 */
    /* 0xE0 - 0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,   /* 0xE0 */
    /* 0xF0 - 0xFF: four, five and six byte prefixes; 0xFE/0xFF illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0    /* 0xF0 */
};
1201
1202PyObject *PyUnicode_DecodeUTF8(const char *s,
1203			       Py_ssize_t size,
1204			       const char *errors)
1205{
1206    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1207}
1208
1209PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1210			                Py_ssize_t size,
1211			                const char *errors,
1212			                Py_ssize_t *consumed)
1213{
1214    const char *starts = s;
1215    int n;
1216    Py_ssize_t startinpos;
1217    Py_ssize_t endinpos;
1218    Py_ssize_t outpos;
1219    const char *e;
1220    PyUnicodeObject *unicode;
1221    Py_UNICODE *p;
1222    const char *errmsg = "";
1223    PyObject *errorHandler = NULL;
1224    PyObject *exc = NULL;
1225
1226    /* Note: size will always be longer than the resulting Unicode
1227       character count */
1228    unicode = _PyUnicode_New(size);
1229    if (!unicode)
1230        return NULL;
1231    if (size == 0) {
1232        if (consumed)
1233            *consumed = 0;
1234        return (PyObject *)unicode;
1235    }
1236
1237    /* Unpack UTF-8 encoded data */
1238    p = unicode->str;
1239    e = s + size;
1240
1241    while (s < e) {
1242        Py_UCS4 ch = (unsigned char)*s;
1243
1244        if (ch < 0x80) {
1245            *p++ = (Py_UNICODE)ch;
1246            s++;
1247            continue;
1248        }
1249
1250        n = utf8_code_length[ch];
1251
1252        if (s + n > e) {
1253	    if (consumed)
1254		break;
1255	    else {
1256		errmsg = "unexpected end of data";
1257		startinpos = s-starts;
1258		endinpos = size;
1259		goto utf8Error;
1260	    }
1261	}
1262
1263        switch (n) {
1264
1265        case 0:
1266            errmsg = "unexpected code byte";
1267	    startinpos = s-starts;
1268	    endinpos = startinpos+1;
1269	    goto utf8Error;
1270
1271        case 1:
1272            errmsg = "internal error";
1273	    startinpos = s-starts;
1274	    endinpos = startinpos+1;
1275	    goto utf8Error;
1276
1277        case 2:
1278            if ((s[1] & 0xc0) != 0x80) {
1279                errmsg = "invalid data";
1280		startinpos = s-starts;
1281		endinpos = startinpos+2;
1282		goto utf8Error;
1283	    }
1284            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1285            if (ch < 0x80) {
1286		startinpos = s-starts;
1287		endinpos = startinpos+2;
1288                errmsg = "illegal encoding";
1289		goto utf8Error;
1290	    }
1291	    else
1292		*p++ = (Py_UNICODE)ch;
1293            break;
1294
1295        case 3:
1296            if ((s[1] & 0xc0) != 0x80 ||
1297                (s[2] & 0xc0) != 0x80) {
1298                errmsg = "invalid data";
1299		startinpos = s-starts;
1300		endinpos = startinpos+3;
1301		goto utf8Error;
1302	    }
1303            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
1304            if (ch < 0x0800) {
1305		/* Note: UTF-8 encodings of surrogates are considered
1306		   legal UTF-8 sequences;
1307
1308		   XXX For wide builds (UCS-4) we should probably try
1309		       to recombine the surrogates into a single code
1310		       unit.
1311		*/
1312                errmsg = "illegal encoding";
1313		startinpos = s-starts;
1314		endinpos = startinpos+3;
1315		goto utf8Error;
1316	    }
1317	    else
1318		*p++ = (Py_UNICODE)ch;
1319            break;
1320
1321        case 4:
1322            if ((s[1] & 0xc0) != 0x80 ||
1323                (s[2] & 0xc0) != 0x80 ||
1324                (s[3] & 0xc0) != 0x80) {
1325                errmsg = "invalid data";
1326		startinpos = s-starts;
1327		endinpos = startinpos+4;
1328		goto utf8Error;
1329	    }
1330            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1331                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1332            /* validate and convert to UTF-16 */
1333            if ((ch < 0x10000)        /* minimum value allowed for 4
1334					 byte encoding */
1335                || (ch > 0x10ffff))   /* maximum value allowed for
1336					 UTF-16 */
1337	    {
1338                errmsg = "illegal encoding";
1339		startinpos = s-starts;
1340		endinpos = startinpos+4;
1341		goto utf8Error;
1342	    }
1343#ifdef Py_UNICODE_WIDE
1344	    *p++ = (Py_UNICODE)ch;
1345#else
1346            /*  compute and append the two surrogates: */
1347
1348            /*  translate from 10000..10FFFF to 0..FFFF */
1349            ch -= 0x10000;
1350
1351            /*  high surrogate = top 10 bits added to D800 */
1352            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1353
1354            /*  low surrogate = bottom 10 bits added to DC00 */
1355            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
1356#endif
1357            break;
1358
1359        default:
1360            /* Other sizes are only needed for UCS-4 */
1361            errmsg = "unsupported Unicode code range";
1362	    startinpos = s-starts;
1363	    endinpos = startinpos+n;
1364	    goto utf8Error;
1365        }
1366        s += n;
1367	continue;
1368
1369    utf8Error:
1370    outpos = p-PyUnicode_AS_UNICODE(unicode);
1371    if (unicode_decode_call_errorhandler(
1372	     errors, &errorHandler,
1373	     "utf8", errmsg,
1374	     starts, size, &startinpos, &endinpos, &exc, &s,
1375	     (PyObject **)&unicode, &outpos, &p))
1376	goto onError;
1377    }
1378    if (consumed)
1379	*consumed = s-starts;
1380
1381    /* Adjust length */
1382    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
1383        goto onError;
1384
1385    Py_XDECREF(errorHandler);
1386    Py_XDECREF(exc);
1387    return (PyObject *)unicode;
1388
1389onError:
1390    Py_XDECREF(errorHandler);
1391    Py_XDECREF(exc);
1392    Py_DECREF(unicode);
1393    return NULL;
1394}
1395
1396/* Allocation strategy:  if the string is short, convert into a stack buffer
1397   and allocate exactly as much space needed at the end.  Else allocate the
1398   maximum possible needed (4 result bytes per Unicode character), and return
1399   the excess memory at the end.
1400*/
1401PyObject *
1402PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1403		     Py_ssize_t size,
1404		     const char *errors)
1405{
1406#define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
1407
1408    Py_ssize_t i;           /* index into s of next input byte */
1409    PyObject *v;        /* result string object */
1410    char *p;            /* next free byte in output buffer */
1411    Py_ssize_t nallocated;  /* number of result bytes allocated */
1412    Py_ssize_t nneeded;        /* number of result bytes needed */
1413    char stackbuf[MAX_SHORT_UNICHARS * 4];
1414
1415    assert(s != NULL);
1416    assert(size >= 0);
1417
1418    if (size <= MAX_SHORT_UNICHARS) {
1419        /* Write into the stack buffer; nallocated can't overflow.
1420         * At the end, we'll allocate exactly as much heap space as it
1421         * turns out we need.
1422         */
1423        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1424        v = NULL;   /* will allocate after we're done */
1425        p = stackbuf;
1426    }
1427    else {
1428        /* Overallocate on the heap, and give the excess back at the end. */
1429        nallocated = size * 4;
1430        if (nallocated / 4 != size)  /* overflow! */
1431            return PyErr_NoMemory();
1432        v = PyString_FromStringAndSize(NULL, nallocated);
1433        if (v == NULL)
1434            return NULL;
1435        p = PyString_AS_STRING(v);
1436    }
1437
1438    for (i = 0; i < size;) {
1439        Py_UCS4 ch = s[i++];
1440
1441        if (ch < 0x80)
1442            /* Encode ASCII */
1443            *p++ = (char) ch;
1444
1445        else if (ch < 0x0800) {
1446            /* Encode Latin-1 */
1447            *p++ = (char)(0xc0 | (ch >> 6));
1448            *p++ = (char)(0x80 | (ch & 0x3f));
1449        }
1450        else {
1451            /* Encode UCS2 Unicode ordinals */
1452            if (ch < 0x10000) {
1453                /* Special case: check for high surrogate */
1454                if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1455                    Py_UCS4 ch2 = s[i];
1456                    /* Check for low surrogate and combine the two to
1457                       form a UCS4 value */
1458                    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1459                        ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
1460                        i++;
1461                        goto encodeUCS4;
1462                    }
1463                    /* Fall through: handles isolated high surrogates */
1464                }
1465                *p++ = (char)(0xe0 | (ch >> 12));
1466                *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1467                *p++ = (char)(0x80 | (ch & 0x3f));
1468                continue;
1469    	    }
1470encodeUCS4:
1471            /* Encode UCS4 Unicode ordinals */
1472            *p++ = (char)(0xf0 | (ch >> 18));
1473            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1474            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1475            *p++ = (char)(0x80 | (ch & 0x3f));
1476        }
1477    }
1478
1479    if (v == NULL) {
1480        /* This was stack allocated. */
1481        nneeded = p - stackbuf;
1482        assert(nneeded <= nallocated);
1483        v = PyString_FromStringAndSize(stackbuf, nneeded);
1484    }
1485    else {
1486    	/* Cut back to size actually needed. */
1487        nneeded = p - PyString_AS_STRING(v);
1488        assert(nneeded <= nallocated);
1489        _PyString_Resize(&v, nneeded);
1490    }
1491    return v;
1492
1493#undef MAX_SHORT_UNICHARS
1494}
1495
1496PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1497{
1498    if (!PyUnicode_Check(unicode)) {
1499        PyErr_BadArgument();
1500        return NULL;
1501    }
1502    return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1503				PyUnicode_GET_SIZE(unicode),
1504				NULL);
1505}
1506
1507/* --- UTF-16 Codec ------------------------------------------------------- */
1508
1509PyObject *
1510PyUnicode_DecodeUTF16(const char *s,
1511		      Py_ssize_t size,
1512		      const char *errors,
1513		      int *byteorder)
1514{
1515    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1516}
1517
1518PyObject *
1519PyUnicode_DecodeUTF16Stateful(const char *s,
1520			      Py_ssize_t size,
1521			      const char *errors,
1522			      int *byteorder,
1523			      Py_ssize_t *consumed)
1524{
1525    const char *starts = s;
1526    Py_ssize_t startinpos;
1527    Py_ssize_t endinpos;
1528    Py_ssize_t outpos;
1529    PyUnicodeObject *unicode;
1530    Py_UNICODE *p;
1531    const unsigned char *q, *e;
1532    int bo = 0;       /* assume native ordering by default */
1533    const char *errmsg = "";
1534    /* Offsets from q for retrieving byte pairs in the right order. */
1535#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1536    int ihi = 1, ilo = 0;
1537#else
1538    int ihi = 0, ilo = 1;
1539#endif
1540    PyObject *errorHandler = NULL;
1541    PyObject *exc = NULL;
1542
1543    /* Note: size will always be longer than the resulting Unicode
1544       character count */
1545    unicode = _PyUnicode_New(size);
1546    if (!unicode)
1547        return NULL;
1548    if (size == 0)
1549        return (PyObject *)unicode;
1550
1551    /* Unpack UTF-16 encoded data */
1552    p = unicode->str;
1553    q = (unsigned char *)s;
1554    e = q + size;
1555
1556    if (byteorder)
1557        bo = *byteorder;
1558
1559    /* Check for BOM marks (U+FEFF) in the input and adjust current
1560       byte order setting accordingly. In native mode, the leading BOM
1561       mark is skipped, in all other modes, it is copied to the output
1562       stream as-is (giving a ZWNBSP character). */
1563    if (bo == 0) {
1564        if (size >= 2) {
1565            const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
1566#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1567	    if (bom == 0xFEFF) {
1568		q += 2;
1569		bo = -1;
1570	    }
1571	    else if (bom == 0xFFFE) {
1572		q += 2;
1573		bo = 1;
1574	    }
1575#else
1576	    if (bom == 0xFEFF) {
1577		q += 2;
1578		bo = 1;
1579	    }
1580	    else if (bom == 0xFFFE) {
1581		q += 2;
1582		bo = -1;
1583	    }
1584#endif
1585	}
1586    }
1587
1588    if (bo == -1) {
1589        /* force LE */
1590        ihi = 1;
1591        ilo = 0;
1592    }
1593    else if (bo == 1) {
1594        /* force BE */
1595        ihi = 0;
1596        ilo = 1;
1597    }
1598
1599    while (q < e) {
1600	Py_UNICODE ch;
1601	/* remaining bytes at the end? (size should be even) */
1602	if (e-q<2) {
1603	    if (consumed)
1604		break;
1605	    errmsg = "truncated data";
1606	    startinpos = ((const char *)q)-starts;
1607	    endinpos = ((const char *)e)-starts;
1608	    goto utf16Error;
1609	    /* The remaining input chars are ignored if the callback
1610	       chooses to skip the input */
1611	}
1612	ch = (q[ihi] << 8) | q[ilo];
1613
1614	q += 2;
1615
1616	if (ch < 0xD800 || ch > 0xDFFF) {
1617	    *p++ = ch;
1618	    continue;
1619	}
1620
1621	/* UTF-16 code pair: */
1622	if (q >= e) {
1623	    errmsg = "unexpected end of data";
1624	    startinpos = (((const char *)q)-2)-starts;
1625	    endinpos = ((const char *)e)-starts;
1626	    goto utf16Error;
1627	}
1628	if (0xD800 <= ch && ch <= 0xDBFF) {
1629	    Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1630	    q += 2;
1631	    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1632#ifndef Py_UNICODE_WIDE
1633		*p++ = ch;
1634		*p++ = ch2;
1635#else
1636		*p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
1637#endif
1638		continue;
1639	    }
1640	    else {
1641                errmsg = "illegal UTF-16 surrogate";
1642		startinpos = (((const char *)q)-4)-starts;
1643		endinpos = startinpos+2;
1644		goto utf16Error;
1645	    }
1646
1647	}
1648	errmsg = "illegal encoding";
1649	startinpos = (((const char *)q)-2)-starts;
1650	endinpos = startinpos+2;
1651	/* Fall through to report the error */
1652
1653    utf16Error:
1654	outpos = p-PyUnicode_AS_UNICODE(unicode);
1655	if (unicode_decode_call_errorhandler(
1656	         errors, &errorHandler,
1657	         "utf16", errmsg,
1658	         starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1659	         (PyObject **)&unicode, &outpos, &p))
1660	    goto onError;
1661    }
1662
1663    if (byteorder)
1664        *byteorder = bo;
1665
1666    if (consumed)
1667	*consumed = (const char *)q-starts;
1668
1669    /* Adjust length */
1670    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
1671        goto onError;
1672
1673    Py_XDECREF(errorHandler);
1674    Py_XDECREF(exc);
1675    return (PyObject *)unicode;
1676
1677onError:
1678    Py_DECREF(unicode);
1679    Py_XDECREF(errorHandler);
1680    Py_XDECREF(exc);
1681    return NULL;
1682}
1683
1684PyObject *
1685PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1686		      Py_ssize_t size,
1687		      const char *errors,
1688		      int byteorder)
1689{
1690    PyObject *v;
1691    unsigned char *p;
1692#ifdef Py_UNICODE_WIDE
1693    int i, pairs;
1694#else
1695    const int pairs = 0;
1696#endif
1697    /* Offsets from p for storing byte pairs in the right order. */
1698#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1699    int ihi = 1, ilo = 0;
1700#else
1701    int ihi = 0, ilo = 1;
1702#endif
1703
1704#define STORECHAR(CH)                   \
1705    do {                                \
1706        p[ihi] = ((CH) >> 8) & 0xff;    \
1707        p[ilo] = (CH) & 0xff;           \
1708        p += 2;                         \
1709    } while(0)
1710
1711#ifdef Py_UNICODE_WIDE
1712    for (i = pairs = 0; i < size; i++)
1713	if (s[i] >= 0x10000)
1714	    pairs++;
1715#endif
1716    v = PyString_FromStringAndSize(NULL,
1717		  2 * (size + pairs + (byteorder == 0)));
1718    if (v == NULL)
1719        return NULL;
1720
1721    p = (unsigned char *)PyString_AS_STRING(v);
1722    if (byteorder == 0)
1723	STORECHAR(0xFEFF);
1724    if (size == 0)
1725        return v;
1726
1727    if (byteorder == -1) {
1728        /* force LE */
1729        ihi = 1;
1730        ilo = 0;
1731    }
1732    else if (byteorder == 1) {
1733        /* force BE */
1734        ihi = 0;
1735        ilo = 1;
1736    }
1737
1738    while (size-- > 0) {
1739	Py_UNICODE ch = *s++;
1740	Py_UNICODE ch2 = 0;
1741#ifdef Py_UNICODE_WIDE
1742	if (ch >= 0x10000) {
1743	    ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1744	    ch  = 0xD800 | ((ch-0x10000) >> 10);
1745	}
1746#endif
1747        STORECHAR(ch);
1748        if (ch2)
1749            STORECHAR(ch2);
1750    }
1751    return v;
1752#undef STORECHAR
1753}
1754
1755PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1756{
1757    if (!PyUnicode_Check(unicode)) {
1758        PyErr_BadArgument();
1759        return NULL;
1760    }
1761    return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1762				 PyUnicode_GET_SIZE(unicode),
1763				 NULL,
1764				 0);
1765}
1766
1767/* --- Unicode Escape Codec ----------------------------------------------- */
1768
1769static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
1770
1771PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1772					Py_ssize_t size,
1773					const char *errors)
1774{
1775    const char *starts = s;
1776    Py_ssize_t startinpos;
1777    Py_ssize_t endinpos;
1778    Py_ssize_t outpos;
1779    int i;
1780    PyUnicodeObject *v;
1781    Py_UNICODE *p;
1782    const char *end;
1783    char* message;
1784    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1785    PyObject *errorHandler = NULL;
1786    PyObject *exc = NULL;
1787
1788    /* Escaped strings will always be longer than the resulting
1789       Unicode string, so we start with size here and then reduce the
1790       length after conversion to the true value.
1791       (but if the error callback returns a long replacement string
1792       we'll have to allocate more space) */
1793    v = _PyUnicode_New(size);
1794    if (v == NULL)
1795        goto onError;
1796    if (size == 0)
1797        return (PyObject *)v;
1798
1799    p = PyUnicode_AS_UNICODE(v);
1800    end = s + size;
1801
1802    while (s < end) {
1803        unsigned char c;
1804        Py_UNICODE x;
1805        int digits;
1806
1807        /* Non-escape characters are interpreted as Unicode ordinals */
1808        if (*s != '\\') {
1809            *p++ = (unsigned char) *s++;
1810            continue;
1811        }
1812
1813        startinpos = s-starts;
1814        /* \ - Escapes */
1815        s++;
1816        switch (*s++) {
1817
1818        /* \x escapes */
1819        case '\n': break;
1820        case '\\': *p++ = '\\'; break;
1821        case '\'': *p++ = '\''; break;
1822        case '\"': *p++ = '\"'; break;
1823        case 'b': *p++ = '\b'; break;
1824        case 'f': *p++ = '\014'; break; /* FF */
1825        case 't': *p++ = '\t'; break;
1826        case 'n': *p++ = '\n'; break;
1827        case 'r': *p++ = '\r'; break;
1828        case 'v': *p++ = '\013'; break; /* VT */
1829        case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1830
1831        /* \OOO (octal) escapes */
1832        case '0': case '1': case '2': case '3':
1833        case '4': case '5': case '6': case '7':
1834            x = s[-1] - '0';
1835            if ('0' <= *s && *s <= '7') {
1836                x = (x<<3) + *s++ - '0';
1837                if ('0' <= *s && *s <= '7')
1838                    x = (x<<3) + *s++ - '0';
1839            }
1840            *p++ = x;
1841            break;
1842
1843        /* hex escapes */
1844        /* \xXX */
1845        case 'x':
1846            digits = 2;
1847            message = "truncated \\xXX escape";
1848            goto hexescape;
1849
1850        /* \uXXXX */
1851        case 'u':
1852            digits = 4;
1853            message = "truncated \\uXXXX escape";
1854            goto hexescape;
1855
1856        /* \UXXXXXXXX */
1857        case 'U':
1858            digits = 8;
1859            message = "truncated \\UXXXXXXXX escape";
1860        hexescape:
1861            chr = 0;
1862            outpos = p-PyUnicode_AS_UNICODE(v);
1863            if (s+digits>end) {
1864                endinpos = size;
1865                if (unicode_decode_call_errorhandler(
1866                    errors, &errorHandler,
1867                    "unicodeescape", "end of string in escape sequence",
1868                    starts, size, &startinpos, &endinpos, &exc, &s,
1869                    (PyObject **)&v, &outpos, &p))
1870                    goto onError;
1871                goto nextByte;
1872            }
1873            for (i = 0; i < digits; ++i) {
1874                c = (unsigned char) s[i];
1875                if (!isxdigit(c)) {
1876                    endinpos = (s+i+1)-starts;
1877                    if (unicode_decode_call_errorhandler(
1878                        errors, &errorHandler,
1879                        "unicodeescape", message,
1880                        starts, size, &startinpos, &endinpos, &exc, &s,
1881                        (PyObject **)&v, &outpos, &p))
1882                        goto onError;
1883                    goto nextByte;
1884                }
1885                chr = (chr<<4) & ~0xF;
1886                if (c >= '0' && c <= '9')
1887                    chr += c - '0';
1888                else if (c >= 'a' && c <= 'f')
1889                    chr += 10 + c - 'a';
1890                else
1891                    chr += 10 + c - 'A';
1892            }
1893            s += i;
1894            if (chr == 0xffffffff && PyErr_Occurred())
1895                /* _decoding_error will have already written into the
1896                   target buffer. */
1897                break;
1898        store:
1899            /* when we get here, chr is a 32-bit unicode character */
1900            if (chr <= 0xffff)
1901                /* UCS-2 character */
1902                *p++ = (Py_UNICODE) chr;
1903            else if (chr <= 0x10ffff) {
1904                /* UCS-4 character. Either store directly, or as
1905                   surrogate pair. */
1906#ifdef Py_UNICODE_WIDE
1907                *p++ = chr;
1908#else
1909                chr -= 0x10000L;
1910                *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1911                *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
1912#endif
1913            } else {
1914                endinpos = s-starts;
1915                outpos = p-PyUnicode_AS_UNICODE(v);
1916                if (unicode_decode_call_errorhandler(
1917                    errors, &errorHandler,
1918                    "unicodeescape", "illegal Unicode character",
1919                    starts, size, &startinpos, &endinpos, &exc, &s,
1920                    (PyObject **)&v, &outpos, &p))
1921                    goto onError;
1922            }
1923            break;
1924
1925        /* \N{name} */
1926        case 'N':
1927            message = "malformed \\N character escape";
1928            if (ucnhash_CAPI == NULL) {
1929                /* load the unicode data module */
1930                PyObject *m, *api;
1931                m = PyImport_ImportModule("unicodedata");
1932                if (m == NULL)
1933                    goto ucnhashError;
1934                api = PyObject_GetAttrString(m, "ucnhash_CAPI");
1935                Py_DECREF(m);
1936                if (api == NULL)
1937                    goto ucnhashError;
1938                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
1939                Py_DECREF(api);
1940                if (ucnhash_CAPI == NULL)
1941                    goto ucnhashError;
1942            }
1943            if (*s == '{') {
1944                const char *start = s+1;
1945                /* look for the closing brace */
1946                while (*s != '}' && s < end)
1947                    s++;
1948                if (s > start && s < end && *s == '}') {
1949                    /* found a name.  look it up in the unicode database */
1950                    message = "unknown Unicode character name";
1951                    s++;
1952                    if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
1953                        goto store;
1954                }
1955            }
1956            endinpos = s-starts;
1957            outpos = p-PyUnicode_AS_UNICODE(v);
1958            if (unicode_decode_call_errorhandler(
1959                errors, &errorHandler,
1960                "unicodeescape", message,
1961                starts, size, &startinpos, &endinpos, &exc, &s,
1962                (PyObject **)&v, &outpos, &p))
1963                goto onError;
1964            break;
1965
1966        default:
1967            if (s > end) {
1968                message = "\\ at end of string";
1969                s--;
1970                endinpos = s-starts;
1971                outpos = p-PyUnicode_AS_UNICODE(v);
1972                if (unicode_decode_call_errorhandler(
1973                    errors, &errorHandler,
1974                    "unicodeescape", message,
1975                    starts, size, &startinpos, &endinpos, &exc, &s,
1976                    (PyObject **)&v, &outpos, &p))
1977                    goto onError;
1978            }
1979            else {
1980                *p++ = '\\';
1981                *p++ = (unsigned char)s[-1];
1982            }
1983            break;
1984        }
1985        nextByte:
1986        ;
1987    }
1988    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
1989        goto onError;
1990    Py_XDECREF(errorHandler);
1991    Py_XDECREF(exc);
1992    return (PyObject *)v;
1993
1994ucnhashError:
1995    PyErr_SetString(
1996        PyExc_UnicodeError,
1997        "\\N escapes not supported (can't load unicodedata module)"
1998        );
1999    Py_XDECREF(v);
2000    Py_XDECREF(errorHandler);
2001    Py_XDECREF(exc);
2002    return NULL;
2003
2004onError:
2005    Py_XDECREF(v);
2006    Py_XDECREF(errorHandler);
2007    Py_XDECREF(exc);
2008    return NULL;
2009}
2010
2011/* Return a Unicode-Escape string version of the Unicode object.
2012
2013   If quotes is true, the string is enclosed in u"" or u'' quotes as
2014   appropriate.
2015
2016*/
2017
2018Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2019                                      Py_ssize_t size,
2020                                      Py_UNICODE ch)
2021{
2022    /* like wcschr, but doesn't stop at NULL characters */
2023
2024    while (size-- > 0) {
2025        if (*s == ch)
2026            return s;
2027        s++;
2028    }
2029
2030    return NULL;
2031}
2032
2033static
2034PyObject *unicodeescape_string(const Py_UNICODE *s,
2035                               Py_ssize_t size,
2036                               int quotes)
2037{
2038    PyObject *repr;
2039    char *p;
2040
2041    static const char *hexdigit = "0123456789abcdef";
2042
2043    repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
2044    if (repr == NULL)
2045        return NULL;
2046
2047    p = PyString_AS_STRING(repr);
2048
2049    if (quotes) {
2050        *p++ = 'u';
2051        *p++ = (findchar(s, size, '\'') &&
2052                !findchar(s, size, '"')) ? '"' : '\'';
2053    }
2054    while (size-- > 0) {
2055        Py_UNICODE ch = *s++;
2056
2057        /* Escape quotes and backslashes */
2058        if ((quotes &&
2059	     ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
2060            *p++ = '\\';
2061            *p++ = (char) ch;
2062	    continue;
2063        }
2064
2065#ifdef Py_UNICODE_WIDE
2066        /* Map 21-bit characters to '\U00xxxxxx' */
2067        else if (ch >= 0x10000) {
2068	    Py_ssize_t offset = p - PyString_AS_STRING(repr);
2069
2070	    /* Resize the string if necessary */
2071	    if (offset + 12 > PyString_GET_SIZE(repr)) {
2072		if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
2073		    return NULL;
2074		p = PyString_AS_STRING(repr) + offset;
2075	    }
2076
2077            *p++ = '\\';
2078            *p++ = 'U';
2079            *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2080            *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2081            *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2082            *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2083            *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2084            *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2085            *p++ = hexdigit[(ch >> 4) & 0x0000000F];
2086            *p++ = hexdigit[ch & 0x0000000F];
2087	    continue;
2088        }
2089#endif
2090	/* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
2091	else if (ch >= 0xD800 && ch < 0xDC00) {
2092	    Py_UNICODE ch2;
2093	    Py_UCS4 ucs;
2094
2095	    ch2 = *s++;
2096	    size--;
2097	    if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2098		ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2099		*p++ = '\\';
2100		*p++ = 'U';
2101		*p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2102		*p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2103		*p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2104		*p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2105		*p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2106		*p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2107		*p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2108		*p++ = hexdigit[ucs & 0x0000000F];
2109		continue;
2110	    }
2111	    /* Fall through: isolated surrogates are copied as-is */
2112	    s--;
2113	    size++;
2114	}
2115
2116        /* Map 16-bit characters to '\uxxxx' */
2117        if (ch >= 256) {
2118            *p++ = '\\';
2119            *p++ = 'u';
2120            *p++ = hexdigit[(ch >> 12) & 0x000F];
2121            *p++ = hexdigit[(ch >> 8) & 0x000F];
2122            *p++ = hexdigit[(ch >> 4) & 0x000F];
2123            *p++ = hexdigit[ch & 0x000F];
2124        }
2125
2126        /* Map special whitespace to '\t', \n', '\r' */
2127        else if (ch == '\t') {
2128            *p++ = '\\';
2129            *p++ = 't';
2130        }
2131        else if (ch == '\n') {
2132            *p++ = '\\';
2133            *p++ = 'n';
2134        }
2135        else if (ch == '\r') {
2136            *p++ = '\\';
2137            *p++ = 'r';
2138        }
2139
2140        /* Map non-printable US ASCII to '\xhh' */
2141        else if (ch < ' ' || ch >= 0x7F) {
2142            *p++ = '\\';
2143            *p++ = 'x';
2144            *p++ = hexdigit[(ch >> 4) & 0x000F];
2145            *p++ = hexdigit[ch & 0x000F];
2146        }
2147
2148        /* Copy everything else as-is */
2149        else
2150            *p++ = (char) ch;
2151    }
2152    if (quotes)
2153        *p++ = PyString_AS_STRING(repr)[1];
2154
2155    *p = '\0';
2156    _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
2157    return repr;
2158}
2159
2160PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2161					Py_ssize_t size)
2162{
2163    return unicodeescape_string(s, size, 0);
2164}
2165
2166PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2167{
2168    if (!PyUnicode_Check(unicode)) {
2169        PyErr_BadArgument();
2170        return NULL;
2171    }
2172    return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2173					 PyUnicode_GET_SIZE(unicode));
2174}
2175
2176/* --- Raw Unicode Escape Codec ------------------------------------------- */
2177
2178PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2179					   Py_ssize_t size,
2180					   const char *errors)
2181{
2182    const char *starts = s;
2183    Py_ssize_t startinpos;
2184    Py_ssize_t endinpos;
2185    Py_ssize_t outpos;
2186    PyUnicodeObject *v;
2187    Py_UNICODE *p;
2188    const char *end;
2189    const char *bs;
2190    PyObject *errorHandler = NULL;
2191    PyObject *exc = NULL;
2192
2193    /* Escaped strings will always be longer than the resulting
2194       Unicode string, so we start with size here and then reduce the
2195       length after conversion to the true value. (But decoding error
2196       handler might have to resize the string) */
2197    v = _PyUnicode_New(size);
2198    if (v == NULL)
2199	goto onError;
2200    if (size == 0)
2201	return (PyObject *)v;
2202    p = PyUnicode_AS_UNICODE(v);
2203    end = s + size;
2204    while (s < end) {
2205	unsigned char c;
2206	Py_UCS4 x;
2207	int i;
2208        int count;
2209
2210	/* Non-escape characters are interpreted as Unicode ordinals */
2211	if (*s != '\\') {
2212	    *p++ = (unsigned char)*s++;
2213	    continue;
2214	}
2215	startinpos = s-starts;
2216
2217	/* \u-escapes are only interpreted iff the number of leading
2218	   backslashes if odd */
2219	bs = s;
2220	for (;s < end;) {
2221	    if (*s != '\\')
2222		break;
2223	    *p++ = (unsigned char)*s++;
2224	}
2225	if (((s - bs) & 1) == 0 ||
2226	    s >= end ||
2227	    (*s != 'u' && *s != 'U')) {
2228	    continue;
2229	}
2230	p--;
2231        count = *s=='u' ? 4 : 8;
2232	s++;
2233
2234	/* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
2235	outpos = p-PyUnicode_AS_UNICODE(v);
2236	for (x = 0, i = 0; i < count; ++i, ++s) {
2237	    c = (unsigned char)*s;
2238	    if (!isxdigit(c)) {
2239		endinpos = s-starts;
2240		if (unicode_decode_call_errorhandler(
2241		    errors, &errorHandler,
2242		    "rawunicodeescape", "truncated \\uXXXX",
2243		    starts, size, &startinpos, &endinpos, &exc, &s,
2244		    (PyObject **)&v, &outpos, &p))
2245		    goto onError;
2246		goto nextByte;
2247	    }
2248	    x = (x<<4) & ~0xF;
2249	    if (c >= '0' && c <= '9')
2250		x += c - '0';
2251	    else if (c >= 'a' && c <= 'f')
2252		x += 10 + c - 'a';
2253	    else
2254		x += 10 + c - 'A';
2255	}
2256#ifndef Py_UNICODE_WIDE
2257        if (x > 0x10000) {
2258            if (unicode_decode_call_errorhandler(
2259                    errors, &errorHandler,
2260                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
2261		    starts, size, &startinpos, &endinpos, &exc, &s,
2262		    (PyObject **)&v, &outpos, &p))
2263		    goto onError;
2264        }
2265#endif
2266	*p++ = x;
2267	nextByte:
2268	;
2269    }
2270    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2271	goto onError;
2272    Py_XDECREF(errorHandler);
2273    Py_XDECREF(exc);
2274    return (PyObject *)v;
2275
2276 onError:
2277    Py_XDECREF(v);
2278    Py_XDECREF(errorHandler);
2279    Py_XDECREF(exc);
2280    return NULL;
2281}
2282
2283PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2284					   Py_ssize_t size)
2285{
2286    PyObject *repr;
2287    char *p;
2288    char *q;
2289
2290    static const char *hexdigit = "0123456789abcdef";
2291
2292#ifdef Py_UNICODE_WIDE
2293    repr = PyString_FromStringAndSize(NULL, 10 * size);
2294#else
2295    repr = PyString_FromStringAndSize(NULL, 6 * size);
2296#endif
2297    if (repr == NULL)
2298        return NULL;
2299    if (size == 0)
2300	return repr;
2301
2302    p = q = PyString_AS_STRING(repr);
2303    while (size-- > 0) {
2304        Py_UNICODE ch = *s++;
2305#ifdef Py_UNICODE_WIDE
2306	/* Map 32-bit characters to '\Uxxxxxxxx' */
2307	if (ch >= 0x10000) {
2308            *p++ = '\\';
2309            *p++ = 'U';
2310            *p++ = hexdigit[(ch >> 28) & 0xf];
2311            *p++ = hexdigit[(ch >> 24) & 0xf];
2312            *p++ = hexdigit[(ch >> 20) & 0xf];
2313            *p++ = hexdigit[(ch >> 16) & 0xf];
2314            *p++ = hexdigit[(ch >> 12) & 0xf];
2315            *p++ = hexdigit[(ch >> 8) & 0xf];
2316            *p++ = hexdigit[(ch >> 4) & 0xf];
2317            *p++ = hexdigit[ch & 15];
2318        }
2319        else
2320#endif
2321	/* Map 16-bit characters to '\uxxxx' */
2322	if (ch >= 256) {
2323            *p++ = '\\';
2324            *p++ = 'u';
2325            *p++ = hexdigit[(ch >> 12) & 0xf];
2326            *p++ = hexdigit[(ch >> 8) & 0xf];
2327            *p++ = hexdigit[(ch >> 4) & 0xf];
2328            *p++ = hexdigit[ch & 15];
2329        }
2330	/* Copy everything else as-is */
2331	else
2332            *p++ = (char) ch;
2333    }
2334    *p = '\0';
2335    _PyString_Resize(&repr, p - q);
2336    return repr;
2337}
2338
2339PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2340{
2341    if (!PyUnicode_Check(unicode)) {
2342	PyErr_BadArgument();
2343	return NULL;
2344    }
2345    return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2346					    PyUnicode_GET_SIZE(unicode));
2347}
2348
2349/* --- Unicode Internal Codec ------------------------------------------- */
2350
2351PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
2352					   Py_ssize_t size,
2353					   const char *errors)
2354{
2355    const char *starts = s;
2356    Py_ssize_t startinpos;
2357    Py_ssize_t endinpos;
2358    Py_ssize_t outpos;
2359    PyUnicodeObject *v;
2360    Py_UNICODE *p;
2361    const char *end;
2362    const char *reason;
2363    PyObject *errorHandler = NULL;
2364    PyObject *exc = NULL;
2365
2366#ifdef Py_UNICODE_WIDE
2367    Py_UNICODE unimax = PyUnicode_GetMax();
2368#endif
2369
2370    v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2371    if (v == NULL)
2372	goto onError;
2373    if (PyUnicode_GetSize((PyObject *)v) == 0)
2374	return (PyObject *)v;
2375    p = PyUnicode_AS_UNICODE(v);
2376    end = s + size;
2377
2378    while (s < end) {
2379        memcpy(p, s, sizeof(Py_UNICODE));
2380        /* We have to sanity check the raw data, otherwise doom looms for
2381           some malformed UCS-4 data. */
2382        if (
2383            #ifdef Py_UNICODE_WIDE
2384            *p > unimax || *p < 0 ||
2385            #endif
2386            end-s < Py_UNICODE_SIZE
2387            )
2388            {
2389            startinpos = s - starts;
2390            if (end-s < Py_UNICODE_SIZE) {
2391                endinpos = end-starts;
2392                reason = "truncated input";
2393            }
2394            else {
2395                endinpos = s - starts + Py_UNICODE_SIZE;
2396                reason = "illegal code point (> 0x10FFFF)";
2397            }
2398            outpos = p - PyUnicode_AS_UNICODE(v);
2399            if (unicode_decode_call_errorhandler(
2400                    errors, &errorHandler,
2401                    "unicode_internal", reason,
2402                    starts, size, &startinpos, &endinpos, &exc, &s,
2403                    (PyObject **)&v, &outpos, &p)) {
2404                goto onError;
2405            }
2406        }
2407        else {
2408            p++;
2409            s += Py_UNICODE_SIZE;
2410        }
2411    }
2412
2413    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2414        goto onError;
2415    Py_XDECREF(errorHandler);
2416    Py_XDECREF(exc);
2417    return (PyObject *)v;
2418
2419 onError:
2420    Py_XDECREF(v);
2421    Py_XDECREF(errorHandler);
2422    Py_XDECREF(exc);
2423    return NULL;
2424}
2425
2426/* --- Latin-1 Codec ------------------------------------------------------ */
2427
2428PyObject *PyUnicode_DecodeLatin1(const char *s,
2429				 Py_ssize_t size,
2430				 const char *errors)
2431{
2432    PyUnicodeObject *v;
2433    Py_UNICODE *p;
2434
2435    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
2436    if (size == 1) {
2437	Py_UNICODE r = *(unsigned char*)s;
2438	return PyUnicode_FromUnicode(&r, 1);
2439    }
2440
2441    v = _PyUnicode_New(size);
2442    if (v == NULL)
2443	goto onError;
2444    if (size == 0)
2445	return (PyObject *)v;
2446    p = PyUnicode_AS_UNICODE(v);
2447    while (size-- > 0)
2448	*p++ = (unsigned char)*s++;
2449    return (PyObject *)v;
2450
2451 onError:
2452    Py_XDECREF(v);
2453    return NULL;
2454}
2455
2456/* create or adjust a UnicodeEncodeError */
2457static void make_encode_exception(PyObject **exceptionObject,
2458    const char *encoding,
2459    const Py_UNICODE *unicode, Py_ssize_t size,
2460    Py_ssize_t startpos, Py_ssize_t endpos,
2461    const char *reason)
2462{
2463    if (*exceptionObject == NULL) {
2464	*exceptionObject = PyUnicodeEncodeError_Create(
2465	    encoding, unicode, size, startpos, endpos, reason);
2466    }
2467    else {
2468	if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2469	    goto onError;
2470	if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2471	    goto onError;
2472	if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2473	    goto onError;
2474	return;
2475	onError:
2476	Py_DECREF(*exceptionObject);
2477	*exceptionObject = NULL;
2478    }
2479}
2480
2481/* raises a UnicodeEncodeError */
2482static void raise_encode_exception(PyObject **exceptionObject,
2483    const char *encoding,
2484    const Py_UNICODE *unicode, Py_ssize_t size,
2485    Py_ssize_t startpos, Py_ssize_t endpos,
2486    const char *reason)
2487{
2488    make_encode_exception(exceptionObject,
2489	encoding, unicode, size, startpos, endpos, reason);
2490    if (*exceptionObject != NULL)
2491	PyCodec_StrictErrors(*exceptionObject);
2492}
2493
2494/* error handling callback helper:
2495   build arguments, call the callback and check the arguments,
2496   put the result into newpos and return the replacement string, which
2497   has to be freed by the caller */
2498static PyObject *unicode_encode_call_errorhandler(const char *errors,
2499    PyObject **errorHandler,
2500    const char *encoding, const char *reason,
2501    const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2502    Py_ssize_t startpos, Py_ssize_t endpos,
2503    Py_ssize_t *newpos)
2504{
2505    static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
2506
2507    PyObject *restuple;
2508    PyObject *resunicode;
2509
2510    if (*errorHandler == NULL) {
2511	*errorHandler = PyCodec_LookupError(errors);
2512        if (*errorHandler == NULL)
2513	    return NULL;
2514    }
2515
2516    make_encode_exception(exceptionObject,
2517	encoding, unicode, size, startpos, endpos, reason);
2518    if (*exceptionObject == NULL)
2519	return NULL;
2520
2521    restuple = PyObject_CallFunctionObjArgs(
2522	*errorHandler, *exceptionObject, NULL);
2523    if (restuple == NULL)
2524	return NULL;
2525    if (!PyTuple_Check(restuple)) {
2526	PyErr_Format(PyExc_TypeError, &argparse[4]);
2527	Py_DECREF(restuple);
2528	return NULL;
2529    }
2530    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2531	&resunicode, newpos)) {
2532	Py_DECREF(restuple);
2533	return NULL;
2534    }
2535    if (*newpos<0)
2536	*newpos = size+*newpos;
2537    if (*newpos<0 || *newpos>size) {
2538	PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
2539	Py_DECREF(restuple);
2540	return NULL;
2541    }
2542    Py_INCREF(resunicode);
2543    Py_DECREF(restuple);
2544    return resunicode;
2545}
2546
2547static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2548				 Py_ssize_t size,
2549				 const char *errors,
2550				 int limit)
2551{
2552    /* output object */
2553    PyObject *res;
2554    /* pointers to the beginning and end+1 of input */
2555    const Py_UNICODE *startp = p;
2556    const Py_UNICODE *endp = p + size;
2557    /* pointer to the beginning of the unencodable characters */
2558    /* const Py_UNICODE *badp = NULL; */
2559    /* pointer into the output */
2560    char *str;
2561    /* current output position */
2562    Py_ssize_t respos = 0;
2563    Py_ssize_t ressize;
2564    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2565    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2566    PyObject *errorHandler = NULL;
2567    PyObject *exc = NULL;
2568    /* the following variable is used for caching string comparisons
2569     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2570    int known_errorHandler = -1;
2571
2572    /* allocate enough for a simple encoding without
2573       replacements, if we need more, we'll resize */
2574    res = PyString_FromStringAndSize(NULL, size);
2575    if (res == NULL)
2576        goto onError;
2577    if (size == 0)
2578	return res;
2579    str = PyString_AS_STRING(res);
2580    ressize = size;
2581
2582    while (p<endp) {
2583	Py_UNICODE c = *p;
2584
2585	/* can we encode this? */
2586	if (c<limit) {
2587	    /* no overflow check, because we know that the space is enough */
2588	    *str++ = (char)c;
2589	    ++p;
2590	}
2591	else {
2592	    Py_ssize_t unicodepos = p-startp;
2593	    Py_ssize_t requiredsize;
2594	    PyObject *repunicode;
2595	    Py_ssize_t repsize;
2596	    Py_ssize_t newpos;
2597	    Py_ssize_t respos;
2598	    Py_UNICODE *uni2;
2599	    /* startpos for collecting unencodable chars */
2600	    const Py_UNICODE *collstart = p;
2601	    const Py_UNICODE *collend = p;
2602	    /* find all unecodable characters */
2603	    while ((collend < endp) && ((*collend)>=limit))
2604		++collend;
2605	    /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2606	    if (known_errorHandler==-1) {
2607		if ((errors==NULL) || (!strcmp(errors, "strict")))
2608		    known_errorHandler = 1;
2609		else if (!strcmp(errors, "replace"))
2610		    known_errorHandler = 2;
2611		else if (!strcmp(errors, "ignore"))
2612		    known_errorHandler = 3;
2613		else if (!strcmp(errors, "xmlcharrefreplace"))
2614		    known_errorHandler = 4;
2615		else
2616		    known_errorHandler = 0;
2617	    }
2618	    switch (known_errorHandler) {
2619		case 1: /* strict */
2620		    raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2621		    goto onError;
2622		case 2: /* replace */
2623		    while (collstart++<collend)
2624			*str++ = '?'; /* fall through */
2625		case 3: /* ignore */
2626		    p = collend;
2627		    break;
2628		case 4: /* xmlcharrefreplace */
2629		    respos = str-PyString_AS_STRING(res);
2630		    /* determine replacement size (temporarily (mis)uses p) */
2631		    for (p = collstart, repsize = 0; p < collend; ++p) {
2632			if (*p<10)
2633			    repsize += 2+1+1;
2634			else if (*p<100)
2635			    repsize += 2+2+1;
2636			else if (*p<1000)
2637			    repsize += 2+3+1;
2638			else if (*p<10000)
2639			    repsize += 2+4+1;
2640#ifndef Py_UNICODE_WIDE
2641			else
2642			    repsize += 2+5+1;
2643#else
2644			else if (*p<100000)
2645			    repsize += 2+5+1;
2646			else if (*p<1000000)
2647			    repsize += 2+6+1;
2648			else
2649			    repsize += 2+7+1;
2650#endif
2651		    }
2652		    requiredsize = respos+repsize+(endp-collend);
2653		    if (requiredsize > ressize) {
2654			if (requiredsize<2*ressize)
2655			    requiredsize = 2*ressize;
2656			if (_PyString_Resize(&res, requiredsize))
2657			    goto onError;
2658			str = PyString_AS_STRING(res) + respos;
2659			ressize = requiredsize;
2660		    }
2661		    /* generate replacement (temporarily (mis)uses p) */
2662		    for (p = collstart; p < collend; ++p) {
2663			str += sprintf(str, "&#%d;", (int)*p);
2664		    }
2665		    p = collend;
2666		    break;
2667		default:
2668		    repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2669			encoding, reason, startp, size, &exc,
2670			collstart-startp, collend-startp, &newpos);
2671		    if (repunicode == NULL)
2672			goto onError;
2673		    /* need more space? (at least enough for what we
2674		       have+the replacement+the rest of the string, so
2675		       we won't have to check space for encodable characters) */
2676		    respos = str-PyString_AS_STRING(res);
2677		    repsize = PyUnicode_GET_SIZE(repunicode);
2678		    requiredsize = respos+repsize+(endp-collend);
2679		    if (requiredsize > ressize) {
2680			if (requiredsize<2*ressize)
2681			    requiredsize = 2*ressize;
2682			if (_PyString_Resize(&res, requiredsize)) {
2683			    Py_DECREF(repunicode);
2684			    goto onError;
2685			}
2686			str = PyString_AS_STRING(res) + respos;
2687			ressize = requiredsize;
2688		    }
2689		    /* check if there is anything unencodable in the replacement
2690		       and copy it to the output */
2691		    for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2692			c = *uni2;
2693			if (c >= limit) {
2694			    raise_encode_exception(&exc, encoding, startp, size,
2695				unicodepos, unicodepos+1, reason);
2696			    Py_DECREF(repunicode);
2697			    goto onError;
2698			}
2699			*str = (char)c;
2700		    }
2701		    p = startp + newpos;
2702		    Py_DECREF(repunicode);
2703	    }
2704	}
2705    }
2706    /* Resize if we allocated to much */
2707    respos = str-PyString_AS_STRING(res);
2708    if (respos<ressize)
2709       /* If this falls res will be NULL */
2710	_PyString_Resize(&res, respos);
2711    Py_XDECREF(errorHandler);
2712    Py_XDECREF(exc);
2713    return res;
2714
2715    onError:
2716    Py_XDECREF(res);
2717    Py_XDECREF(errorHandler);
2718    Py_XDECREF(exc);
2719    return NULL;
2720}
2721
2722PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2723				 Py_ssize_t size,
2724				 const char *errors)
2725{
2726    return unicode_encode_ucs1(p, size, errors, 256);
2727}
2728
2729PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2730{
2731    if (!PyUnicode_Check(unicode)) {
2732	PyErr_BadArgument();
2733	return NULL;
2734    }
2735    return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2736				  PyUnicode_GET_SIZE(unicode),
2737				  NULL);
2738}
2739
2740/* --- 7-bit ASCII Codec -------------------------------------------------- */
2741
2742PyObject *PyUnicode_DecodeASCII(const char *s,
2743				Py_ssize_t size,
2744				const char *errors)
2745{
2746    const char *starts = s;
2747    PyUnicodeObject *v;
2748    Py_UNICODE *p;
2749    Py_ssize_t startinpos;
2750    Py_ssize_t endinpos;
2751    Py_ssize_t outpos;
2752    const char *e;
2753    PyObject *errorHandler = NULL;
2754    PyObject *exc = NULL;
2755
2756    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
2757    if (size == 1 && *(unsigned char*)s < 128) {
2758	Py_UNICODE r = *(unsigned char*)s;
2759	return PyUnicode_FromUnicode(&r, 1);
2760    }
2761
2762    v = _PyUnicode_New(size);
2763    if (v == NULL)
2764	goto onError;
2765    if (size == 0)
2766	return (PyObject *)v;
2767    p = PyUnicode_AS_UNICODE(v);
2768    e = s + size;
2769    while (s < e) {
2770	register unsigned char c = (unsigned char)*s;
2771	if (c < 128) {
2772	    *p++ = c;
2773	    ++s;
2774	}
2775	else {
2776	    startinpos = s-starts;
2777	    endinpos = startinpos + 1;
2778	    outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
2779	    if (unicode_decode_call_errorhandler(
2780		 errors, &errorHandler,
2781		 "ascii", "ordinal not in range(128)",
2782		 starts, size, &startinpos, &endinpos, &exc, &s,
2783		 (PyObject **)&v, &outpos, &p))
2784		goto onError;
2785	}
2786    }
2787    if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
2788	if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2789	    goto onError;
2790    Py_XDECREF(errorHandler);
2791    Py_XDECREF(exc);
2792    return (PyObject *)v;
2793
2794 onError:
2795    Py_XDECREF(v);
2796    Py_XDECREF(errorHandler);
2797    Py_XDECREF(exc);
2798    return NULL;
2799}
2800
2801PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2802				Py_ssize_t size,
2803				const char *errors)
2804{
2805    return unicode_encode_ucs1(p, size, errors, 128);
2806}
2807
2808PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2809{
2810    if (!PyUnicode_Check(unicode)) {
2811	PyErr_BadArgument();
2812	return NULL;
2813    }
2814    return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2815				 PyUnicode_GET_SIZE(unicode),
2816				 NULL);
2817}
2818
2819#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
2820
2821/* --- MBCS codecs for Windows -------------------------------------------- */
2822
2823#if SIZEOF_INT < SIZEOF_SSIZE_T
2824#define NEED_RETRY
2825#endif
2826
2827/* XXX This code is limited to "true" double-byte encodings, as
2828   a) it assumes an incomplete character consists of a single byte, and
2829   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
2830      encodings, see IsDBCSLeadByteEx documentation. */
2831
/* Decide whether the byte at s[offset] acts as a DBCS lead byte.

   Returns 0 if IsDBCSLeadByte() rejects the byte.  Otherwise the
   preceding character, as located by CharPrev(), is examined: the
   result is true if there is no preceding character, if its first byte
   is not a lead byte, or if it is two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
}
2842
2843/*
2844 * Decode MBCS string into unicode object. If 'final' is set, converts
2845 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
2846 */
2847static int decode_mbcs(PyUnicodeObject **v,
2848			const char *s, /* MBCS string */
2849			int size, /* sizeof MBCS string */
2850			int final)
2851{
2852    Py_UNICODE *p;
2853    Py_ssize_t n = 0;
2854    int usize = 0;
2855
2856    assert(size >= 0);
2857
2858    /* Skip trailing lead-byte unless 'final' is set */
2859    if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
2860	--size;
2861
2862    /* First get the size of the result */
2863    if (size > 0) {
2864	usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2865	if (usize == 0) {
2866	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
2867	    return -1;
2868	}
2869    }
2870
2871    if (*v == NULL) {
2872	/* Create unicode object */
2873	*v = _PyUnicode_New(usize);
2874	if (*v == NULL)
2875	    return -1;
2876    }
2877    else {
2878	/* Extend unicode object */
2879	n = PyUnicode_GET_SIZE(*v);
2880	if (_PyUnicode_Resize(v, n + usize) < 0)
2881	    return -1;
2882    }
2883
2884    /* Do the conversion */
2885    if (size > 0) {
2886	p = PyUnicode_AS_UNICODE(*v) + n;
2887	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2888	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
2889	    return -1;
2890	}
2891    }
2892
2893    return size;
2894}
2895
2896PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
2897					Py_ssize_t size,
2898					const char *errors,
2899					Py_ssize_t *consumed)
2900{
2901    PyUnicodeObject *v = NULL;
2902    int done;
2903
2904    if (consumed)
2905	*consumed = 0;
2906
2907#ifdef NEED_RETRY
2908  retry:
2909    if (size > INT_MAX)
2910	done = decode_mbcs(&v, s, INT_MAX, 0);
2911    else
2912#endif
2913	done = decode_mbcs(&v, s, (int)size, !consumed);
2914
2915    if (done < 0) {
2916        Py_XDECREF(v);
2917	return NULL;
2918    }
2919
2920    if (consumed)
2921	*consumed += done;
2922
2923#ifdef NEED_RETRY
2924    if (size > INT_MAX) {
2925	s += done;
2926	size -= done;
2927	goto retry;
2928    }
2929#endif
2930
2931    return (PyObject *)v;
2932}
2933
2934PyObject *PyUnicode_DecodeMBCS(const char *s,
2935				Py_ssize_t size,
2936				const char *errors)
2937{
2938    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
2939}
2940
2941/*
2942 * Convert unicode into string object (MBCS).
2943 * Returns 0 if succeed, -1 otherwise.
2944 */
2945static int encode_mbcs(PyObject **repr,
2946			const Py_UNICODE *p, /* unicode */
2947			int size) /* size of unicode */
2948{
2949    int mbcssize = 0;
2950    Py_ssize_t n = 0;
2951
2952    assert(size >= 0);
2953
2954    /* First get the size of the result */
2955    if (size > 0) {
2956	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
2957	if (mbcssize == 0) {
2958	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
2959	    return -1;
2960	}
2961    }
2962
2963    if (*repr == NULL) {
2964	/* Create string object */
2965	*repr = PyString_FromStringAndSize(NULL, mbcssize);
2966	if (*repr == NULL)
2967	    return -1;
2968    }
2969    else {
2970	/* Extend string object */
2971	n = PyString_Size(*repr);
2972	if (_PyString_Resize(repr, n + mbcssize) < 0)
2973	    return -1;
2974    }
2975
2976    /* Do the conversion */
2977    if (size > 0) {
2978	char *s = PyString_AS_STRING(*repr) + n;
2979	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2980	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
2981	    return -1;
2982	}
2983    }
2984
2985    return 0;
2986}
2987
2988PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2989				Py_ssize_t size,
2990				const char *errors)
2991{
2992    PyObject *repr = NULL;
2993    int ret;
2994
2995#ifdef NEED_RETRY
2996 retry:
2997    if (size > INT_MAX)
2998	ret = encode_mbcs(&repr, p, INT_MAX);
2999    else
3000#endif
3001	ret = encode_mbcs(&repr, p, (int)size);
3002
3003    if (ret < 0) {
3004	Py_XDECREF(repr);
3005	return NULL;
3006    }
3007
3008#ifdef NEED_RETRY
3009    if (size > INT_MAX) {
3010	p += INT_MAX;
3011	size -= INT_MAX;
3012	goto retry;
3013    }
3014#endif
3015
3016    return repr;
3017}
3018
3019PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3020{
3021    if (!PyUnicode_Check(unicode)) {
3022        PyErr_BadArgument();
3023        return NULL;
3024    }
3025    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3026				PyUnicode_GET_SIZE(unicode),
3027				NULL);
3028}
3029
3030#undef NEED_RETRY
3031
3032#endif /* MS_WINDOWS */
3033
3034/* --- Character Mapping Codec -------------------------------------------- */
3035
3036PyObject *PyUnicode_DecodeCharmap(const char *s,
3037				  Py_ssize_t size,
3038				  PyObject *mapping,
3039				  const char *errors)
3040{
3041    const char *starts = s;
3042    Py_ssize_t startinpos;
3043    Py_ssize_t endinpos;
3044    Py_ssize_t outpos;
3045    const char *e;
3046    PyUnicodeObject *v;
3047    Py_UNICODE *p;
3048    Py_ssize_t extrachars = 0;
3049    PyObject *errorHandler = NULL;
3050    PyObject *exc = NULL;
3051    Py_UNICODE *mapstring = NULL;
3052    Py_ssize_t maplen = 0;
3053
3054    /* Default to Latin-1 */
3055    if (mapping == NULL)
3056	return PyUnicode_DecodeLatin1(s, size, errors);
3057
3058    v = _PyUnicode_New(size);
3059    if (v == NULL)
3060	goto onError;
3061    if (size == 0)
3062	return (PyObject *)v;
3063    p = PyUnicode_AS_UNICODE(v);
3064    e = s + size;
3065    if (PyUnicode_CheckExact(mapping)) {
3066	mapstring = PyUnicode_AS_UNICODE(mapping);
3067	maplen = PyUnicode_GET_SIZE(mapping);
3068	while (s < e) {
3069	    unsigned char ch = *s;
3070	    Py_UNICODE x = 0xfffe; /* illegal value */
3071
3072	    if (ch < maplen)
3073		x = mapstring[ch];
3074
3075	    if (x == 0xfffe) {
3076		/* undefined mapping */
3077		outpos = p-PyUnicode_AS_UNICODE(v);
3078		startinpos = s-starts;
3079		endinpos = startinpos+1;
3080		if (unicode_decode_call_errorhandler(
3081		     errors, &errorHandler,
3082		     "charmap", "character maps to <undefined>",
3083		     starts, size, &startinpos, &endinpos, &exc, &s,
3084		     (PyObject **)&v, &outpos, &p)) {
3085		    goto onError;
3086		}
3087		continue;
3088	    }
3089	    *p++ = x;
3090	    ++s;
3091	}
3092    }
3093    else {
3094	while (s < e) {
3095	    unsigned char ch = *s;
3096	    PyObject *w, *x;
3097
3098	    /* Get mapping (char ordinal -> integer, Unicode char or None) */
3099	    w = PyInt_FromLong((long)ch);
3100	    if (w == NULL)
3101		goto onError;
3102	    x = PyObject_GetItem(mapping, w);
3103	    Py_DECREF(w);
3104	    if (x == NULL) {
3105		if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3106		    /* No mapping found means: mapping is undefined. */
3107		    PyErr_Clear();
3108		    x = Py_None;
3109		    Py_INCREF(x);
3110		} else
3111		    goto onError;
3112	    }
3113
3114	    /* Apply mapping */
3115	    if (PyInt_Check(x)) {
3116		long value = PyInt_AS_LONG(x);
3117		if (value < 0 || value > 65535) {
3118		    PyErr_SetString(PyExc_TypeError,
3119				    "character mapping must be in range(65536)");
3120		    Py_DECREF(x);
3121		    goto onError;
3122		}
3123		*p++ = (Py_UNICODE)value;
3124	    }
3125	    else if (x == Py_None) {
3126		/* undefined mapping */
3127		outpos = p-PyUnicode_AS_UNICODE(v);
3128		startinpos = s-starts;
3129		endinpos = startinpos+1;
3130		if (unicode_decode_call_errorhandler(
3131		     errors, &errorHandler,
3132		     "charmap", "character maps to <undefined>",
3133		     starts, size, &startinpos, &endinpos, &exc, &s,
3134		     (PyObject **)&v, &outpos, &p)) {
3135		    Py_DECREF(x);
3136		    goto onError;
3137		}
3138		Py_DECREF(x);
3139		continue;
3140	    }
3141	    else if (PyUnicode_Check(x)) {
3142		Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
3143
3144		if (targetsize == 1)
3145		    /* 1-1 mapping */
3146		    *p++ = *PyUnicode_AS_UNICODE(x);
3147
3148		else if (targetsize > 1) {
3149		    /* 1-n mapping */
3150		    if (targetsize > extrachars) {
3151			/* resize first */
3152			Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3153			Py_ssize_t needed = (targetsize - extrachars) + \
3154				     (targetsize << 2);
3155			extrachars += needed;
3156			if (_PyUnicode_Resize(&v,
3157					     PyUnicode_GET_SIZE(v) + needed) < 0) {
3158			    Py_DECREF(x);
3159			    goto onError;
3160			}
3161			p = PyUnicode_AS_UNICODE(v) + oldpos;
3162		    }
3163		    Py_UNICODE_COPY(p,
3164				    PyUnicode_AS_UNICODE(x),
3165				    targetsize);
3166		    p += targetsize;
3167		    extrachars -= targetsize;
3168		}
3169		/* 1-0 mapping: skip the character */
3170	    }
3171	    else {
3172		/* wrong return value */
3173		PyErr_SetString(PyExc_TypeError,
3174		      "character mapping must return integer, None or unicode");
3175		Py_DECREF(x);
3176		goto onError;
3177	    }
3178	    Py_DECREF(x);
3179	    ++s;
3180	}
3181    }
3182    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
3183	if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3184	    goto onError;
3185    Py_XDECREF(errorHandler);
3186    Py_XDECREF(exc);
3187    return (PyObject *)v;
3188
3189 onError:
3190    Py_XDECREF(errorHandler);
3191    Py_XDECREF(exc);
3192    Py_XDECREF(v);
3193    return NULL;
3194}
3195
3196/* Charmap encoding: the lookup table */
3197
3198struct encoding_map{
3199  PyObject_HEAD
3200  unsigned char level1[32];
3201  int count2, count3;
3202  unsigned char level23[1];
3203};
3204
3205static PyObject*
3206encoding_map_size(PyObject *obj, PyObject* args)
3207{
3208    struct encoding_map *map = (struct encoding_map*)obj;
3209    return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3210                          128*map->count3);
3211}
3212
3213static PyMethodDef encoding_map_methods[] = {
3214	{"size", encoding_map_size, METH_NOARGS,
3215         PyDoc_STR("Return the size (in bytes) of this object") },
3216        { 0 }
3217};
3218
3219static void
3220encoding_map_dealloc(PyObject* o)
3221{
3222	PyObject_FREE(o);
3223}
3224
3225static PyTypeObject EncodingMapType = {
3226	PyObject_HEAD_INIT(NULL)
3227        0,                      /*ob_size*/
3228        "EncodingMap",          /*tp_name*/
3229        sizeof(struct encoding_map),   /*tp_basicsize*/
3230        0,                      /*tp_itemsize*/
3231        /* methods */
3232        encoding_map_dealloc,   /*tp_dealloc*/
3233        0,                      /*tp_print*/
3234        0,                      /*tp_getattr*/
3235        0,                      /*tp_setattr*/
3236        0,                      /*tp_compare*/
3237        0,                      /*tp_repr*/
3238        0,                      /*tp_as_number*/
3239        0,                      /*tp_as_sequence*/
3240        0,                      /*tp_as_mapping*/
3241        0,                      /*tp_hash*/
3242        0,                      /*tp_call*/
3243        0,                      /*tp_str*/
3244        0,                      /*tp_getattro*/
3245        0,                      /*tp_setattro*/
3246        0,                      /*tp_as_buffer*/
3247        Py_TPFLAGS_DEFAULT,     /*tp_flags*/
3248        0,                      /*tp_doc*/
3249        0,                      /*tp_traverse*/
3250        0,                      /*tp_clear*/
3251        0,                      /*tp_richcompare*/
3252        0,                      /*tp_weaklistoffset*/
3253        0,                      /*tp_iter*/
3254        0,                      /*tp_iternext*/
3255        encoding_map_methods,   /*tp_methods*/
3256        0,                      /*tp_members*/
3257        0,                      /*tp_getset*/
3258        0,                      /*tp_base*/
3259        0,                      /*tp_dict*/
3260        0,                      /*tp_descr_get*/
3261        0,                      /*tp_descr_set*/
3262        0,                      /*tp_dictoffset*/
3263        0,                      /*tp_init*/
3264        0,                      /*tp_alloc*/
3265        0,                      /*tp_new*/
3266        0,                      /*tp_free*/
3267        0,                      /*tp_is_gc*/
3268};
3269
3270PyObject*
3271PyUnicode_BuildEncodingMap(PyObject* string)
3272{
3273    Py_UNICODE *decode;
3274    PyObject *result;
3275    struct encoding_map *mresult;
3276    int i;
3277    int need_dict = 0;
3278    unsigned char level1[32];
3279    unsigned char level2[512];
3280    unsigned char *mlevel1, *mlevel2, *mlevel3;
3281    int count2 = 0, count3 = 0;
3282
3283    if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3284        PyErr_BadArgument();
3285        return NULL;
3286    }
3287    decode = PyUnicode_AS_UNICODE(string);
3288    memset(level1, 0xFF, sizeof level1);
3289    memset(level2, 0xFF, sizeof level2);
3290
3291    /* If there isn't a one-to-one mapping of NULL to \0,
3292       or if there are non-BMP characters, we need to use
3293       a mapping dictionary. */
3294    if (decode[0] != 0)
3295        need_dict = 1;
3296    for (i = 1; i < 256; i++) {
3297        int l1, l2;
3298        if (decode[i] == 0
3299            #ifdef Py_UNICODE_WIDE
3300            || decode[i] > 0xFFFF
3301            #endif
3302        ) {
3303            need_dict = 1;
3304            break;
3305        }
3306        if (decode[i] == 0xFFFE)
3307            /* unmapped character */
3308            continue;
3309        l1 = decode[i] >> 11;
3310        l2 = decode[i] >> 7;
3311        if (level1[l1] == 0xFF)
3312            level1[l1] = count2++;
3313        if (level2[l2] == 0xFF)
3314            level2[l2] = count3++;
3315    }
3316
3317    if (count2 >= 0xFF || count3 >= 0xFF)
3318        need_dict = 1;
3319
3320    if (need_dict) {
3321        PyObject *result = PyDict_New();
3322        PyObject *key, *value;
3323        if (!result)
3324            return NULL;
3325        for (i = 0; i < 256; i++) {
3326            key = value = NULL;
3327            key = PyInt_FromLong(decode[i]);
3328            value = PyInt_FromLong(i);
3329            if (!key || !value)
3330                goto failed1;
3331            if (PyDict_SetItem(result, key, value) == -1)
3332                goto failed1;
3333            Py_DECREF(key);
3334            Py_DECREF(value);
3335        }
3336        return result;
3337      failed1:
3338        Py_XDECREF(key);
3339        Py_XDECREF(value);
3340        Py_DECREF(result);
3341        return NULL;
3342    }
3343
3344    /* Create a three-level trie */
3345    result = PyObject_MALLOC(sizeof(struct encoding_map) +
3346                             16*count2 + 128*count3 - 1);
3347    if (!result)
3348        return PyErr_NoMemory();
3349    PyObject_Init(result, &EncodingMapType);
3350    mresult = (struct encoding_map*)result;
3351    mresult->count2 = count2;
3352    mresult->count3 = count3;
3353    mlevel1 = mresult->level1;
3354    mlevel2 = mresult->level23;
3355    mlevel3 = mresult->level23 + 16*count2;
3356    memcpy(mlevel1, level1, 32);
3357    memset(mlevel2, 0xFF, 16*count2);
3358    memset(mlevel3, 0, 128*count3);
3359    count3 = 0;
3360    for (i = 1; i < 256; i++) {
3361        int o1, o2, o3, i2, i3;
3362        if (decode[i] == 0xFFFE)
3363            /* unmapped character */
3364            continue;
3365        o1 = decode[i]>>11;
3366        o2 = (decode[i]>>7) & 0xF;
3367        i2 = 16*mlevel1[o1] + o2;
3368        if (mlevel2[i2] == 0xFF)
3369            mlevel2[i2] = count3++;
3370        o3 = decode[i] & 0x7F;
3371        i3 = 128*mlevel2[i2] + o3;
3372        mlevel3[i3] = i;
3373    }
3374    return result;
3375}
3376
3377static int
3378encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3379{
3380    struct encoding_map *map = (struct encoding_map*)mapping;
3381    int l1 = c>>11;
3382    int l2 = (c>>7) & 0xF;
3383    int l3 = c & 0x7F;
3384    int i;
3385
3386#ifdef Py_UNICODE_WIDE
3387    if (c > 0xFFFF) {
3388	return -1;
3389    }
3390#endif
3391    if (c == 0)
3392        return 0;
3393    /* level 1*/
3394    i = map->level1[l1];
3395    if (i == 0xFF) {
3396        return -1;
3397    }
3398    /* level 2*/
3399    i = map->level23[16*i+l2];
3400    if (i == 0xFF) {
3401        return -1;
3402    }
3403    /* level 3 */
3404    i = map->level23[16*map->count2 + 128*i + l3];
3405    if (i == 0) {
3406        return -1;
3407    }
3408    return i;
3409}
3410
3411/* Lookup the character ch in the mapping. If the character
3412   can't be found, Py_None is returned (or NULL, if another
3413   error occurred). */
3414static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
3415{
3416    PyObject *w = PyInt_FromLong((long)c);
3417    PyObject *x;
3418
3419    if (w == NULL)
3420	 return NULL;
3421    x = PyObject_GetItem(mapping, w);
3422    Py_DECREF(w);
3423    if (x == NULL) {
3424	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3425	    /* No mapping found means: mapping is undefined. */
3426	    PyErr_Clear();
3427	    x = Py_None;
3428	    Py_INCREF(x);
3429	    return x;
3430	} else
3431	    return NULL;
3432    }
3433    else if (x == Py_None)
3434	return x;
3435    else if (PyInt_Check(x)) {
3436	long value = PyInt_AS_LONG(x);
3437	if (value < 0 || value > 255) {
3438	    PyErr_SetString(PyExc_TypeError,
3439			     "character mapping must be in range(256)");
3440	    Py_DECREF(x);
3441	    return NULL;
3442	}
3443	return x;
3444    }
3445    else if (PyString_Check(x))
3446	return x;
3447    else {
3448	/* wrong return value */
3449	PyErr_SetString(PyExc_TypeError,
3450	      "character mapping must return integer, None or str");
3451	Py_DECREF(x);
3452	return NULL;
3453    }
3454}
3455
3456static int
3457charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
3458{
3459	Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3460	/* exponentially overallocate to minimize reallocations */
3461	if (requiredsize < 2*outsize)
3462	    requiredsize = 2*outsize;
3463	if (_PyString_Resize(outobj, requiredsize)) {
3464	    return 0;
3465	}
3466	return 1;
3467}
3468
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded into the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* lookup or resize failed */
} charmapencode_result;
3472/* lookup the character, put the result in the output string and adjust
3473   various state variables. Reallocate the output string if not enough
3474   space is available. Return a new reference to the object that
3475   was put in the output buffer, or Py_None, if the mapping was undefined
3476   (in which case no character was written) or NULL, if a
3477   reallocation error occurred. The caller must decref the result */
3478static
3479charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
3480    PyObject **outobj, Py_ssize_t *outpos)
3481{
3482    PyObject *rep;
3483    char *outstart;
3484    Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3485
3486    if (mapping->ob_type == &EncodingMapType) {
3487        int res = encoding_map_lookup(c, mapping);
3488	Py_ssize_t requiredsize = *outpos+1;
3489        if (res == -1)
3490            return enc_FAILED;
3491	if (outsize<requiredsize)
3492	    if (!charmapencode_resize(outobj, outpos, requiredsize))
3493		return enc_EXCEPTION;
3494        outstart = PyString_AS_STRING(*outobj);
3495	outstart[(*outpos)++] = (char)res;
3496	return enc_SUCCESS;
3497    }
3498
3499    rep = charmapencode_lookup(c, mapping);
3500    if (rep==NULL)
3501	return enc_EXCEPTION;
3502    else if (rep==Py_None) {
3503	Py_DECREF(rep);
3504	return enc_FAILED;
3505    } else {
3506	if (PyInt_Check(rep)) {
3507	    Py_ssize_t requiredsize = *outpos+1;
3508	    if (outsize<requiredsize)
3509		if (!charmapencode_resize(outobj, outpos, requiredsize)) {
3510		    Py_DECREF(rep);
3511		    return enc_EXCEPTION;
3512		}
3513            outstart = PyString_AS_STRING(*outobj);
3514	    outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3515	}
3516	else {
3517	    const char *repchars = PyString_AS_STRING(rep);
3518	    Py_ssize_t repsize = PyString_GET_SIZE(rep);
3519	    Py_ssize_t requiredsize = *outpos+repsize;
3520	    if (outsize<requiredsize)
3521		if (!charmapencode_resize(outobj, outpos, requiredsize)) {
3522		    Py_DECREF(rep);
3523		    return enc_EXCEPTION;
3524		}
3525            outstart = PyString_AS_STRING(*outobj);
3526	    memcpy(outstart + *outpos, repchars, repsize);
3527	    *outpos += repsize;
3528	}
3529    }
3530    Py_DECREF(rep);
3531    return enc_SUCCESS;
3532}
3533
3534/* handle an error in PyUnicode_EncodeCharmap
3535   Return 0 on success, -1 on error */
3536static
3537int charmap_encoding_error(
3538    const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
3539    PyObject **exceptionObject,
3540    int *known_errorHandler, PyObject **errorHandler, const char *errors,
3541    PyObject **res, Py_ssize_t *respos)
3542{
3543    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3544    Py_ssize_t repsize;
3545    Py_ssize_t newpos;
3546    Py_UNICODE *uni2;
3547    /* startpos for collecting unencodable chars */
3548    Py_ssize_t collstartpos = *inpos;
3549    Py_ssize_t collendpos = *inpos+1;
3550    Py_ssize_t collpos;
3551    char *encoding = "charmap";
3552    char *reason = "character maps to <undefined>";
3553    charmapencode_result x;
3554
3555    /* find all unencodable characters */
3556    while (collendpos < size) {
3557        PyObject *rep;
3558        if (mapping->ob_type == &EncodingMapType) {
3559	    int res = encoding_map_lookup(p[collendpos], mapping);
3560	    if (res != -1)
3561		break;
3562	    ++collendpos;
3563	    continue;
3564	}
3565
3566	rep = charmapencode_lookup(p[collendpos], mapping);
3567	if (rep==NULL)
3568	    return -1;
3569	else if (rep!=Py_None) {
3570	    Py_DECREF(rep);
3571	    break;
3572	}
3573	Py_DECREF(rep);
3574	++collendpos;
3575    }
3576    /* cache callback name lookup
3577     * (if not done yet, i.e. it's the first error) */
3578    if (*known_errorHandler==-1) {
3579	if ((errors==NULL) || (!strcmp(errors, "strict")))
3580	    *known_errorHandler = 1;
3581	else if (!strcmp(errors, "replace"))
3582	    *known_errorHandler = 2;
3583	else if (!strcmp(errors, "ignore"))
3584	    *known_errorHandler = 3;
3585	else if (!strcmp(errors, "xmlcharrefreplace"))
3586	    *known_errorHandler = 4;
3587	else
3588	    *known_errorHandler = 0;
3589    }
3590    switch (*known_errorHandler) {
3591	case 1: /* strict */
3592	    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3593	    return -1;
3594	case 2: /* replace */
3595	    for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3596		x = charmapencode_output('?', mapping, res, respos);
3597		if (x==enc_EXCEPTION) {
3598		    return -1;
3599		}
3600		else if (x==enc_FAILED) {
3601		    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3602		    return -1;
3603		}
3604	    }
3605	    /* fall through */
3606	case 3: /* ignore */
3607	    *inpos = collendpos;
3608	    break;
3609	case 4: /* xmlcharrefreplace */
3610	    /* generate replacement (temporarily (mis)uses p) */
3611	    for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3612		char buffer[2+29+1+1];
3613		char *cp;
3614		sprintf(buffer, "&#%d;", (int)p[collpos]);
3615		for (cp = buffer; *cp; ++cp) {
3616		    x = charmapencode_output(*cp, mapping, res, respos);
3617		    if (x==enc_EXCEPTION)
3618			return -1;
3619		    else if (x==enc_FAILED) {
3620			raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3621			return -1;
3622		    }
3623		}
3624	    }
3625	    *inpos = collendpos;
3626	    break;
3627	default:
3628	    repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
3629		encoding, reason, p, size, exceptionObject,
3630		collstartpos, collendpos, &newpos);
3631	    if (repunicode == NULL)
3632		return -1;
3633	    /* generate replacement  */
3634	    repsize = PyUnicode_GET_SIZE(repunicode);
3635	    for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3636		x = charmapencode_output(*uni2, mapping, res, respos);
3637		if (x==enc_EXCEPTION) {
3638		    return -1;
3639		}
3640		else if (x==enc_FAILED) {
3641		    Py_DECREF(repunicode);
3642		    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3643		    return -1;
3644		}
3645	    }
3646	    *inpos = newpos;
3647	    Py_DECREF(repunicode);
3648    }
3649    return 0;
3650}
3651
3652PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
3653				  Py_ssize_t size,
3654				  PyObject *mapping,
3655				  const char *errors)
3656{
3657    /* output object */
3658    PyObject *res = NULL;
3659    /* current input position */
3660    Py_ssize_t inpos = 0;
3661    /* current output position */
3662    Py_ssize_t respos = 0;
3663    PyObject *errorHandler = NULL;
3664    PyObject *exc = NULL;
3665    /* the following variable is used for caching string comparisons
3666     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3667     * 3=ignore, 4=xmlcharrefreplace */
3668    int known_errorHandler = -1;
3669
3670    /* Default to Latin-1 */
3671    if (mapping == NULL)
3672	return PyUnicode_EncodeLatin1(p, size, errors);
3673
3674    /* allocate enough for a simple encoding without
3675       replacements, if we need more, we'll resize */
3676    res = PyString_FromStringAndSize(NULL, size);
3677    if (res == NULL)
3678        goto onError;
3679    if (size == 0)
3680	return res;
3681
3682    while (inpos<size) {
3683	/* try to encode it */
3684	charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
3685	if (x==enc_EXCEPTION) /* error */
3686	    goto onError;
3687	if (x==enc_FAILED) { /* unencodable character */
3688	    if (charmap_encoding_error(p, size, &inpos, mapping,
3689		&exc,
3690		&known_errorHandler, &errorHandler, errors,
3691		&res, &respos)) {
3692		goto onError;
3693	    }
3694	}
3695	else
3696	    /* done with this character => adjust input position */
3697	    ++inpos;
3698    }
3699
3700    /* Resize if we allocated to much */
3701    if (respos<PyString_GET_SIZE(res)) {
3702	if (_PyString_Resize(&res, respos))
3703	    goto onError;
3704    }
3705    Py_XDECREF(exc);
3706    Py_XDECREF(errorHandler);
3707    return res;
3708
3709    onError:
3710    Py_XDECREF(res);
3711    Py_XDECREF(exc);
3712    Py_XDECREF(errorHandler);
3713    return NULL;
3714}
3715
3716PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3717				    PyObject *mapping)
3718{
3719    if (!PyUnicode_Check(unicode) || mapping == NULL) {
3720	PyErr_BadArgument();
3721	return NULL;
3722    }
3723    return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3724				   PyUnicode_GET_SIZE(unicode),
3725				   mapping,
3726				   NULL);
3727}
3728
3729/* create or adjust a UnicodeTranslateError */
3730static void make_translate_exception(PyObject **exceptionObject,
3731    const Py_UNICODE *unicode, Py_ssize_t size,
3732    Py_ssize_t startpos, Py_ssize_t endpos,
3733    const char *reason)
3734{
3735    if (*exceptionObject == NULL) {
3736    	*exceptionObject = PyUnicodeTranslateError_Create(
3737	    unicode, size, startpos, endpos, reason);
3738    }
3739    else {
3740	if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3741	    goto onError;
3742	if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3743	    goto onError;
3744	if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3745	    goto onError;
3746	return;
3747	onError:
3748	Py_DECREF(*exceptionObject);
3749	*exceptionObject = NULL;
3750    }
3751}
3752
3753/* raises a UnicodeTranslateError */
3754static void raise_translate_exception(PyObject **exceptionObject,
3755    const Py_UNICODE *unicode, Py_ssize_t size,
3756    Py_ssize_t startpos, Py_ssize_t endpos,
3757    const char *reason)
3758{
3759    make_translate_exception(exceptionObject,
3760	unicode, size, startpos, endpos, reason);
3761    if (*exceptionObject != NULL)
3762	PyCodec_StrictErrors(*exceptionObject);
3763}
3764
3765/* error handling callback helper:
3766   build arguments, call the callback and check the arguments,
3767   put the result into newpos and return the replacement string, which
3768   has to be freed by the caller */
3769static PyObject *unicode_translate_call_errorhandler(const char *errors,
3770    PyObject **errorHandler,
3771    const char *reason,
3772    const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3773    Py_ssize_t startpos, Py_ssize_t endpos,
3774    Py_ssize_t *newpos)
3775{
3776    static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
3777
3778    Py_ssize_t i_newpos;
3779    PyObject *restuple;
3780    PyObject *resunicode;
3781
3782    if (*errorHandler == NULL) {
3783	*errorHandler = PyCodec_LookupError(errors);
3784        if (*errorHandler == NULL)
3785	    return NULL;
3786    }
3787
3788    make_translate_exception(exceptionObject,
3789	unicode, size, startpos, endpos, reason);
3790    if (*exceptionObject == NULL)
3791	return NULL;
3792
3793    restuple = PyObject_CallFunctionObjArgs(
3794	*errorHandler, *exceptionObject, NULL);
3795    if (restuple == NULL)
3796	return NULL;
3797    if (!PyTuple_Check(restuple)) {
3798	PyErr_Format(PyExc_TypeError, &argparse[4]);
3799	Py_DECREF(restuple);
3800	return NULL;
3801    }
3802    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3803	&resunicode, &i_newpos)) {
3804	Py_DECREF(restuple);
3805	return NULL;
3806    }
3807    if (i_newpos<0)
3808	*newpos = size+i_newpos;
3809    else
3810        *newpos = i_newpos;
3811    if (*newpos<0 || *newpos>size) {
3812	PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3813	Py_DECREF(restuple);
3814	return NULL;
3815    }
3816    Py_INCREF(resunicode);
3817    Py_DECREF(restuple);
3818    return resunicode;
3819}
3820
3821/* Lookup the character ch in the mapping and put the result in result,
3822   which must be decrefed by the caller.
3823   Return 0 on success, -1 on error */
3824static
3825int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3826{
3827    PyObject *w = PyInt_FromLong((long)c);
3828    PyObject *x;
3829
3830    if (w == NULL)
3831	 return -1;
3832    x = PyObject_GetItem(mapping, w);
3833    Py_DECREF(w);
3834    if (x == NULL) {
3835	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3836	    /* No mapping found means: use 1:1 mapping. */
3837	    PyErr_Clear();
3838	    *result = NULL;
3839	    return 0;
3840	} else
3841	    return -1;
3842    }
3843    else if (x == Py_None) {
3844	*result = x;
3845	return 0;
3846    }
3847    else if (PyInt_Check(x)) {
3848	long value = PyInt_AS_LONG(x);
3849	long max = PyUnicode_GetMax();
3850	if (value < 0 || value > max) {
3851	    PyErr_Format(PyExc_TypeError,
3852			     "character mapping must be in range(0x%lx)", max+1);
3853	    Py_DECREF(x);
3854	    return -1;
3855	}
3856	*result = x;
3857	return 0;
3858    }
3859    else if (PyUnicode_Check(x)) {
3860	*result = x;
3861	return 0;
3862    }
3863    else {
3864	/* wrong return value */
3865	PyErr_SetString(PyExc_TypeError,
3866	      "character mapping must return integer, None or unicode");
3867	Py_DECREF(x);
3868	return -1;
3869    }
3870}
3871/* ensure that *outobj is at least requiredsize characters long,
3872if not reallocate and adjust various state variables.
3873Return 0 on success, -1 on error */
3874static
3875int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
3876    Py_ssize_t requiredsize)
3877{
3878    Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
3879    if (requiredsize > oldsize) {
3880	/* remember old output position */
3881	Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
3882	/* exponentially overallocate to minimize reallocations */
3883	if (requiredsize < 2 * oldsize)
3884	    requiredsize = 2 * oldsize;
3885	if (_PyUnicode_Resize(outobj, requiredsize) < 0)
3886	    return -1;
3887	*outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
3888    }
3889    return 0;
3890}
3891/* lookup the character, put the result in the output string and adjust
3892   various state variables. Return a new reference to the object that
3893   was put in the output buffer in *result, or Py_None, if the mapping was
3894   undefined (in which case no character was written).
3895   The called must decref result.
3896   Return 0 on success, -1 on error. */
3897static
3898int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
3899    Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
3900    PyObject **res)
3901{
3902    if (charmaptranslate_lookup(*curinp, mapping, res))
3903	return -1;
3904    if (*res==NULL) {
3905	/* not found => default to 1:1 mapping */
3906	*(*outp)++ = *curinp;
3907    }
3908    else if (*res==Py_None)
3909	;
3910    else if (PyInt_Check(*res)) {
3911	/* no overflow check, because we know that the space is enough */
3912	*(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3913    }
3914    else if (PyUnicode_Check(*res)) {
3915	Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
3916	if (repsize==1) {
3917	    /* no overflow check, because we know that the space is enough */
3918	    *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3919	}
3920	else if (repsize!=0) {
3921	    /* more than one character */
3922	    Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
3923		(insize - (curinp-startinp)) +
3924		repsize - 1;
3925	    if (charmaptranslate_makespace(outobj, outp, requiredsize))
3926		return -1;
3927	    memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3928	    *outp += repsize;
3929	}
3930    }
3931    else
3932	return -1;
3933    return 0;
3934}
3935
3936PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
3937				     Py_ssize_t size,
3938				     PyObject *mapping,
3939				     const char *errors)
3940{
3941    /* output object */
3942    PyObject *res = NULL;
3943    /* pointers to the beginning and end+1 of input */
3944    const Py_UNICODE *startp = p;
3945    const Py_UNICODE *endp = p + size;
3946    /* pointer into the output */
3947    Py_UNICODE *str;
3948    /* current output position */
3949    Py_ssize_t respos = 0;
3950    char *reason = "character maps to <undefined>";
3951    PyObject *errorHandler = NULL;
3952    PyObject *exc = NULL;
3953    /* the following variable is used for caching string comparisons
3954     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3955     * 3=ignore, 4=xmlcharrefreplace */
3956    int known_errorHandler = -1;
3957
3958    if (mapping == NULL) {
3959	PyErr_BadArgument();
3960	return NULL;
3961    }
3962
3963    /* allocate enough for a simple 1:1 translation without
3964       replacements, if we need more, we'll resize */
3965    res = PyUnicode_FromUnicode(NULL, size);
3966    if (res == NULL)
3967	goto onError;
3968    if (size == 0)
3969	return res;
3970    str = PyUnicode_AS_UNICODE(res);
3971
3972    while (p<endp) {
3973	/* try to encode it */
3974	PyObject *x = NULL;
3975	if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
3976	    Py_XDECREF(x);
3977	    goto onError;
3978	}
3979	Py_XDECREF(x);
3980	if (x!=Py_None) /* it worked => adjust input pointer */
3981	    ++p;
3982	else { /* untranslatable character */
3983	    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3984	    Py_ssize_t repsize;
3985	    Py_ssize_t newpos;
3986	    Py_UNICODE *uni2;
3987	    /* startpos for collecting untranslatable chars */
3988	    const Py_UNICODE *collstart = p;
3989	    const Py_UNICODE *collend = p+1;
3990	    const Py_UNICODE *coll;
3991
3992	    /* find all untranslatable characters */
3993	    while (collend < endp) {
3994		if (charmaptranslate_lookup(*collend, mapping, &x))
3995		    goto onError;
3996		Py_XDECREF(x);
3997		if (x!=Py_None)
3998		    break;
3999		++collend;
4000	    }
4001	    /* cache callback name lookup
4002	     * (if not done yet, i.e. it's the first error) */
4003	    if (known_errorHandler==-1) {
4004		if ((errors==NULL) || (!strcmp(errors, "strict")))
4005		    known_errorHandler = 1;
4006		else if (!strcmp(errors, "replace"))
4007		    known_errorHandler = 2;
4008		else if (!strcmp(errors, "ignore"))
4009		    known_errorHandler = 3;
4010		else if (!strcmp(errors, "xmlcharrefreplace"))
4011		    known_errorHandler = 4;
4012		else
4013		    known_errorHandler = 0;
4014	    }
4015	    switch (known_errorHandler) {
4016		case 1: /* strict */
4017		    raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4018		    goto onError;
4019		case 2: /* replace */
4020		    /* No need to check for space, this is a 1:1 replacement */
4021		    for (coll = collstart; coll<collend; ++coll)
4022			*str++ = '?';
4023		    /* fall through */
4024		case 3: /* ignore */
4025		    p = collend;
4026		    break;
4027		case 4: /* xmlcharrefreplace */
4028		    /* generate replacement (temporarily (mis)uses p) */
4029		    for (p = collstart; p < collend; ++p) {
4030			char buffer[2+29+1+1];
4031			char *cp;
4032			sprintf(buffer, "&#%d;", (int)*p);
4033			if (charmaptranslate_makespace(&res, &str,
4034			    (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4035			    goto onError;
4036			for (cp = buffer; *cp; ++cp)
4037			    *str++ = *cp;
4038		    }
4039		    p = collend;
4040		    break;
4041		default:
4042		    repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4043			reason, startp, size, &exc,
4044			collstart-startp, collend-startp, &newpos);
4045		    if (repunicode == NULL)
4046			goto onError;
4047		    /* generate replacement  */
4048		    repsize = PyUnicode_GET_SIZE(repunicode);
4049		    if (charmaptranslate_makespace(&res, &str,
4050			(str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4051			Py_DECREF(repunicode);
4052			goto onError;
4053		    }
4054		    for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4055			*str++ = *uni2;
4056		    p = startp + newpos;
4057		    Py_DECREF(repunicode);
4058	    }
4059	}
4060    }
    /* Resize if we allocated too much */
4062    respos = str-PyUnicode_AS_UNICODE(res);
4063    if (respos<PyUnicode_GET_SIZE(res)) {
4064	if (_PyUnicode_Resize(&res, respos) < 0)
4065	    goto onError;
4066    }
4067    Py_XDECREF(exc);
4068    Py_XDECREF(errorHandler);
4069    return res;
4070
4071    onError:
4072    Py_XDECREF(res);
4073    Py_XDECREF(exc);
4074    Py_XDECREF(errorHandler);
4075    return NULL;
4076}
4077
4078PyObject *PyUnicode_Translate(PyObject *str,
4079			      PyObject *mapping,
4080			      const char *errors)
4081{
4082    PyObject *result;
4083
4084    str = PyUnicode_FromObject(str);
4085    if (str == NULL)
4086	goto onError;
4087    result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4088					PyUnicode_GET_SIZE(str),
4089					mapping,
4090					errors);
4091    Py_DECREF(str);
4092    return result;
4093
4094 onError:
4095    Py_XDECREF(str);
4096    return NULL;
4097}
4098
4099/* --- Decimal Encoder ---------------------------------------------------- */
4100
4101int PyUnicode_EncodeDecimal(Py_UNICODE *s,
4102			    Py_ssize_t length,
4103			    char *output,
4104			    const char *errors)
4105{
4106    Py_UNICODE *p, *end;
4107    PyObject *errorHandler = NULL;
4108    PyObject *exc = NULL;
4109    const char *encoding = "decimal";
4110    const char *reason = "invalid decimal Unicode string";
4111    /* the following variable is used for caching string comparisons
4112     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4113    int known_errorHandler = -1;
4114
4115    if (output == NULL) {
4116	PyErr_BadArgument();
4117	return -1;
4118    }
4119
4120    p = s;
4121    end = s + length;
4122    while (p < end) {
4123	register Py_UNICODE ch = *p;
4124	int decimal;
4125	PyObject *repunicode;
4126	Py_ssize_t repsize;
4127	Py_ssize_t newpos;
4128	Py_UNICODE *uni2;
4129	Py_UNICODE *collstart;
4130	Py_UNICODE *collend;
4131
4132	if (Py_UNICODE_ISSPACE(ch)) {
4133	    *output++ = ' ';
4134	    ++p;
4135	    continue;
4136	}
4137	decimal = Py_UNICODE_TODECIMAL(ch);
4138	if (decimal >= 0) {
4139	    *output++ = '0' + decimal;
4140	    ++p;
4141	    continue;
4142	}
4143	if (0 < ch && ch < 256) {
4144	    *output++ = (char)ch;
4145	    ++p;
4146	    continue;
4147	}
4148	/* All other characters are considered unencodable */
4149	collstart = p;
4150	collend = p+1;
4151	while (collend < end) {
4152	    if ((0 < *collend && *collend < 256) ||
4153	        !Py_UNICODE_ISSPACE(*collend) ||
4154	        Py_UNICODE_TODECIMAL(*collend))
4155		break;
4156	}
4157	/* cache callback name lookup
4158	 * (if not done yet, i.e. it's the first error) */
4159	if (known_errorHandler==-1) {
4160	    if ((errors==NULL) || (!strcmp(errors, "strict")))
4161		known_errorHandler = 1;
4162	    else if (!strcmp(errors, "replace"))
4163		known_errorHandler = 2;
4164	    else if (!strcmp(errors, "ignore"))
4165		known_errorHandler = 3;
4166	    else if (!strcmp(errors, "xmlcharrefreplace"))
4167		known_errorHandler = 4;
4168	    else
4169		known_errorHandler = 0;
4170	}
4171	switch (known_errorHandler) {
4172	    case 1: /* strict */
4173		raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4174		goto onError;
4175	    case 2: /* replace */
4176		for (p = collstart; p < collend; ++p)
4177		    *output++ = '?';
4178		/* fall through */
4179	    case 3: /* ignore */
4180		p = collend;
4181		break;
4182	    case 4: /* xmlcharrefreplace */
4183		/* generate replacement (temporarily (mis)uses p) */
4184		for (p = collstart; p < collend; ++p)
4185		    output += sprintf(output, "&#%d;", (int)*p);
4186		p = collend;
4187		break;
4188	    default:
4189		repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4190		    encoding, reason, s, length, &exc,
4191		    collstart-s, collend-s, &newpos);
4192		if (repunicode == NULL)
4193		    goto onError;
4194		/* generate replacement  */
4195		repsize = PyUnicode_GET_SIZE(repunicode);
4196		for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4197		    Py_UNICODE ch = *uni2;
4198		    if (Py_UNICODE_ISSPACE(ch))
4199			*output++ = ' ';
4200		    else {
4201			decimal = Py_UNICODE_TODECIMAL(ch);
4202			if (decimal >= 0)
4203			    *output++ = '0' + decimal;
4204			else if (0 < ch && ch < 256)
4205			    *output++ = (char)ch;
4206			else {
4207			    Py_DECREF(repunicode);
4208			    raise_encode_exception(&exc, encoding,
4209				s, length, collstart-s, collend-s, reason);
4210			    goto onError;
4211			}
4212		    }
4213		}
4214		p = s + newpos;
4215		Py_DECREF(repunicode);
4216	}
4217    }
4218    /* 0-terminate the output string */
4219    *output++ = '\0';
4220    Py_XDECREF(exc);
4221    Py_XDECREF(errorHandler);
4222    return 0;
4223
4224 onError:
4225    Py_XDECREF(exc);
4226    Py_XDECREF(errorHandler);
4227    return -1;
4228}
4229
4230/* --- Helpers ------------------------------------------------------------ */
4231
4232#define STRINGLIB_CHAR Py_UNICODE
4233
4234#define STRINGLIB_LEN PyUnicode_GET_SIZE
4235#define STRINGLIB_NEW PyUnicode_FromUnicode
4236#define STRINGLIB_STR PyUnicode_AS_UNICODE
4237
4238Py_LOCAL_INLINE(int)
4239STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
4240{
4241    if (str[0] != other[0])
4242        return 1;
4243    return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
4244}
4245
4246#define STRINGLIB_EMPTY unicode_empty
4247
4248#include "stringlib/fastsearch.h"
4249
4250#include "stringlib/count.h"
4251#include "stringlib/find.h"
4252#include "stringlib/partition.h"
4253
/* Slice-bound fixup.  Operates on the caller's local variables `start`
   and `end`, using (obj)->length as the sequence length:
     - a negative start is offset by the length, then floored at 0;
     - an end beyond the length is clamped to the length;
     - a negative end is offset by the length, then floored at 0.
   `start` is not clamped against the length.
   This expands to a plain sequence of statements (no enclosing block), so
   invoke it only where several statements are allowed. */
#define FIX_START_END(obj)                      \
    if (start < 0) {                            \
        start += (obj)->length;                 \
        if (start < 0)                          \
            start = 0;                          \
    }                                           \
    if (end > (obj)->length)                    \
        end = (obj)->length;                    \
    if (end < 0) {                              \
        end += (obj)->length;                   \
        if (end < 0)                            \
            end = 0;                            \
    }
4266
4267Py_ssize_t PyUnicode_Count(PyObject *str,
4268                           PyObject *substr,
4269                           Py_ssize_t start,
4270                           Py_ssize_t end)
4271{
4272    Py_ssize_t result;
4273    PyUnicodeObject* str_obj;
4274    PyUnicodeObject* sub_obj;
4275
4276    str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4277    if (!str_obj)
4278	return -1;
4279    sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4280    if (!sub_obj) {
4281	Py_DECREF(str_obj);
4282	return -1;
4283    }
4284
4285    FIX_START_END(str_obj);
4286
4287    result = stringlib_count(
4288        str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4289        );
4290
4291    Py_DECREF(sub_obj);
4292    Py_DECREF(str_obj);
4293
4294    return result;
4295}
4296
4297Py_ssize_t PyUnicode_Find(PyObject *str,
4298                          PyObject *sub,
4299                          Py_ssize_t start,
4300                          Py_ssize_t end,
4301                          int direction)
4302{
4303    Py_ssize_t result;
4304
4305    str = PyUnicode_FromObject(str);
4306    if (!str)
4307	return -2;
4308    sub = PyUnicode_FromObject(sub);
4309    if (!sub) {
4310	Py_DECREF(str);
4311	return -2;
4312    }
4313
4314    if (direction > 0)
4315        result = stringlib_find_slice(
4316            PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4317            PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4318            start, end
4319            );
4320    else
4321        result = stringlib_rfind_slice(
4322            PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4323            PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4324            start, end
4325            );
4326
4327    Py_DECREF(str);
4328    Py_DECREF(sub);
4329
4330    return result;
4331}
4332
4333static
4334int tailmatch(PyUnicodeObject *self,
4335	      PyUnicodeObject *substring,
4336	      Py_ssize_t start,
4337	      Py_ssize_t end,
4338	      int direction)
4339{
4340    if (substring->length == 0)
4341        return 1;
4342
4343    FIX_START_END(self);
4344
4345    end -= substring->length;
4346    if (end < start)
4347	return 0;
4348
4349    if (direction > 0) {
4350	if (Py_UNICODE_MATCH(self, end, substring))
4351	    return 1;
4352    } else {
4353        if (Py_UNICODE_MATCH(self, start, substring))
4354	    return 1;
4355    }
4356
4357    return 0;
4358}
4359
4360Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
4361			PyObject *substr,
4362			Py_ssize_t start,
4363			Py_ssize_t end,
4364			int direction)
4365{
4366    Py_ssize_t result;
4367
4368    str = PyUnicode_FromObject(str);
4369    if (str == NULL)
4370	return -1;
4371    substr = PyUnicode_FromObject(substr);
4372    if (substr == NULL) {
4373	Py_DECREF(str);
4374	return -1;
4375    }
4376
4377    result = tailmatch((PyUnicodeObject *)str,
4378		       (PyUnicodeObject *)substr,
4379		       start, end, direction);
4380    Py_DECREF(str);
4381    Py_DECREF(substr);
4382    return result;
4383}
4384
4385/* Apply fixfct filter to the Unicode object self and return a
4386   reference to the modified object */
4387
4388static
4389PyObject *fixup(PyUnicodeObject *self,
4390		int (*fixfct)(PyUnicodeObject *s))
4391{
4392
4393    PyUnicodeObject *u;
4394
4395    u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
4396    if (u == NULL)
4397	return NULL;
4398
4399    Py_UNICODE_COPY(u->str, self->str, self->length);
4400
4401    if (!fixfct(u) && PyUnicode_CheckExact(self)) {
4402	/* fixfct should return TRUE if it modified the buffer. If
4403	   FALSE, return a reference to the original buffer instead
4404	   (to save space, not time) */
4405	Py_INCREF(self);
4406	Py_DECREF(u);
4407	return (PyObject*) self;
4408    }
4409    return (PyObject*) u;
4410}
4411
4412static
4413int fixupper(PyUnicodeObject *self)
4414{
4415    Py_ssize_t len = self->length;
4416    Py_UNICODE *s = self->str;
4417    int status = 0;
4418
4419    while (len-- > 0) {
4420	register Py_UNICODE ch;
4421
4422	ch = Py_UNICODE_TOUPPER(*s);
4423	if (ch != *s) {
4424            status = 1;
4425	    *s = ch;
4426	}
4427        s++;
4428    }
4429
4430    return status;
4431}
4432
4433static
4434int fixlower(PyUnicodeObject *self)
4435{
4436    Py_ssize_t len = self->length;
4437    Py_UNICODE *s = self->str;
4438    int status = 0;
4439
4440    while (len-- > 0) {
4441	register Py_UNICODE ch;
4442
4443	ch = Py_UNICODE_TOLOWER(*s);
4444	if (ch != *s) {
4445            status = 1;
4446	    *s = ch;
4447	}
4448        s++;
4449    }
4450
4451    return status;
4452}
4453
4454static
4455int fixswapcase(PyUnicodeObject *self)
4456{
4457    Py_ssize_t len = self->length;
4458    Py_UNICODE *s = self->str;
4459    int status = 0;
4460
4461    while (len-- > 0) {
4462        if (Py_UNICODE_ISUPPER(*s)) {
4463            *s = Py_UNICODE_TOLOWER(*s);
4464            status = 1;
4465        } else if (Py_UNICODE_ISLOWER(*s)) {
4466            *s = Py_UNICODE_TOUPPER(*s);
4467            status = 1;
4468        }
4469        s++;
4470    }
4471
4472    return status;
4473}
4474
4475static
4476int fixcapitalize(PyUnicodeObject *self)
4477{
4478    Py_ssize_t len = self->length;
4479    Py_UNICODE *s = self->str;
4480    int status = 0;
4481
4482    if (len == 0)
4483	return 0;
4484    if (Py_UNICODE_ISLOWER(*s)) {
4485	*s = Py_UNICODE_TOUPPER(*s);
4486	status = 1;
4487    }
4488    s++;
4489    while (--len > 0) {
4490        if (Py_UNICODE_ISUPPER(*s)) {
4491            *s = Py_UNICODE_TOLOWER(*s);
4492            status = 1;
4493        }
4494        s++;
4495    }
4496    return status;
4497}
4498
4499static
4500int fixtitle(PyUnicodeObject *self)
4501{
4502    register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4503    register Py_UNICODE *e;
4504    int previous_is_cased;
4505
4506    /* Shortcut for single character strings */
4507    if (PyUnicode_GET_SIZE(self) == 1) {
4508	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4509	if (*p != ch) {
4510	    *p = ch;
4511	    return 1;
4512	}
4513	else
4514	    return 0;
4515    }
4516
4517    e = p + PyUnicode_GET_SIZE(self);
4518    previous_is_cased = 0;
4519    for (; p < e; p++) {
4520	register const Py_UNICODE ch = *p;
4521
4522	if (previous_is_cased)
4523	    *p = Py_UNICODE_TOLOWER(ch);
4524	else
4525	    *p = Py_UNICODE_TOTITLE(ch);
4526
4527	if (Py_UNICODE_ISLOWER(ch) ||
4528	    Py_UNICODE_ISUPPER(ch) ||
4529	    Py_UNICODE_ISTITLE(ch))
4530	    previous_is_cased = 1;
4531	else
4532	    previous_is_cased = 0;
4533    }
4534    return 1;
4535}
4536
4537PyObject *
4538PyUnicode_Join(PyObject *separator, PyObject *seq)
4539{
4540    PyObject *internal_separator = NULL;
4541    const Py_UNICODE blank = ' ';
4542    const Py_UNICODE *sep = &blank;
4543    Py_ssize_t seplen = 1;
4544    PyUnicodeObject *res = NULL; /* the result */
4545    Py_ssize_t res_alloc = 100;  /* # allocated bytes for string in res */
4546    Py_ssize_t res_used;         /* # used bytes */
4547    Py_UNICODE *res_p;       /* pointer to free byte in res's string area */
4548    PyObject *fseq;          /* PySequence_Fast(seq) */
4549    Py_ssize_t seqlen;              /* len(fseq) -- number of items in sequence */
4550    PyObject *item;
4551    Py_ssize_t i;
4552
4553    fseq = PySequence_Fast(seq, "");
4554    if (fseq == NULL) {
4555    	return NULL;
4556    }
4557
4558    /* Grrrr.  A codec may be invoked to convert str objects to
4559     * Unicode, and so it's possible to call back into Python code
4560     * during PyUnicode_FromObject(), and so it's possible for a sick
4561     * codec to change the size of fseq (if seq is a list).  Therefore
4562     * we have to keep refetching the size -- can't assume seqlen
4563     * is invariant.
4564     */
4565    seqlen = PySequence_Fast_GET_SIZE(fseq);
4566    /* If empty sequence, return u"". */
4567    if (seqlen == 0) {
4568    	res = _PyUnicode_New(0);  /* empty sequence; return u"" */
4569    	goto Done;
4570    }
4571    /* If singleton sequence with an exact Unicode, return that. */
4572    if (seqlen == 1) {
4573	item = PySequence_Fast_GET_ITEM(fseq, 0);
4574	if (PyUnicode_CheckExact(item)) {
4575	    Py_INCREF(item);
4576	    res = (PyUnicodeObject *)item;
4577	    goto Done;
4578	}
4579    }
4580
4581    /* At least two items to join, or one that isn't exact Unicode. */
4582    if (seqlen > 1) {
4583        /* Set up sep and seplen -- they're needed. */
4584    	if (separator == NULL) {
4585	    sep = &blank;
4586	    seplen = 1;
4587        }
4588    	else {
4589	    internal_separator = PyUnicode_FromObject(separator);
4590	    if (internal_separator == NULL)
4591	        goto onError;
4592	    sep = PyUnicode_AS_UNICODE(internal_separator);
4593	    seplen = PyUnicode_GET_SIZE(internal_separator);
4594	    /* In case PyUnicode_FromObject() mutated seq. */
4595	    seqlen = PySequence_Fast_GET_SIZE(fseq);
4596        }
4597    }
4598
4599    /* Get space. */
4600    res = _PyUnicode_New(res_alloc);
4601    if (res == NULL)
4602        goto onError;
4603    res_p = PyUnicode_AS_UNICODE(res);
4604    res_used = 0;
4605
4606    for (i = 0; i < seqlen; ++i) {
4607	Py_ssize_t itemlen;
4608	Py_ssize_t new_res_used;
4609
4610	item = PySequence_Fast_GET_ITEM(fseq, i);
4611	/* Convert item to Unicode. */
4612	if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4613	    PyErr_Format(PyExc_TypeError,
4614			 "sequence item %zd: expected string or Unicode,"
4615			 " %.80s found",
4616			 i, item->ob_type->tp_name);
4617	    goto onError;
4618	}
4619	item = PyUnicode_FromObject(item);
4620	if (item == NULL)
4621	    goto onError;
4622	/* We own a reference to item from here on. */
4623
4624	/* In case PyUnicode_FromObject() mutated seq. */
4625	seqlen = PySequence_Fast_GET_SIZE(fseq);
4626
4627        /* Make sure we have enough space for the separator and the item. */
4628	itemlen = PyUnicode_GET_SIZE(item);
4629	new_res_used = res_used + itemlen;
4630	if (new_res_used < 0)
4631	    goto Overflow;
4632	if (i < seqlen - 1) {
4633	    new_res_used += seplen;
4634	    if (new_res_used < 0)
4635		goto Overflow;
4636	}
4637	if (new_res_used > res_alloc) {
4638	    /* double allocated size until it's big enough */
4639	    do {
4640	        res_alloc += res_alloc;
4641	        if (res_alloc <= 0)
4642	            goto Overflow;
4643	    } while (new_res_used > res_alloc);
4644	    if (_PyUnicode_Resize(&res, res_alloc) < 0) {
4645		Py_DECREF(item);
4646		goto onError;
4647	    }
4648            res_p = PyUnicode_AS_UNICODE(res) + res_used;
4649	}
4650
4651	/* Copy item, and maybe the separator. */
4652	Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
4653	res_p += itemlen;
4654	if (i < seqlen - 1) {
4655	    Py_UNICODE_COPY(res_p, sep, seplen);
4656	    res_p += seplen;
4657	}
4658	Py_DECREF(item);
4659	res_used = new_res_used;
4660    }
4661
4662    /* Shrink res to match the used area; this probably can't fail,
4663     * but it's cheap to check.
4664     */
4665    if (_PyUnicode_Resize(&res, res_used) < 0)
4666	goto onError;
4667
4668 Done:
4669    Py_XDECREF(internal_separator);
4670    Py_DECREF(fseq);
4671    return (PyObject *)res;
4672
4673 Overflow:
4674    PyErr_SetString(PyExc_OverflowError,
4675                    "join() result is too long for a Python string");
4676    Py_DECREF(item);
4677    /* fall through */
4678
4679 onError:
4680    Py_XDECREF(internal_separator);
4681    Py_DECREF(fseq);
4682    Py_XDECREF(res);
4683    return NULL;
4684}
4685
4686static
4687PyUnicodeObject *pad(PyUnicodeObject *self,
4688		     Py_ssize_t left,
4689		     Py_ssize_t right,
4690		     Py_UNICODE fill)
4691{
4692    PyUnicodeObject *u;
4693
4694    if (left < 0)
4695        left = 0;
4696    if (right < 0)
4697        right = 0;
4698
4699    if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
4700        Py_INCREF(self);
4701        return self;
4702    }
4703
4704    u = _PyUnicode_New(left + self->length + right);
4705    if (u) {
4706        if (left)
4707            Py_UNICODE_FILL(u->str, fill, left);
4708        Py_UNICODE_COPY(u->str + left, self->str, self->length);
4709        if (right)
4710            Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4711    }
4712
4713    return u;
4714}
4715
/* Append the slice data[left:right] to `list` as a new unicode object.

   Relies on names supplied by the enclosing function: a PyObject *str
   scratch variable, the PyObject *list being filled, and an onError label
   that is jumped to when object creation or the append fails.  The
   temporary object is released after the append in both outcomes.

   Wrapped in do { } while (0) so that the expansion is a single statement
   and is safe in unbraced if/else bodies; invoke it with a trailing ';'. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        int split_append_status_;                                       \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        split_append_status_ = PyList_Append(list, str);                \
        Py_DECREF(str);                                                 \
        if (split_append_status_)                                       \
            goto onError;                                               \
    } while (0)
4726
4727static
4728PyObject *split_whitespace(PyUnicodeObject *self,
4729			   PyObject *list,
4730			   Py_ssize_t maxcount)
4731{
4732    register Py_ssize_t i;
4733    register Py_ssize_t j;
4734    Py_ssize_t len = self->length;
4735    PyObject *str;
4736
4737    for (i = j = 0; i < len; ) {
4738	/* find a token */
4739	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4740	    i++;
4741	j = i;
4742	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4743	    i++;
4744	if (j < i) {
4745	    if (maxcount-- <= 0)
4746		break;
4747	    SPLIT_APPEND(self->str, j, i);
4748	    while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4749		i++;
4750	    j = i;
4751	}
4752    }
4753    if (j < len) {
4754	SPLIT_APPEND(self->str, j, len);
4755    }
4756    return list;
4757
4758 onError:
4759    Py_DECREF(list);
4760    return NULL;
4761}
4762
4763PyObject *PyUnicode_Splitlines(PyObject *string,
4764			       int keepends)
4765{
4766    register Py_ssize_t i;
4767    register Py_ssize_t j;
4768    Py_ssize_t len;
4769    PyObject *list;
4770    PyObject *str;
4771    Py_UNICODE *data;
4772
4773    string = PyUnicode_FromObject(string);
4774    if (string == NULL)
4775	return NULL;
4776    data = PyUnicode_AS_UNICODE(string);
4777    len = PyUnicode_GET_SIZE(string);
4778
4779    list = PyList_New(0);
4780    if (!list)
4781        goto onError;
4782
4783    for (i = j = 0; i < len; ) {
4784	Py_ssize_t eol;
4785
4786	/* Find a line and append it */
4787	while (i < len && !BLOOM_LINEBREAK(data[i]))
4788	    i++;
4789
4790	/* Skip the line break reading CRLF as one line break */
4791	eol = i;
4792	if (i < len) {
4793	    if (data[i] == '\r' && i + 1 < len &&
4794		data[i+1] == '\n')
4795		i += 2;
4796	    else
4797		i++;
4798	    if (keepends)
4799		eol = i;
4800	}
4801	SPLIT_APPEND(data, j, eol);
4802	j = i;
4803    }
4804    if (j < len) {
4805	SPLIT_APPEND(data, j, len);
4806    }
4807
4808    Py_DECREF(string);
4809    return list;
4810
4811 onError:
4812    Py_XDECREF(list);
4813    Py_DECREF(string);
4814    return NULL;
4815}
4816
4817static
4818PyObject *split_char(PyUnicodeObject *self,
4819		     PyObject *list,
4820		     Py_UNICODE ch,
4821		     Py_ssize_t maxcount)
4822{
4823    register Py_ssize_t i;
4824    register Py_ssize_t j;
4825    Py_ssize_t len = self->length;
4826    PyObject *str;
4827
4828    for (i = j = 0; i < len; ) {
4829	if (self->str[i] == ch) {
4830	    if (maxcount-- <= 0)
4831		break;
4832	    SPLIT_APPEND(self->str, j, i);
4833	    i = j = i + 1;
4834	} else
4835	    i++;
4836    }
4837    if (j <= len) {
4838	SPLIT_APPEND(self->str, j, len);
4839    }
4840    return list;
4841
4842 onError:
4843    Py_DECREF(list);
4844    return NULL;
4845}
4846
4847static
4848PyObject *split_substring(PyUnicodeObject *self,
4849			  PyObject *list,
4850			  PyUnicodeObject *substring,
4851			  Py_ssize_t maxcount)
4852{
4853    register Py_ssize_t i;
4854    register Py_ssize_t j;
4855    Py_ssize_t len = self->length;
4856    Py_ssize_t sublen = substring->length;
4857    PyObject *str;
4858
4859    for (i = j = 0; i <= len - sublen; ) {
4860	if (Py_UNICODE_MATCH(self, i, substring)) {
4861	    if (maxcount-- <= 0)
4862		break;
4863	    SPLIT_APPEND(self->str, j, i);
4864	    i = j = i + sublen;
4865	} else
4866	    i++;
4867    }
4868    if (j <= len) {
4869	SPLIT_APPEND(self->str, j, len);
4870    }
4871    return list;
4872
4873 onError:
4874    Py_DECREF(list);
4875    return NULL;
4876}
4877
4878static
4879PyObject *rsplit_whitespace(PyUnicodeObject *self,
4880			    PyObject *list,
4881			    Py_ssize_t maxcount)
4882{
4883    register Py_ssize_t i;
4884    register Py_ssize_t j;
4885    Py_ssize_t len = self->length;
4886    PyObject *str;
4887
4888    for (i = j = len - 1; i >= 0; ) {
4889	/* find a token */
4890	while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4891	    i--;
4892	j = i;
4893	while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4894	    i--;
4895	if (j > i) {
4896	    if (maxcount-- <= 0)
4897		break;
4898	    SPLIT_APPEND(self->str, i + 1, j + 1);
4899	    while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4900		i--;
4901	    j = i;
4902	}
4903    }
4904    if (j >= 0) {
4905	SPLIT_APPEND(self->str, 0, j + 1);
4906    }
4907    if (PyList_Reverse(list) < 0)
4908        goto onError;
4909    return list;
4910
4911 onError:
4912    Py_DECREF(list);
4913    return NULL;
4914}
4915
4916static
4917PyObject *rsplit_char(PyUnicodeObject *self,
4918		      PyObject *list,
4919		      Py_UNICODE ch,
4920		      Py_ssize_t maxcount)
4921{
4922    register Py_ssize_t i;
4923    register Py_ssize_t j;
4924    Py_ssize_t len = self->length;
4925    PyObject *str;
4926
4927    for (i = j = len - 1; i >= 0; ) {
4928	if (self->str[i] == ch) {
4929	    if (maxcount-- <= 0)
4930		break;
4931	    SPLIT_APPEND(self->str, i + 1, j + 1);
4932	    j = i = i - 1;
4933	} else
4934	    i--;
4935    }
4936    if (j >= -1) {
4937	SPLIT_APPEND(self->str, 0, j + 1);
4938    }
4939    if (PyList_Reverse(list) < 0)
4940        goto onError;
4941    return list;
4942
4943 onError:
4944    Py_DECREF(list);
4945    return NULL;
4946}
4947
4948static
4949PyObject *rsplit_substring(PyUnicodeObject *self,
4950			   PyObject *list,
4951			   PyUnicodeObject *substring,
4952			   Py_ssize_t maxcount)
4953{
4954    register Py_ssize_t i;
4955    register Py_ssize_t j;
4956    Py_ssize_t len = self->length;
4957    Py_ssize_t sublen = substring->length;
4958    PyObject *str;
4959
4960    for (i = len - sublen, j = len; i >= 0; ) {
4961	if (Py_UNICODE_MATCH(self, i, substring)) {
4962	    if (maxcount-- <= 0)
4963		break;
4964	    SPLIT_APPEND(self->str, i + sublen, j);
4965	    j = i;
4966	    i -= sublen;
4967	} else
4968	    i--;
4969    }
4970    if (j >= 0) {
4971	SPLIT_APPEND(self->str, 0, j);
4972    }
4973    if (PyList_Reverse(list) < 0)
4974        goto onError;
4975    return list;
4976
4977 onError:
4978    Py_DECREF(list);
4979    return NULL;
4980}
4981
4982#undef SPLIT_APPEND
4983
4984static
4985PyObject *split(PyUnicodeObject *self,
4986		PyUnicodeObject *substring,
4987		Py_ssize_t maxcount)
4988{
4989    PyObject *list;
4990
4991    if (maxcount < 0)
4992        maxcount = PY_SSIZE_T_MAX;
4993
4994    list = PyList_New(0);
4995    if (!list)
4996        return NULL;
4997
4998    if (substring == NULL)
4999	return split_whitespace(self,list,maxcount);
5000
5001    else if (substring->length == 1)
5002	return split_char(self,list,substring->str[0],maxcount);
5003
5004    else if (substring->length == 0) {
5005	Py_DECREF(list);
5006	PyErr_SetString(PyExc_ValueError, "empty separator");
5007	return NULL;
5008    }
5009    else
5010	return split_substring(self,list,substring,maxcount);
5011}
5012
5013static
5014PyObject *rsplit(PyUnicodeObject *self,
5015		 PyUnicodeObject *substring,
5016		 Py_ssize_t maxcount)
5017{
5018    PyObject *list;
5019
5020    if (maxcount < 0)
5021        maxcount = PY_SSIZE_T_MAX;
5022
5023    list = PyList_New(0);
5024    if (!list)
5025        return NULL;
5026
5027    if (substring == NULL)
5028	return rsplit_whitespace(self,list,maxcount);
5029
5030    else if (substring->length == 1)
5031	return rsplit_char(self,list,substring->str[0],maxcount);
5032
5033    else if (substring->length == 0) {
5034	Py_DECREF(list);
5035	PyErr_SetString(PyExc_ValueError, "empty separator");
5036	return NULL;
5037    }
5038    else
5039	return rsplit_substring(self,list,substring,maxcount);
5040}
5041
5042static
5043PyObject *replace(PyUnicodeObject *self,
5044		  PyUnicodeObject *str1,
5045		  PyUnicodeObject *str2,
5046		  Py_ssize_t maxcount)
5047{
5048    PyUnicodeObject *u;
5049
5050    if (maxcount < 0)
5051	maxcount = PY_SSIZE_T_MAX;
5052
5053    if (str1->length == str2->length) {
5054        /* same length */
5055        Py_ssize_t i;
5056        if (str1->length == 1) {
5057            /* replace characters */
5058            Py_UNICODE u1, u2;
5059            if (!findchar(self->str, self->length, str1->str[0]))
5060                goto nothing;
5061            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5062            if (!u)
5063                return NULL;
5064            Py_UNICODE_COPY(u->str, self->str, self->length);
5065            u1 = str1->str[0];
5066            u2 = str2->str[0];
5067            for (i = 0; i < u->length; i++)
5068                if (u->str[i] == u1) {
5069                    if (--maxcount < 0)
5070                        break;
5071                    u->str[i] = u2;
5072                }
5073        } else {
5074            i = fastsearch(
5075                self->str, self->length, str1->str, str1->length, FAST_SEARCH
5076                );
5077            if (i < 0)
5078                goto nothing;
5079            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5080            if (!u)
5081                return NULL;
5082            Py_UNICODE_COPY(u->str, self->str, self->length);
5083            while (i <= self->length - str1->length)
5084                if (Py_UNICODE_MATCH(self, i, str1)) {
5085                    if (--maxcount < 0)
5086                        break;
5087                    Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5088                    i += str1->length;
5089                } else
5090                    i++;
5091        }
5092    } else {
5093
5094        Py_ssize_t n, i, j, e;
5095        Py_ssize_t product, new_size, delta;
5096        Py_UNICODE *p;
5097
5098        /* replace strings */
5099        n = stringlib_count(self->str, self->length, str1->str, str1->length);
5100        if (n > maxcount)
5101            n = maxcount;
5102        if (n == 0)
5103            goto nothing;
5104        /* new_size = self->length + n * (str2->length - str1->length)); */
5105        delta = (str2->length - str1->length);
5106        if (delta == 0) {
5107            new_size = self->length;
5108        } else {
5109            product = n * (str2->length - str1->length);
5110            if ((product / (str2->length - str1->length)) != n) {
5111                PyErr_SetString(PyExc_OverflowError,
5112                                "replace string is too long");
5113                return NULL;
5114            }
5115            new_size = self->length + product;
5116            if (new_size < 0) {
5117                PyErr_SetString(PyExc_OverflowError,
5118                                "replace string is too long");
5119                return NULL;
5120            }
5121        }
5122        u = _PyUnicode_New(new_size);
5123        if (!u)
5124            return NULL;
5125        i = 0;
5126        p = u->str;
5127        e = self->length - str1->length;
5128        if (str1->length > 0) {
5129            while (n-- > 0) {
5130                /* look for next match */
5131                j = i;
5132                while (j <= e) {
5133                    if (Py_UNICODE_MATCH(self, j, str1))
5134                        break;
5135                    j++;
5136                }
5137		if (j > i) {
5138                    if (j > e)
5139                        break;
5140                    /* copy unchanged part [i:j] */
5141                    Py_UNICODE_COPY(p, self->str+i, j-i);
5142                    p += j - i;
5143                }
5144                /* copy substitution string */
5145                if (str2->length > 0) {
5146                    Py_UNICODE_COPY(p, str2->str, str2->length);
5147                    p += str2->length;
5148                }
5149                i = j + str1->length;
5150            }
5151            if (i < self->length)
5152                /* copy tail [i:] */
5153                Py_UNICODE_COPY(p, self->str+i, self->length-i);
5154        } else {
5155            /* interleave */
5156            while (n > 0) {
5157                Py_UNICODE_COPY(p, str2->str, str2->length);
5158                p += str2->length;
5159                if (--n <= 0)
5160                    break;
5161                *p++ = self->str[i++];
5162            }
5163            Py_UNICODE_COPY(p, self->str+i, self->length-i);
5164        }
5165    }
5166    return (PyObject *) u;
5167
5168nothing:
5169    /* nothing to replace; return original string (when possible) */
5170    if (PyUnicode_CheckExact(self)) {
5171        Py_INCREF(self);
5172        return (PyObject *) self;
5173    }
5174    return PyUnicode_FromUnicode(self->str, self->length);
5175}
5176
5177/* --- Unicode Object Methods --------------------------------------------- */
5178
5179PyDoc_STRVAR(title__doc__,
5180"S.title() -> unicode\n\
5181\n\
5182Return a titlecased version of S, i.e. words start with title case\n\
5183characters, all remaining cased characters have lower case.");
5184
5185static PyObject*
5186unicode_title(PyUnicodeObject *self)
5187{
5188    return fixup(self, fixtitle);
5189}
5190
5191PyDoc_STRVAR(capitalize__doc__,
5192"S.capitalize() -> unicode\n\
5193\n\
5194Return a capitalized version of S, i.e. make the first character\n\
5195have upper case.");
5196
5197static PyObject*
5198unicode_capitalize(PyUnicodeObject *self)
5199{
5200    return fixup(self, fixcapitalize);
5201}
5202
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Method implementation for S.capwords() (currently compiled out).

   Splits self with split(self, NULL, -1), replaces every list item by the
   result of fixup(item, fixcapitalize) and joins the list again with
   PyUnicode_Join(NULL, list).  Returns NULL as soon as any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t pos = 0;

    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    while (pos < PyList_GET_SIZE(words)) {
        PyObject *old_word = PyList_GET_ITEM(words, pos);
        PyObject *new_word = fixup((PyUnicodeObject *)old_word,
                                   fixcapitalize);

        if (new_word == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* Drop the list's reference to the old word before the slot is
           overwritten; PyList_SET_ITEM does not do that itself. */
        Py_DECREF(old_word);
        PyList_SET_ITEM(words, pos, new_word);
        pos++;
    }

    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return joined;
}
#endif
5240
5241/* Argument converter.  Coerces to a single unicode character */
5242
5243static int
5244convert_uc(PyObject *obj, void *addr)
5245{
5246	Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5247	PyObject *uniobj;
5248	Py_UNICODE *unistr;
5249
5250	uniobj = PyUnicode_FromObject(obj);
5251	if (uniobj == NULL) {
5252		PyErr_SetString(PyExc_TypeError,
5253			"The fill character cannot be converted to Unicode");
5254		return 0;
5255	}
5256	if (PyUnicode_GET_SIZE(uniobj) != 1) {
5257		PyErr_SetString(PyExc_TypeError,
5258			"The fill character must be exactly one character long");
5259		Py_DECREF(uniobj);
5260		return 0;
5261	}
5262	unistr = PyUnicode_AS_UNICODE(uniobj);
5263	*fillcharloc = unistr[0];
5264	Py_DECREF(uniobj);
5265	return 1;
5266}
5267
5268PyDoc_STRVAR(center__doc__,
5269"S.center(width[, fillchar]) -> unicode\n\
5270\n\
5271Return S centered in a Unicode string of length width. Padding is\n\
5272done using the specified fill character (default is a space)");
5273
5274static PyObject *
5275unicode_center(PyUnicodeObject *self, PyObject *args)
5276{
5277    Py_ssize_t marg, left;
5278    Py_ssize_t width;
5279    Py_UNICODE fillchar = ' ';
5280
5281    if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
5282        return NULL;
5283
5284    if (self->length >= width && PyUnicode_CheckExact(self)) {
5285        Py_INCREF(self);
5286        return (PyObject*) self;
5287    }
5288
5289    marg = width - self->length;
5290    left = marg / 2 + (marg & width & 1);
5291
5292    return (PyObject*) pad(self, left, marg - left, fillchar);
5293}
5294
5295#if 0
5296
5297/* This code should go into some future Unicode collation support
5298   module. The basic comparison should compare ordinals on a naive
5299   basis (this is what Java does and thus JPython too). */
5300
5301/* speedy UTF-16 code point order comparison */
5302/* gleaned from: */
5303/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5304
/* Per-range adjustment table, indexed by the top five bits (c >> 11) of a
   16-bit code unit.  Entry 27 (0xD800-0xDFFF) adds 0x2000 and entries
   28-31 (0xE000-0xFFFF) subtract 0x800, so after the fixup the
   0xD800-0xDFFF range sorts above 0xE000-0xFFFF.
   NOTE(review): indexing with c >> 11 only stays inside the 32 entries
   when Py_UNICODE is 16 bits wide -- confirm before re-enabling. */
static short utf16Fixup[32] =
{
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
};

/* Three-way comparison of str1 and str2 (disabled variant).

   Walks both buffers in parallel, applies the utf16Fixup adjustment to
   every code unit above 0xD000 and returns -1 or 1 at the first
   difference.  If one string is a prefix of the other the shorter one
   compares smaller; equal strings give 0. */
static int
unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
{
    Py_ssize_t len1, len2;

    Py_UNICODE *s1 = str1->str;
    Py_UNICODE *s2 = str2->str;

    len1 = str1->length;
    len2 = str2->length;

    while (len1 > 0 && len2 > 0) {
        Py_UNICODE c1, c2;

        c1 = *s1++;
        c2 = *s2++;

	/* (1<<11) * 26 == 0xD000: lower values always have a zero fixup */
	if (c1 > (1<<11) * 26)
	    c1 += utf16Fixup[c1>>11];
	if (c2 > (1<<11) * 26)
            c2 += utf16Fixup[c2>>11];
        /* now c1 and c2 are in UTF-32-compatible order */

        if (c1 != c2)
            return (c1 < c2) ? -1 : 1;

        len1--; len2--;
    }

    /* common prefix exhausted: decide on the remaining lengths */
    return (len1 < len2) ? -1 : (len1 != len2);
}
5344
5345#else
5346
5347static int
5348unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5349{
5350    register Py_ssize_t len1, len2;
5351
5352    Py_UNICODE *s1 = str1->str;
5353    Py_UNICODE *s2 = str2->str;
5354
5355    len1 = str1->length;
5356    len2 = str2->length;
5357
5358    while (len1 > 0 && len2 > 0) {
5359        Py_UNICODE c1, c2;
5360
5361        c1 = *s1++;
5362        c2 = *s2++;
5363
5364        if (c1 != c2)
5365            return (c1 < c2) ? -1 : 1;
5366
5367        len1--; len2--;
5368    }
5369
5370    return (len1 < len2) ? -1 : (len1 != len2);
5371}
5372
5373#endif
5374
5375int PyUnicode_Compare(PyObject *left,
5376		      PyObject *right)
5377{
5378    PyUnicodeObject *u = NULL, *v = NULL;
5379    int result;
5380
5381    /* Coerce the two arguments */
5382    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5383    if (u == NULL)
5384	goto onError;
5385    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5386    if (v == NULL)
5387	goto onError;
5388
5389    /* Shortcut for empty or interned objects */
5390    if (v == u) {
5391	Py_DECREF(u);
5392	Py_DECREF(v);
5393	return 0;
5394    }
5395
5396    result = unicode_compare(u, v);
5397
5398    Py_DECREF(u);
5399    Py_DECREF(v);
5400    return result;
5401
5402onError:
5403    Py_XDECREF(u);
5404    Py_XDECREF(v);
5405    return -1;
5406}
5407
5408int PyUnicode_Contains(PyObject *container,
5409		       PyObject *element)
5410{
5411    PyObject *str, *sub;
5412    int result;
5413
5414    /* Coerce the two arguments */
5415    sub = PyUnicode_FromObject(element);
5416    if (!sub) {
5417	PyErr_SetString(PyExc_TypeError,
5418	    "'in <string>' requires string as left operand");
5419        return -1;
5420    }
5421
5422    str = PyUnicode_FromObject(container);
5423    if (!str) {
5424        Py_DECREF(sub);
5425        return -1;
5426    }
5427
5428    result = stringlib_contains_obj(str, sub);
5429
5430    Py_DECREF(str);
5431    Py_DECREF(sub);
5432
5433    return result;
5434}
5435
5436/* Concat to string or Unicode object giving a new Unicode object. */
5437
5438PyObject *PyUnicode_Concat(PyObject *left,
5439			   PyObject *right)
5440{
5441    PyUnicodeObject *u = NULL, *v = NULL, *w;
5442
5443    /* Coerce the two arguments */
5444    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5445    if (u == NULL)
5446	goto onError;
5447    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5448    if (v == NULL)
5449	goto onError;
5450
5451    /* Shortcuts */
5452    if (v == unicode_empty) {
5453	Py_DECREF(v);
5454	return (PyObject *)u;
5455    }
5456    if (u == unicode_empty) {
5457	Py_DECREF(u);
5458	return (PyObject *)v;
5459    }
5460
5461    /* Concat the two Unicode strings */
5462    w = _PyUnicode_New(u->length + v->length);
5463    if (w == NULL)
5464	goto onError;
5465    Py_UNICODE_COPY(w->str, u->str, u->length);
5466    Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5467
5468    Py_DECREF(u);
5469    Py_DECREF(v);
5470    return (PyObject *)w;
5471
5472onError:
5473    Py_XDECREF(u);
5474    Py_XDECREF(v);
5475    return NULL;
5476}
5477
5478PyDoc_STRVAR(count__doc__,
5479"S.count(sub[, start[, end]]) -> int\n\
5480\n\
5481Return the number of non-overlapping occurrences of substring sub in\n\
5482Unicode string S[start:end].  Optional arguments start and end are\n\
5483interpreted as in slice notation.");
5484
5485static PyObject *
5486unicode_count(PyUnicodeObject *self, PyObject *args)
5487{
5488    PyUnicodeObject *substring;
5489    Py_ssize_t start = 0;
5490    Py_ssize_t end = PY_SSIZE_T_MAX;
5491    PyObject *result;
5492
5493    if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5494		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5495        return NULL;
5496
5497    substring = (PyUnicodeObject *)PyUnicode_FromObject(
5498        (PyObject *)substring);
5499    if (substring == NULL)
5500	return NULL;
5501
5502    FIX_START_END(self);
5503
5504    result = PyInt_FromSsize_t(
5505        stringlib_count(self->str + start, end - start,
5506                        substring->str, substring->length)
5507        );
5508
5509    Py_DECREF(substring);
5510
5511    return result;
5512}
5513
5514PyDoc_STRVAR(encode__doc__,
5515"S.encode([encoding[,errors]]) -> string or unicode\n\
5516\n\
5517Encodes S using the codec registered for encoding. encoding defaults\n\
5518to the default encoding. errors may be given to set a different error\n\
5519handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5520a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5521'xmlcharrefreplace' as well as any other name registered with\n\
5522codecs.register_error that can handle UnicodeEncodeErrors.");
5523
5524static PyObject *
5525unicode_encode(PyUnicodeObject *self, PyObject *args)
5526{
5527    char *encoding = NULL;
5528    char *errors = NULL;
5529    PyObject *v;
5530
5531    if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5532        return NULL;
5533    v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
5534    if (v == NULL)
5535        goto onError;
5536    if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5537        PyErr_Format(PyExc_TypeError,
5538                     "encoder did not return a string/unicode object "
5539                     "(type=%.400s)",
5540                     v->ob_type->tp_name);
5541        Py_DECREF(v);
5542        return NULL;
5543    }
5544    return v;
5545
5546 onError:
5547    return NULL;
5548}
5549
5550PyDoc_STRVAR(decode__doc__,
5551"S.decode([encoding[,errors]]) -> string or unicode\n\
5552\n\
5553Decodes S using the codec registered for encoding. encoding defaults\n\
5554to the default encoding. errors may be given to set a different error\n\
5555handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5556a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5557as well as any other name registerd with codecs.register_error that is\n\
5558able to handle UnicodeDecodeErrors.");
5559
5560static PyObject *
5561unicode_decode(PyUnicodeObject *self, PyObject *args)
5562{
5563    char *encoding = NULL;
5564    char *errors = NULL;
5565    PyObject *v;
5566
5567    if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5568        return NULL;
5569    v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
5570    if (v == NULL)
5571        goto onError;
5572    if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5573        PyErr_Format(PyExc_TypeError,
5574                     "decoder did not return a string/unicode object "
5575                     "(type=%.400s)",
5576                     v->ob_type->tp_name);
5577        Py_DECREF(v);
5578        return NULL;
5579    }
5580    return v;
5581
5582 onError:
5583    return NULL;
5584}
5585
5586PyDoc_STRVAR(expandtabs__doc__,
5587"S.expandtabs([tabsize]) -> unicode\n\
5588\n\
5589Return a copy of S where all tab characters are expanded using spaces.\n\
5590If tabsize is not given, a tab size of 8 characters is assumed.");
5591
5592static PyObject*
5593unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5594{
5595    Py_UNICODE *e;
5596    Py_UNICODE *p;
5597    Py_UNICODE *q;
5598    Py_ssize_t i, j;
5599    PyUnicodeObject *u;
5600    int tabsize = 8;
5601
5602    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5603	return NULL;
5604
5605    /* First pass: determine size of output string */
5606    i = j = 0;
5607    e = self->str + self->length;
5608    for (p = self->str; p < e; p++)
5609        if (*p == '\t') {
5610	    if (tabsize > 0)
5611		j += tabsize - (j % tabsize);
5612	}
5613        else {
5614            j++;
5615            if (*p == '\n' || *p == '\r') {
5616                i += j;
5617                j = 0;
5618            }
5619        }
5620
5621    /* Second pass: create output string and fill it */
5622    u = _PyUnicode_New(i + j);
5623    if (!u)
5624        return NULL;
5625
5626    j = 0;
5627    q = u->str;
5628
5629    for (p = self->str; p < e; p++)
5630        if (*p == '\t') {
5631	    if (tabsize > 0) {
5632		i = tabsize - (j % tabsize);
5633		j += i;
5634		while (i--)
5635		    *q++ = ' ';
5636	    }
5637	}
5638	else {
5639            j++;
5640	    *q++ = *p;
5641            if (*p == '\n' || *p == '\r')
5642                j = 0;
5643        }
5644
5645    return (PyObject*) u;
5646}
5647
5648PyDoc_STRVAR(find__doc__,
5649"S.find(sub [,start [,end]]) -> int\n\
5650\n\
5651Return the lowest index in S where substring sub is found,\n\
5652such that sub is contained within s[start,end].  Optional\n\
5653arguments start and end are interpreted as in slice notation.\n\
5654\n\
5655Return -1 on failure.");
5656
5657static PyObject *
5658unicode_find(PyUnicodeObject *self, PyObject *args)
5659{
5660    PyObject *substring;
5661    Py_ssize_t start = 0;
5662    Py_ssize_t end = PY_SSIZE_T_MAX;
5663    Py_ssize_t result;
5664
5665    if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5666		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5667        return NULL;
5668    substring = PyUnicode_FromObject(substring);
5669    if (!substring)
5670	return NULL;
5671
5672    result = stringlib_find_slice(
5673        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5674        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5675        start, end
5676        );
5677
5678    Py_DECREF(substring);
5679
5680    return PyInt_FromSsize_t(result);
5681}
5682
5683static PyObject *
5684unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
5685{
5686    if (index < 0 || index >= self->length) {
5687        PyErr_SetString(PyExc_IndexError, "string index out of range");
5688        return NULL;
5689    }
5690
5691    return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5692}
5693
5694static long
5695unicode_hash(PyUnicodeObject *self)
5696{
5697    /* Since Unicode objects compare equal to their ASCII string
5698       counterparts, they should use the individual character values
5699       as basis for their hash value.  This is needed to assure that
5700       strings and Unicode objects behave in the same way as
5701       dictionary keys. */
5702
5703    register Py_ssize_t len;
5704    register Py_UNICODE *p;
5705    register long x;
5706
5707    if (self->hash != -1)
5708	return self->hash;
5709    len = PyUnicode_GET_SIZE(self);
5710    p = PyUnicode_AS_UNICODE(self);
5711    x = *p << 7;
5712    while (--len >= 0)
5713	x = (1000003*x) ^ *p++;
5714    x ^= PyUnicode_GET_SIZE(self);
5715    if (x == -1)
5716	x = -2;
5717    self->hash = x;
5718    return x;
5719}
5720
5721PyDoc_STRVAR(index__doc__,
5722"S.index(sub [,start [,end]]) -> int\n\
5723\n\
5724Like S.find() but raise ValueError when the substring is not found.");
5725
5726static PyObject *
5727unicode_index(PyUnicodeObject *self, PyObject *args)
5728{
5729    Py_ssize_t result;
5730    PyObject *substring;
5731    Py_ssize_t start = 0;
5732    Py_ssize_t end = PY_SSIZE_T_MAX;
5733
5734    if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5735		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5736        return NULL;
5737    substring = PyUnicode_FromObject(substring);
5738    if (!substring)
5739	return NULL;
5740
5741    result = stringlib_find_slice(
5742        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
5743        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
5744        start, end
5745        );
5746
5747    Py_DECREF(substring);
5748
5749    if (result < 0) {
5750        PyErr_SetString(PyExc_ValueError, "substring not found");
5751        return NULL;
5752    }
5753
5754    return PyInt_FromSsize_t(result);
5755}
5756
5757PyDoc_STRVAR(islower__doc__,
5758"S.islower() -> bool\n\
5759\n\
5760Return True if all cased characters in S are lowercase and there is\n\
5761at least one cased character in S, False otherwise.");
5762
5763static PyObject*
5764unicode_islower(PyUnicodeObject *self)
5765{
5766    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5767    register const Py_UNICODE *e;
5768    int cased;
5769
5770    /* Shortcut for single character strings */
5771    if (PyUnicode_GET_SIZE(self) == 1)
5772	return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
5773
5774    /* Special case for empty strings */
5775    if (PyUnicode_GET_SIZE(self) == 0)
5776	return PyBool_FromLong(0);
5777
5778    e = p + PyUnicode_GET_SIZE(self);
5779    cased = 0;
5780    for (; p < e; p++) {
5781	register const Py_UNICODE ch = *p;
5782
5783	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
5784	    return PyBool_FromLong(0);
5785	else if (!cased && Py_UNICODE_ISLOWER(ch))
5786	    cased = 1;
5787    }
5788    return PyBool_FromLong(cased);
5789}
5790
5791PyDoc_STRVAR(isupper__doc__,
5792"S.isupper() -> bool\n\
5793\n\
5794Return True if all cased characters in S are uppercase and there is\n\
5795at least one cased character in S, False otherwise.");
5796
5797static PyObject*
5798unicode_isupper(PyUnicodeObject *self)
5799{
5800    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5801    register const Py_UNICODE *e;
5802    int cased;
5803
5804    /* Shortcut for single character strings */
5805    if (PyUnicode_GET_SIZE(self) == 1)
5806	return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
5807
5808    /* Special case for empty strings */
5809    if (PyUnicode_GET_SIZE(self) == 0)
5810	return PyBool_FromLong(0);
5811
5812    e = p + PyUnicode_GET_SIZE(self);
5813    cased = 0;
5814    for (; p < e; p++) {
5815	register const Py_UNICODE ch = *p;
5816
5817	if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
5818	    return PyBool_FromLong(0);
5819	else if (!cased && Py_UNICODE_ISUPPER(ch))
5820	    cased = 1;
5821    }
5822    return PyBool_FromLong(cased);
5823}
5824
5825PyDoc_STRVAR(istitle__doc__,
5826"S.istitle() -> bool\n\
5827\n\
5828Return True if S is a titlecased string and there is at least one\n\
5829character in S, i.e. upper- and titlecase characters may only\n\
5830follow uncased characters and lowercase characters only cased ones.\n\
5831Return False otherwise.");
5832
5833static PyObject*
5834unicode_istitle(PyUnicodeObject *self)
5835{
5836    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5837    register const Py_UNICODE *e;
5838    int cased, previous_is_cased;
5839
5840    /* Shortcut for single character strings */
5841    if (PyUnicode_GET_SIZE(self) == 1)
5842	return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5843			       (Py_UNICODE_ISUPPER(*p) != 0));
5844
5845    /* Special case for empty strings */
5846    if (PyUnicode_GET_SIZE(self) == 0)
5847	return PyBool_FromLong(0);
5848
5849    e = p + PyUnicode_GET_SIZE(self);
5850    cased = 0;
5851    previous_is_cased = 0;
5852    for (; p < e; p++) {
5853	register const Py_UNICODE ch = *p;
5854
5855	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5856	    if (previous_is_cased)
5857		return PyBool_FromLong(0);
5858	    previous_is_cased = 1;
5859	    cased = 1;
5860	}
5861	else if (Py_UNICODE_ISLOWER(ch)) {
5862	    if (!previous_is_cased)
5863		return PyBool_FromLong(0);
5864	    previous_is_cased = 1;
5865	    cased = 1;
5866	}
5867	else
5868	    previous_is_cased = 0;
5869    }
5870    return PyBool_FromLong(cased);
5871}
5872
5873PyDoc_STRVAR(isspace__doc__,
5874"S.isspace() -> bool\n\
5875\n\
5876Return True if all characters in S are whitespace\n\
5877and there is at least one character in S, False otherwise.");
5878
5879static PyObject*
5880unicode_isspace(PyUnicodeObject *self)
5881{
5882    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5883    register const Py_UNICODE *e;
5884
5885    /* Shortcut for single character strings */
5886    if (PyUnicode_GET_SIZE(self) == 1 &&
5887	Py_UNICODE_ISSPACE(*p))
5888	return PyBool_FromLong(1);
5889
5890    /* Special case for empty strings */
5891    if (PyUnicode_GET_SIZE(self) == 0)
5892	return PyBool_FromLong(0);
5893
5894    e = p + PyUnicode_GET_SIZE(self);
5895    for (; p < e; p++) {
5896	if (!Py_UNICODE_ISSPACE(*p))
5897	    return PyBool_FromLong(0);
5898    }
5899    return PyBool_FromLong(1);
5900}
5901
5902PyDoc_STRVAR(isalpha__doc__,
5903"S.isalpha() -> bool\n\
5904\n\
5905Return True if all characters in S are alphabetic\n\
5906and there is at least one character in S, False otherwise.");
5907
5908static PyObject*
5909unicode_isalpha(PyUnicodeObject *self)
5910{
5911    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5912    register const Py_UNICODE *e;
5913
5914    /* Shortcut for single character strings */
5915    if (PyUnicode_GET_SIZE(self) == 1 &&
5916	Py_UNICODE_ISALPHA(*p))
5917	return PyBool_FromLong(1);
5918
5919    /* Special case for empty strings */
5920    if (PyUnicode_GET_SIZE(self) == 0)
5921	return PyBool_FromLong(0);
5922
5923    e = p + PyUnicode_GET_SIZE(self);
5924    for (; p < e; p++) {
5925	if (!Py_UNICODE_ISALPHA(*p))
5926	    return PyBool_FromLong(0);
5927    }
5928    return PyBool_FromLong(1);
5929}
5930
5931PyDoc_STRVAR(isalnum__doc__,
5932"S.isalnum() -> bool\n\
5933\n\
5934Return True if all characters in S are alphanumeric\n\
5935and there is at least one character in S, False otherwise.");
5936
5937static PyObject*
5938unicode_isalnum(PyUnicodeObject *self)
5939{
5940    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5941    register const Py_UNICODE *e;
5942
5943    /* Shortcut for single character strings */
5944    if (PyUnicode_GET_SIZE(self) == 1 &&
5945	Py_UNICODE_ISALNUM(*p))
5946	return PyBool_FromLong(1);
5947
5948    /* Special case for empty strings */
5949    if (PyUnicode_GET_SIZE(self) == 0)
5950	return PyBool_FromLong(0);
5951
5952    e = p + PyUnicode_GET_SIZE(self);
5953    for (; p < e; p++) {
5954	if (!Py_UNICODE_ISALNUM(*p))
5955	    return PyBool_FromLong(0);
5956    }
5957    return PyBool_FromLong(1);
5958}
5959
5960PyDoc_STRVAR(isdecimal__doc__,
5961"S.isdecimal() -> bool\n\
5962\n\
5963Return True if there are only decimal characters in S,\n\
5964False otherwise.");
5965
5966static PyObject*
5967unicode_isdecimal(PyUnicodeObject *self)
5968{
5969    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5970    register const Py_UNICODE *e;
5971
5972    /* Shortcut for single character strings */
5973    if (PyUnicode_GET_SIZE(self) == 1 &&
5974	Py_UNICODE_ISDECIMAL(*p))
5975	return PyBool_FromLong(1);
5976
5977    /* Special case for empty strings */
5978    if (PyUnicode_GET_SIZE(self) == 0)
5979	return PyBool_FromLong(0);
5980
5981    e = p + PyUnicode_GET_SIZE(self);
5982    for (; p < e; p++) {
5983	if (!Py_UNICODE_ISDECIMAL(*p))
5984	    return PyBool_FromLong(0);
5985    }
5986    return PyBool_FromLong(1);
5987}
5988
5989PyDoc_STRVAR(isdigit__doc__,
5990"S.isdigit() -> bool\n\
5991\n\
5992Return True if all characters in S are digits\n\
5993and there is at least one character in S, False otherwise.");
5994
5995static PyObject*
5996unicode_isdigit(PyUnicodeObject *self)
5997{
5998    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5999    register const Py_UNICODE *e;
6000
6001    /* Shortcut for single character strings */
6002    if (PyUnicode_GET_SIZE(self) == 1 &&
6003	Py_UNICODE_ISDIGIT(*p))
6004	return PyBool_FromLong(1);
6005
6006    /* Special case for empty strings */
6007    if (PyUnicode_GET_SIZE(self) == 0)
6008	return PyBool_FromLong(0);
6009
6010    e = p + PyUnicode_GET_SIZE(self);
6011    for (; p < e; p++) {
6012	if (!Py_UNICODE_ISDIGIT(*p))
6013	    return PyBool_FromLong(0);
6014    }
6015    return PyBool_FromLong(1);
6016}
6017
6018PyDoc_STRVAR(isnumeric__doc__,
6019"S.isnumeric() -> bool\n\
6020\n\
6021Return True if there are only numeric characters in S,\n\
6022False otherwise.");
6023
6024static PyObject*
6025unicode_isnumeric(PyUnicodeObject *self)
6026{
6027    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6028    register const Py_UNICODE *e;
6029
6030    /* Shortcut for single character strings */
6031    if (PyUnicode_GET_SIZE(self) == 1 &&
6032	Py_UNICODE_ISNUMERIC(*p))
6033	return PyBool_FromLong(1);
6034
6035    /* Special case for empty strings */
6036    if (PyUnicode_GET_SIZE(self) == 0)
6037	return PyBool_FromLong(0);
6038
6039    e = p + PyUnicode_GET_SIZE(self);
6040    for (; p < e; p++) {
6041	if (!Py_UNICODE_ISNUMERIC(*p))
6042	    return PyBool_FromLong(0);
6043    }
6044    return PyBool_FromLong(1);
6045}
6046
6047PyDoc_STRVAR(join__doc__,
6048"S.join(sequence) -> unicode\n\
6049\n\
6050Return a string which is the concatenation of the strings in the\n\
6051sequence.  The separator between elements is S.");
6052
6053static PyObject*
6054unicode_join(PyObject *self, PyObject *data)
6055{
6056    return PyUnicode_Join(self, data);
6057}
6058
/* Length accessor: returns the value of self's length field. */
static Py_ssize_t
unicode_length(PyUnicodeObject *self)
{
    return self->length;
}
6064
6065PyDoc_STRVAR(ljust__doc__,
6066"S.ljust(width[, fillchar]) -> int\n\
6067\n\
6068Return S left justified in a Unicode string of length width. Padding is\n\
6069done using the specified fill character (default is a space).");
6070
6071static PyObject *
6072unicode_ljust(PyUnicodeObject *self, PyObject *args)
6073{
6074    Py_ssize_t width;
6075    Py_UNICODE fillchar = ' ';
6076
6077    if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
6078        return NULL;
6079
6080    if (self->length >= width && PyUnicode_CheckExact(self)) {
6081        Py_INCREF(self);
6082        return (PyObject*) self;
6083    }
6084
6085    return (PyObject*) pad(self, 0, width - self->length, fillchar);
6086}
6087
6088PyDoc_STRVAR(lower__doc__,
6089"S.lower() -> unicode\n\
6090\n\
6091Return a copy of the string S converted to lowercase.");
6092
6093static PyObject*
6094unicode_lower(PyUnicodeObject *self)
6095{
6096    return fixup(self, fixlower);
6097}
6098
/* Strip modes.  The values double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
/* PyArg_ParseTuple format strings, one per strip mode: an optional object
   argument followed by the method name used in error messages. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: skips the 3-character "|O:" prefix of the
   corresponding format string. */
#define STRIPNAME(i) (stripformat[i]+3)
6107
6108/* externally visible for str.strip(unicode) */
6109PyObject *
6110_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6111{
6112	Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6113	Py_ssize_t len = PyUnicode_GET_SIZE(self);
6114	Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
6115	Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6116	Py_ssize_t i, j;
6117
6118        BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6119
6120	i = 0;
6121	if (striptype != RIGHTSTRIP) {
6122            while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6123                i++;
6124            }
6125	}
6126
6127	j = len;
6128	if (striptype != LEFTSTRIP) {
6129            do {
6130                j--;
6131            } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6132            j++;
6133	}
6134
6135	if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6136            Py_INCREF(self);
6137            return (PyObject*)self;
6138	}
6139	else
6140            return PyUnicode_FromUnicode(s+i, j-i);
6141}
6142
6143
6144static PyObject *
6145do_strip(PyUnicodeObject *self, int striptype)
6146{
6147	Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6148	Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
6149
6150	i = 0;
6151	if (striptype != RIGHTSTRIP) {
6152		while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6153			i++;
6154		}
6155	}
6156
6157	j = len;
6158	if (striptype != LEFTSTRIP) {
6159		do {
6160			j--;
6161		} while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6162		j++;
6163	}
6164
6165	if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6166		Py_INCREF(self);
6167		return (PyObject*)self;
6168	}
6169	else
6170		return PyUnicode_FromUnicode(s+i, j-i);
6171}
6172
6173
6174static PyObject *
6175do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6176{
6177	PyObject *sep = NULL;
6178
6179	if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6180		return NULL;
6181
6182	if (sep != NULL && sep != Py_None) {
6183		if (PyUnicode_Check(sep))
6184			return _PyUnicode_XStrip(self, striptype, sep);
6185		else if (PyString_Check(sep)) {
6186			PyObject *res;
6187			sep = PyUnicode_FromObject(sep);
6188			if (sep==NULL)
6189				return NULL;
6190			res = _PyUnicode_XStrip(self, striptype, sep);
6191			Py_DECREF(sep);
6192			return res;
6193		}
6194		else {
6195			PyErr_Format(PyExc_TypeError,
6196				     "%s arg must be None, unicode or str",
6197				     STRIPNAME(striptype));
6198			return NULL;
6199		}
6200	}
6201
6202	return do_strip(self, striptype);
6203}
6204
6205
6206PyDoc_STRVAR(strip__doc__,
6207"S.strip([chars]) -> unicode\n\
6208\n\
6209Return a copy of the string S with leading and trailing\n\
6210whitespace removed.\n\
6211If chars is given and not None, remove characters in chars instead.\n\
6212If chars is a str, it will be converted to unicode before stripping");
6213
6214static PyObject *
6215unicode_strip(PyUnicodeObject *self, PyObject *args)
6216{
6217	if (PyTuple_GET_SIZE(args) == 0)
6218		return do_strip(self, BOTHSTRIP); /* Common case */
6219	else
6220		return do_argstrip(self, BOTHSTRIP, args);
6221}
6222
6223
6224PyDoc_STRVAR(lstrip__doc__,
6225"S.lstrip([chars]) -> unicode\n\
6226\n\
6227Return a copy of the string S with leading whitespace removed.\n\
6228If chars is given and not None, remove characters in chars instead.\n\
6229If chars is a str, it will be converted to unicode before stripping");
6230
6231static PyObject *
6232unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6233{
6234	if (PyTuple_GET_SIZE(args) == 0)
6235		return do_strip(self, LEFTSTRIP); /* Common case */
6236	else
6237		return do_argstrip(self, LEFTSTRIP, args);
6238}
6239
6240
6241PyDoc_STRVAR(rstrip__doc__,
6242"S.rstrip([chars]) -> unicode\n\
6243\n\
6244Return a copy of the string S with trailing whitespace removed.\n\
6245If chars is given and not None, remove characters in chars instead.\n\
6246If chars is a str, it will be converted to unicode before stripping");
6247
6248static PyObject *
6249unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6250{
6251	if (PyTuple_GET_SIZE(args) == 0)
6252		return do_strip(self, RIGHTSTRIP); /* Common case */
6253	else
6254		return do_argstrip(self, RIGHTSTRIP, args);
6255}
6256
6257
6258static PyObject*
6259unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
6260{
6261    PyUnicodeObject *u;
6262    Py_UNICODE *p;
6263    Py_ssize_t nchars;
6264    size_t nbytes;
6265
6266    if (len < 0)
6267        len = 0;
6268
6269    if (len == 1 && PyUnicode_CheckExact(str)) {
6270        /* no repeat, return original string */
6271        Py_INCREF(str);
6272        return (PyObject*) str;
6273    }
6274
6275    /* ensure # of chars needed doesn't overflow int and # of bytes
6276     * needed doesn't overflow size_t
6277     */
6278    nchars = len * str->length;
6279    if (len && nchars / len != str->length) {
6280        PyErr_SetString(PyExc_OverflowError,
6281                        "repeated string is too long");
6282        return NULL;
6283    }
6284    nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6285    if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6286        PyErr_SetString(PyExc_OverflowError,
6287                        "repeated string is too long");
6288        return NULL;
6289    }
6290    u = _PyUnicode_New(nchars);
6291    if (!u)
6292        return NULL;
6293
6294    p = u->str;
6295
6296    if (str->length == 1 && len > 0) {
6297        Py_UNICODE_FILL(p, str->str[0], len);
6298    } else {
6299	Py_ssize_t done = 0; /* number of characters copied this far */
6300	if (done < nchars) {
6301            Py_UNICODE_COPY(p, str->str, str->length);
6302            done = str->length;
6303	}
6304	while (done < nchars) {
6305            int n = (done <= nchars-done) ? done : nchars-done;
6306            Py_UNICODE_COPY(p+done, p, n);
6307            done += n;
6308	}
6309    }
6310
6311    return (PyObject*) u;
6312}
6313
6314PyObject *PyUnicode_Replace(PyObject *obj,
6315			    PyObject *subobj,
6316			    PyObject *replobj,
6317			    Py_ssize_t maxcount)
6318{
6319    PyObject *self;
6320    PyObject *str1;
6321    PyObject *str2;
6322    PyObject *result;
6323
6324    self = PyUnicode_FromObject(obj);
6325    if (self == NULL)
6326	return NULL;
6327    str1 = PyUnicode_FromObject(subobj);
6328    if (str1 == NULL) {
6329	Py_DECREF(self);
6330	return NULL;
6331    }
6332    str2 = PyUnicode_FromObject(replobj);
6333    if (str2 == NULL) {
6334	Py_DECREF(self);
6335	Py_DECREF(str1);
6336	return NULL;
6337    }
6338    result = replace((PyUnicodeObject *)self,
6339		     (PyUnicodeObject *)str1,
6340		     (PyUnicodeObject *)str2,
6341		     maxcount);
6342    Py_DECREF(self);
6343    Py_DECREF(str1);
6344    Py_DECREF(str2);
6345    return result;
6346}
6347
6348PyDoc_STRVAR(replace__doc__,
6349"S.replace (old, new[, maxsplit]) -> unicode\n\
6350\n\
6351Return a copy of S with all occurrences of substring\n\
6352old replaced by new.  If the optional argument maxsplit is\n\
6353given, only the first maxsplit occurrences are replaced.");
6354
6355static PyObject*
6356unicode_replace(PyUnicodeObject *self, PyObject *args)
6357{
6358    PyUnicodeObject *str1;
6359    PyUnicodeObject *str2;
6360    Py_ssize_t maxcount = -1;
6361    PyObject *result;
6362
6363    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
6364        return NULL;
6365    str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6366    if (str1 == NULL)
6367	return NULL;
6368    str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
6369    if (str2 == NULL) {
6370	Py_DECREF(str1);
6371	return NULL;
6372    }
6373
6374    result = replace(self, str1, str2, maxcount);
6375
6376    Py_DECREF(str1);
6377    Py_DECREF(str2);
6378    return result;
6379}
6380
6381static
6382PyObject *unicode_repr(PyObject *unicode)
6383{
6384    return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
6385				PyUnicode_GET_SIZE(unicode),
6386				1);
6387}
6388
6389PyDoc_STRVAR(rfind__doc__,
6390"S.rfind(sub [,start [,end]]) -> int\n\
6391\n\
6392Return the highest index in S where substring sub is found,\n\
6393such that sub is contained within s[start,end].  Optional\n\
6394arguments start and end are interpreted as in slice notation.\n\
6395\n\
6396Return -1 on failure.");
6397
6398static PyObject *
6399unicode_rfind(PyUnicodeObject *self, PyObject *args)
6400{
6401    PyObject *substring;
6402    Py_ssize_t start = 0;
6403    Py_ssize_t end = PY_SSIZE_T_MAX;
6404    Py_ssize_t result;
6405
6406    if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6407		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6408        return NULL;
6409    substring = PyUnicode_FromObject(substring);
6410    if (!substring)
6411	return NULL;
6412
6413    result = stringlib_rfind_slice(
6414        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6415        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6416        start, end
6417        );
6418
6419    Py_DECREF(substring);
6420
6421    return PyInt_FromSsize_t(result);
6422}
6423
6424PyDoc_STRVAR(rindex__doc__,
6425"S.rindex(sub [,start [,end]]) -> int\n\
6426\n\
6427Like S.rfind() but raise ValueError when the substring is not found.");
6428
6429static PyObject *
6430unicode_rindex(PyUnicodeObject *self, PyObject *args)
6431{
6432    PyObject *substring;
6433    Py_ssize_t start = 0;
6434    Py_ssize_t end = PY_SSIZE_T_MAX;
6435    Py_ssize_t result;
6436
6437    if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6438		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6439        return NULL;
6440    substring = PyUnicode_FromObject(substring);
6441    if (!substring)
6442	return NULL;
6443
6444    result = stringlib_rfind_slice(
6445        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6446        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6447        start, end
6448        );
6449
6450    Py_DECREF(substring);
6451
6452    if (result < 0) {
6453        PyErr_SetString(PyExc_ValueError, "substring not found");
6454        return NULL;
6455    }
6456    return PyInt_FromSsize_t(result);
6457}
6458
6459PyDoc_STRVAR(rjust__doc__,
6460"S.rjust(width[, fillchar]) -> unicode\n\
6461\n\
6462Return S right justified in a Unicode string of length width. Padding is\n\
6463done using the specified fill character (default is a space).");
6464
6465static PyObject *
6466unicode_rjust(PyUnicodeObject *self, PyObject *args)
6467{
6468    Py_ssize_t width;
6469    Py_UNICODE fillchar = ' ';
6470
6471    if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
6472        return NULL;
6473
6474    if (self->length >= width && PyUnicode_CheckExact(self)) {
6475        Py_INCREF(self);
6476        return (PyObject*) self;
6477    }
6478
6479    return (PyObject*) pad(self, width - self->length, 0, fillchar);
6480}
6481
6482static PyObject*
6483unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
6484{
6485    /* standard clamping */
6486    if (start < 0)
6487        start = 0;
6488    if (end < 0)
6489        end = 0;
6490    if (end > self->length)
6491        end = self->length;
6492    if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
6493        /* full slice, return original string */
6494        Py_INCREF(self);
6495        return (PyObject*) self;
6496    }
6497    if (start > end)
6498        start = end;
6499    /* copy slice */
6500    return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6501					     end - start);
6502}
6503
6504PyObject *PyUnicode_Split(PyObject *s,
6505			  PyObject *sep,
6506			  Py_ssize_t maxsplit)
6507{
6508    PyObject *result;
6509
6510    s = PyUnicode_FromObject(s);
6511    if (s == NULL)
6512	return NULL;
6513    if (sep != NULL) {
6514	sep = PyUnicode_FromObject(sep);
6515	if (sep == NULL) {
6516	    Py_DECREF(s);
6517	    return NULL;
6518	}
6519    }
6520
6521    result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6522
6523    Py_DECREF(s);
6524    Py_XDECREF(sep);
6525    return result;
6526}
6527
6528PyDoc_STRVAR(split__doc__,
6529"S.split([sep [,maxsplit]]) -> list of strings\n\
6530\n\
6531Return a list of the words in S, using sep as the\n\
6532delimiter string.  If maxsplit is given, at most maxsplit\n\
6533splits are done. If sep is not specified or is None,\n\
6534any whitespace string is a separator.");
6535
6536static PyObject*
6537unicode_split(PyUnicodeObject *self, PyObject *args)
6538{
6539    PyObject *substring = Py_None;
6540    Py_ssize_t maxcount = -1;
6541
6542    if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
6543        return NULL;
6544
6545    if (substring == Py_None)
6546	return split(self, NULL, maxcount);
6547    else if (PyUnicode_Check(substring))
6548	return split(self, (PyUnicodeObject *)substring, maxcount);
6549    else
6550	return PyUnicode_Split((PyObject *)self, substring, maxcount);
6551}
6552
6553PyObject *
6554PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
6555{
6556    PyObject* str_obj;
6557    PyObject* sep_obj;
6558    PyObject* out;
6559
6560    str_obj = PyUnicode_FromObject(str_in);
6561    if (!str_obj)
6562	return NULL;
6563    sep_obj = PyUnicode_FromObject(sep_in);
6564    if (!sep_obj) {
6565        Py_DECREF(str_obj);
6566        return NULL;
6567    }
6568
6569    out = stringlib_partition(
6570        str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6571        sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6572        );
6573
6574    Py_DECREF(sep_obj);
6575    Py_DECREF(str_obj);
6576
6577    return out;
6578}
6579
6580
6581PyObject *
6582PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
6583{
6584    PyObject* str_obj;
6585    PyObject* sep_obj;
6586    PyObject* out;
6587
6588    str_obj = PyUnicode_FromObject(str_in);
6589    if (!str_obj)
6590	return NULL;
6591    sep_obj = PyUnicode_FromObject(sep_in);
6592    if (!sep_obj) {
6593        Py_DECREF(str_obj);
6594        return NULL;
6595    }
6596
6597    out = stringlib_rpartition(
6598        str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6599        sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6600        );
6601
6602    Py_DECREF(sep_obj);
6603    Py_DECREF(str_obj);
6604
6605    return out;
6606}
6607
6608PyDoc_STRVAR(partition__doc__,
6609"S.partition(sep) -> (head, sep, tail)\n\
6610\n\
6611Searches for the separator sep in S, and returns the part before it,\n\
6612the separator itself, and the part after it.  If the separator is not\n\
6613found, returns S and two empty strings.");
6614
6615static PyObject*
6616unicode_partition(PyUnicodeObject *self, PyObject *separator)
6617{
6618    return PyUnicode_Partition((PyObject *)self, separator);
6619}
6620
6621PyDoc_STRVAR(rpartition__doc__,
6622"S.rpartition(sep) -> (head, sep, tail)\n\
6623\n\
6624Searches for the separator sep in S, starting at the end of S, and returns\n\
6625the part before it, the separator itself, and the part after it.  If the\n\
6626separator is not found, returns S and two empty strings.");
6627
6628static PyObject*
6629unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
6630{
6631    return PyUnicode_RPartition((PyObject *)self, separator);
6632}
6633
6634PyObject *PyUnicode_RSplit(PyObject *s,
6635			   PyObject *sep,
6636			   Py_ssize_t maxsplit)
6637{
6638    PyObject *result;
6639
6640    s = PyUnicode_FromObject(s);
6641    if (s == NULL)
6642	return NULL;
6643    if (sep != NULL) {
6644	sep = PyUnicode_FromObject(sep);
6645	if (sep == NULL) {
6646	    Py_DECREF(s);
6647	    return NULL;
6648	}
6649    }
6650
6651    result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6652
6653    Py_DECREF(s);
6654    Py_XDECREF(sep);
6655    return result;
6656}
6657
6658PyDoc_STRVAR(rsplit__doc__,
6659"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6660\n\
6661Return a list of the words in S, using sep as the\n\
6662delimiter string, starting at the end of the string and\n\
6663working to the front.  If maxsplit is given, at most maxsplit\n\
6664splits are done. If sep is not specified, any whitespace string\n\
6665is a separator.");
6666
6667static PyObject*
6668unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6669{
6670    PyObject *substring = Py_None;
6671    Py_ssize_t maxcount = -1;
6672
6673    if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
6674        return NULL;
6675
6676    if (substring == Py_None)
6677	return rsplit(self, NULL, maxcount);
6678    else if (PyUnicode_Check(substring))
6679	return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6680    else
6681	return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6682}
6683
6684PyDoc_STRVAR(splitlines__doc__,
6685"S.splitlines([keepends]]) -> list of strings\n\
6686\n\
6687Return a list of the lines in S, breaking at line boundaries.\n\
6688Line breaks are not included in the resulting list unless keepends\n\
6689is given and true.");
6690
6691static PyObject*
6692unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6693{
6694    int keepends = 0;
6695
6696    if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
6697        return NULL;
6698
6699    return PyUnicode_Splitlines((PyObject *)self, keepends);
6700}
6701
6702static
6703PyObject *unicode_str(PyUnicodeObject *self)
6704{
6705    return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
6706}
6707
6708PyDoc_STRVAR(swapcase__doc__,
6709"S.swapcase() -> unicode\n\
6710\n\
6711Return a copy of S with uppercase characters converted to lowercase\n\
6712and vice versa.");
6713
6714static PyObject*
6715unicode_swapcase(PyUnicodeObject *self)
6716{
6717    return fixup(self, fixswapcase);
6718}
6719
6720PyDoc_STRVAR(translate__doc__,
6721"S.translate(table) -> unicode\n\
6722\n\
6723Return a copy of the string S, where all characters have been mapped\n\
6724through the given translation table, which must be a mapping of\n\
6725Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6726Unmapped characters are left untouched. Characters mapped to None\n\
6727are deleted.");
6728
6729static PyObject*
6730unicode_translate(PyUnicodeObject *self, PyObject *table)
6731{
6732    return PyUnicode_TranslateCharmap(self->str,
6733				      self->length,
6734				      table,
6735				      "ignore");
6736}
6737
6738PyDoc_STRVAR(upper__doc__,
6739"S.upper() -> unicode\n\
6740\n\
6741Return a copy of S converted to uppercase.");
6742
6743static PyObject*
6744unicode_upper(PyUnicodeObject *self)
6745{
6746    return fixup(self, fixupper);
6747}
6748
6749PyDoc_STRVAR(zfill__doc__,
6750"S.zfill(width) -> unicode\n\
6751\n\
6752Pad a numeric string x with zeros on the left, to fill a field\n\
6753of the specified width. The string x is never truncated.");
6754
6755static PyObject *
6756unicode_zfill(PyUnicodeObject *self, PyObject *args)
6757{
6758    Py_ssize_t fill;
6759    PyUnicodeObject *u;
6760
6761    Py_ssize_t width;
6762    if (!PyArg_ParseTuple(args, "n:zfill", &width))
6763        return NULL;
6764
6765    if (self->length >= width) {
6766        if (PyUnicode_CheckExact(self)) {
6767            Py_INCREF(self);
6768            return (PyObject*) self;
6769        }
6770        else
6771            return PyUnicode_FromUnicode(
6772                PyUnicode_AS_UNICODE(self),
6773                PyUnicode_GET_SIZE(self)
6774            );
6775    }
6776
6777    fill = width - self->length;
6778
6779    u = pad(self, fill, 0, '0');
6780
6781    if (u == NULL)
6782        return NULL;
6783
6784    if (u->str[fill] == '+' || u->str[fill] == '-') {
6785        /* move sign to beginning of string */
6786        u->str[0] = u->str[fill];
6787        u->str[fill] = '0';
6788    }
6789
6790    return (PyObject*) u;
6791}
6792
#if 0
/* Compiled out (#if 0).  When enabled, returns the value of
   unicode_freelist_size as a Python int; the matching method-table
   entry is likewise disabled and labelled as a debugging aid. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyInt_FromLong(unicode_freelist_size);
}
#endif
6800
6801PyDoc_STRVAR(startswith__doc__,
6802"S.startswith(prefix[, start[, end]]) -> bool\n\
6803\n\
6804Return True if S starts with the specified prefix, False otherwise.\n\
6805With optional start, test S beginning at that position.\n\
6806With optional end, stop comparing S at that position.\n\
6807prefix can also be a tuple of strings to try.");
6808
6809static PyObject *
6810unicode_startswith(PyUnicodeObject *self,
6811		   PyObject *args)
6812{
6813    PyObject *subobj;
6814    PyUnicodeObject *substring;
6815    Py_ssize_t start = 0;
6816    Py_ssize_t end = PY_SSIZE_T_MAX;
6817    int result;
6818
6819    if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
6820		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6821	return NULL;
6822    if (PyTuple_Check(subobj)) {
6823        Py_ssize_t i;
6824        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
6825            substring = (PyUnicodeObject *)PyUnicode_FromObject(
6826                            PyTuple_GET_ITEM(subobj, i));
6827            if (substring == NULL)
6828                return NULL;
6829            result = tailmatch(self, substring, start, end, -1);
6830            Py_DECREF(substring);
6831            if (result) {
6832                Py_RETURN_TRUE;
6833            }
6834        }
6835        /* nothing matched */
6836        Py_RETURN_FALSE;
6837    }
6838    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
6839    if (substring == NULL)
6840         return NULL;
6841    result = tailmatch(self, substring, start, end, -1);
6842    Py_DECREF(substring);
6843    return PyBool_FromLong(result);
6844}
6845
6846
6847PyDoc_STRVAR(endswith__doc__,
6848"S.endswith(suffix[, start[, end]]) -> bool\n\
6849\n\
6850Return True if S ends with the specified suffix, False otherwise.\n\
6851With optional start, test S beginning at that position.\n\
6852With optional end, stop comparing S at that position.\n\
6853suffix can also be a tuple of strings to try.");
6854
6855static PyObject *
6856unicode_endswith(PyUnicodeObject *self,
6857		 PyObject *args)
6858{
6859    PyObject *subobj;
6860    PyUnicodeObject *substring;
6861    Py_ssize_t start = 0;
6862    Py_ssize_t end = PY_SSIZE_T_MAX;
6863    int result;
6864
6865    if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
6866        _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6867	return NULL;
6868    if (PyTuple_Check(subobj)) {
6869        Py_ssize_t i;
6870        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
6871            substring = (PyUnicodeObject *)PyUnicode_FromObject(
6872                            PyTuple_GET_ITEM(subobj, i));
6873            if (substring == NULL)
6874            return NULL;
6875            result = tailmatch(self, substring, start, end, +1);
6876            Py_DECREF(substring);
6877            if (result) {
6878                Py_RETURN_TRUE;
6879            }
6880        }
6881        Py_RETURN_FALSE;
6882    }
6883    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
6884    if (substring == NULL)
6885    return NULL;
6886
6887    result = tailmatch(self, substring, start, end, +1);
6888    Py_DECREF(substring);
6889    return PyBool_FromLong(result);
6890}
6891
6892
6893
6894static PyObject *
6895unicode_getnewargs(PyUnicodeObject *v)
6896{
6897	return Py_BuildValue("(u#)", v->str, v->length);
6898}
6899
6900
/* Method table for the unicode type.  Each entry gives the Python-level
   name, the C implementation, the calling convention flags and the
   docstring.  The table is terminated by the {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
#endif

    /* No docstring is supplied for __getnewargs__. */
    {"__getnewargs__",	(PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
6958
6959static PyObject *
6960unicode_mod(PyObject *v, PyObject *w)
6961{
6962       if (!PyUnicode_Check(v)) {
6963               Py_INCREF(Py_NotImplemented);
6964               return Py_NotImplemented;
6965       }
6966       return PyUnicode_Format(v, w);
6967}
6968
/* Number protocol table for unicode.  Only nb_remainder is filled in
   (with unicode_mod); the slots listed before it are 0 and the
   remaining members are left to default initialization. */
static PyNumberMethods unicode_as_number = {
	0,				/*nb_add*/
	0,				/*nb_subtract*/
	0,				/*nb_multiply*/
	unicode_mod,			/*nb_remainder*/
};
6975
/* Sequence protocol table for unicode.  The item/slice assignment slots
   are 0, so no in-place modification is offered through this table. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length, 		/* sq_length */
    PyUnicode_Concat,		 	/* sq_concat */
    (ssizeargfunc) unicode_repeat, 	/* sq_repeat */
    (ssizeargfunc) unicode_getitem, 	/* sq_item */
    (ssizessizeargfunc) unicode_slice, 	/* sq_slice */
    0, 					/* sq_ass_item */
    0, 					/* sq_ass_slice */
    PyUnicode_Contains, 		/* sq_contains */
};
6986
6987static PyObject*
6988unicode_subscript(PyUnicodeObject* self, PyObject* item)
6989{
6990    PyNumberMethods *nb = item->ob_type->tp_as_number;
6991    if (nb != NULL && nb->nb_index != NULL) {
6992        Py_ssize_t i = nb->nb_index(item);
6993        if (i == -1 && PyErr_Occurred())
6994            return NULL;
6995        if (i < 0)
6996            i += PyUnicode_GET_SIZE(self);
6997        return unicode_getitem(self, i);
6998    } else if (PySlice_Check(item)) {
6999        Py_ssize_t start, stop, step, slicelength, cur, i;
7000        Py_UNICODE* source_buf;
7001        Py_UNICODE* result_buf;
7002        PyObject* result;
7003
7004        if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
7005				 &start, &stop, &step, &slicelength) < 0) {
7006            return NULL;
7007        }
7008
7009        if (slicelength <= 0) {
7010            return PyUnicode_FromUnicode(NULL, 0);
7011        } else {
7012            source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
7013            result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7014                                                    sizeof(Py_UNICODE));
7015
7016	    if (result_buf == NULL)
7017		    return PyErr_NoMemory();
7018
7019            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7020                result_buf[i] = source_buf[cur];
7021            }
7022
7023            result = PyUnicode_FromUnicode(result_buf, slicelength);
7024            PyMem_FREE(result_buf);
7025            return result;
7026        }
7027    } else {
7028        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7029        return NULL;
7030    }
7031}
7032
/* Mapping protocol table for unicode: length and subscript lookup are
   provided; the subscript-assignment slot is 0. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,		/* mp_length */
    (binaryfunc)unicode_subscript,	/* mp_subscript */
    (objobjargproc)0,			/* mp_ass_subscript */
};
7038
7039static Py_ssize_t
7040unicode_buffer_getreadbuf(PyUnicodeObject *self,
7041			  Py_ssize_t index,
7042			  const void **ptr)
7043{
7044    if (index != 0) {
7045        PyErr_SetString(PyExc_SystemError,
7046			"accessing non-existent unicode segment");
7047        return -1;
7048    }
7049    *ptr = (void *) self->str;
7050    return PyUnicode_GET_DATA_SIZE(self);
7051}
7052
7053static Py_ssize_t
7054unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
7055			   const void **ptr)
7056{
7057    PyErr_SetString(PyExc_TypeError,
7058		    "cannot use unicode as modifiable buffer");
7059    return -1;
7060}
7061
7062static int
7063unicode_buffer_getsegcount(PyUnicodeObject *self,
7064			   Py_ssize_t *lenp)
7065{
7066    if (lenp)
7067        *lenp = PyUnicode_GET_DATA_SIZE(self);
7068    return 1;
7069}
7070
7071static Py_ssize_t
7072unicode_buffer_getcharbuf(PyUnicodeObject *self,
7073			  Py_ssize_t index,
7074			  const void **ptr)
7075{
7076    PyObject *str;
7077
7078    if (index != 0) {
7079        PyErr_SetString(PyExc_SystemError,
7080			"accessing non-existent unicode segment");
7081        return -1;
7082    }
7083    str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
7084    if (str == NULL)
7085	return -1;
7086    *ptr = (void *) PyString_AS_STRING(str);
7087    return PyString_GET_SIZE(str);
7088}
7089
7090/* Helpers for PyUnicode_Format() */
7091
7092static PyObject *
7093getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
7094{
7095    Py_ssize_t argidx = *p_argidx;
7096    if (argidx < arglen) {
7097	(*p_argidx)++;
7098	if (arglen < 0)
7099	    return args;
7100	else
7101	    return PyTuple_GetItem(args, argidx);
7102    }
7103    PyErr_SetString(PyExc_TypeError,
7104		    "not enough arguments for format string");
7105    return NULL;
7106}
7107
/* Bit flags carried in the `flags` argument of the format helpers
   below.  F_ALT selects the '#' alternate form in formatfloat() and
   formatint().
   NOTE(review): the remaining names presumably correspond to the '-',
   '+', ' ' and '0' conversion flags -- confirm in PyUnicode_Format(). */
#define F_LJUST (1<<0)
#define F_SIGN	(1<<1)
#define F_BLANK (1<<2)
#define F_ALT	(1<<3)
#define F_ZERO	(1<<4)
7113
7114static Py_ssize_t
7115strtounicode(Py_UNICODE *buffer, const char *charbuffer)
7116{
7117    register Py_ssize_t i;
7118    Py_ssize_t len = strlen(charbuffer);
7119    for (i = len - 1; i >= 0; i--)
7120	buffer[i] = (Py_UNICODE) charbuffer[i];
7121
7122    return len;
7123}
7124
7125static int
7126doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7127{
7128    Py_ssize_t result;
7129
7130    PyOS_ascii_formatd((char *)buffer, len, format, x);
7131    result = strtounicode(buffer, (char *)buffer);
7132    return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
7133}
7134
7135static int
7136longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7137{
7138    Py_ssize_t result;
7139
7140    PyOS_snprintf((char *)buffer, len, format, x);
7141    result = strtounicode(buffer, (char *)buffer);
7142    return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
7143}
7144
7145/* XXX To save some code duplication, formatfloat/long/int could have been
7146   shared with stringobject.c, converting from 8-bit to Unicode after the
7147   formatting is done. */
7148
7149static int
7150formatfloat(Py_UNICODE *buf,
7151	    size_t buflen,
7152	    int flags,
7153	    int prec,
7154	    int type,
7155	    PyObject *v)
7156{
7157    /* fmt = '%#.' + `prec` + `type`
7158       worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
7159    char fmt[20];
7160    double x;
7161
7162    x = PyFloat_AsDouble(v);
7163    if (x == -1.0 && PyErr_Occurred())
7164	return -1;
7165    if (prec < 0)
7166	prec = 6;
7167    if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7168	type = 'g';
7169    /* Worst case length calc to ensure no buffer overrun:
7170
7171       'g' formats:
7172	 fmt = %#.<prec>g
7173	 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7174	    for any double rep.)
7175	 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7176
7177       'f' formats:
7178	 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7179	 len = 1 + 50 + 1 + prec = 52 + prec
7180
7181       If prec=0 the effective precision is 1 (the leading digit is
7182       always given), therefore increase the length by one.
7183
7184    */
7185    if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
7186	(type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
7187	PyErr_SetString(PyExc_OverflowError,
7188			"formatted float is too long (precision too large?)");
7189	return -1;
7190    }
7191    PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7192		  (flags&F_ALT) ? "#" : "",
7193		  prec, type);
7194    return doubletounicode(buf, buflen, fmt, x);
7195}
7196
7197static PyObject*
7198formatlong(PyObject *val, int flags, int prec, int type)
7199{
7200	char *buf;
7201	int i, len;
7202	PyObject *str; /* temporary string object. */
7203	PyUnicodeObject *result;
7204
7205	str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7206	if (!str)
7207		return NULL;
7208	result = _PyUnicode_New(len);
7209	if (!result) {
7210		Py_DECREF(str);
7211		return NULL;
7212	}
7213	for (i = 0; i < len; i++)
7214		result->str[i] = buf[i];
7215	result->str[len] = 0;
7216	Py_DECREF(str);
7217	return (PyObject*)result;
7218}
7219
7220static int
7221formatint(Py_UNICODE *buf,
7222	  size_t buflen,
7223	  int flags,
7224	  int prec,
7225	  int type,
7226	  PyObject *v)
7227{
7228    /* fmt = '%#.' + `prec` + 'l' + `type`
7229     * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7230     *                     + 1 + 1
7231     *                   = 24
7232     */
7233    char fmt[64]; /* plenty big enough! */
7234    char *sign;
7235    long x;
7236
7237    x = PyInt_AsLong(v);
7238    if (x == -1 && PyErr_Occurred())
7239        return -1;
7240    if (x < 0 && type == 'u') {
7241        type = 'd';
7242    }
7243    if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7244        sign = "-";
7245    else
7246        sign = "";
7247    if (prec < 0)
7248        prec = 1;
7249
7250    /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7251     * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
7252     */
7253    if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
7254        PyErr_SetString(PyExc_OverflowError,
7255    	        "formatted integer is too long (precision too large?)");
7256        return -1;
7257    }
7258
7259    if ((flags & F_ALT) &&
7260        (type == 'x' || type == 'X')) {
7261        /* When converting under %#x or %#X, there are a number
7262         * of issues that cause pain:
7263         * - when 0 is being converted, the C standard leaves off
7264         *   the '0x' or '0X', which is inconsistent with other
7265         *   %#x/%#X conversions and inconsistent with Python's
7266         *   hex() function
7267         * - there are platforms that violate the standard and
7268         *   convert 0 with the '0x' or '0X'
7269         *   (Metrowerks, Compaq Tru64)
7270         * - there are platforms that give '0x' when converting
7271         *   under %#X, but convert 0 in accordance with the
7272         *   standard (OS/2 EMX)
7273         *
7274         * We can achieve the desired consistency by inserting our
7275         * own '0x' or '0X' prefix, and substituting %x/%X in place
7276         * of %#x/%#X.
7277         *
7278         * Note that this is the same approach as used in
7279         * formatint() in stringobject.c
7280         */
7281        PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7282                      sign, type, prec, type);
7283    }
7284    else {
7285        PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7286                      sign, (flags&F_ALT) ? "#" : "",
7287                      prec, type);
7288    }
7289    if (sign[0])
7290        return longtounicode(buf, buflen, fmt, -x);
7291    else
7292        return longtounicode(buf, buflen, fmt, x);
7293}
7294
7295static int
7296formatchar(Py_UNICODE *buf,
7297           size_t buflen,
7298           PyObject *v)
7299{
7300    /* presume that the buffer is at least 2 characters long */
7301    if (PyUnicode_Check(v)) {
7302	if (PyUnicode_GET_SIZE(v) != 1)
7303	    goto onError;
7304	buf[0] = PyUnicode_AS_UNICODE(v)[0];
7305    }
7306
7307    else if (PyString_Check(v)) {
7308	if (PyString_GET_SIZE(v) != 1)
7309	    goto onError;
7310	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7311    }
7312
7313    else {
7314	/* Integer input truncated to a character */
7315        long x;
7316	x = PyInt_AsLong(v);
7317	if (x == -1 && PyErr_Occurred())
7318	    goto onError;
7319#ifdef Py_UNICODE_WIDE
7320	if (x < 0 || x > 0x10ffff) {
7321	    PyErr_SetString(PyExc_OverflowError,
7322			    "%c arg not in range(0x110000) "
7323			    "(wide Python build)");
7324	    return -1;
7325	}
7326#else
7327	if (x < 0 || x > 0xffff) {
7328	    PyErr_SetString(PyExc_OverflowError,
7329			    "%c arg not in range(0x10000) "
7330			    "(narrow Python build)");
7331	    return -1;
7332	}
7333#endif
7334	buf[0] = (Py_UNICODE) x;
7335    }
7336    buf[1] = '\0';
7337    return 1;
7338
7339 onError:
7340    PyErr_SetString(PyExc_TypeError,
7341		    "%c requires int or char");
7342    return -1;
7343}
7344
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The expansion is fully parenthesized so that it acts as a single
   operand wherever it appears (for example in `sizeof FORMATBUFLEN`).
*/
#define FORMATBUFLEN ((size_t)120)
7354
7355PyObject *PyUnicode_Format(PyObject *format,
7356			   PyObject *args)
7357{
7358    Py_UNICODE *fmt, *res;
7359    Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
7360    int args_owned = 0;
7361    PyUnicodeObject *result = NULL;
7362    PyObject *dict = NULL;
7363    PyObject *uformat;
7364
7365    if (format == NULL || args == NULL) {
7366	PyErr_BadInternalCall();
7367	return NULL;
7368    }
7369    uformat = PyUnicode_FromObject(format);
7370    if (uformat == NULL)
7371	return NULL;
7372    fmt = PyUnicode_AS_UNICODE(uformat);
7373    fmtcnt = PyUnicode_GET_SIZE(uformat);
7374
7375    reslen = rescnt = fmtcnt + 100;
7376    result = _PyUnicode_New(reslen);
7377    if (result == NULL)
7378	goto onError;
7379    res = PyUnicode_AS_UNICODE(result);
7380
7381    if (PyTuple_Check(args)) {
7382	arglen = PyTuple_Size(args);
7383	argidx = 0;
7384    }
7385    else {
7386	arglen = -1;
7387	argidx = -2;
7388    }
7389    if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
7390        !PyObject_TypeCheck(args, &PyBaseString_Type))
7391	dict = args;
7392
7393    while (--fmtcnt >= 0) {
7394	if (*fmt != '%') {
7395	    if (--rescnt < 0) {
7396		rescnt = fmtcnt + 100;
7397		reslen += rescnt;
7398		if (_PyUnicode_Resize(&result, reslen) < 0)
7399		    goto onError;
7400		res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7401		--rescnt;
7402	    }
7403	    *res++ = *fmt++;
7404	}
7405	else {
7406	    /* Got a format specifier */
7407	    int flags = 0;
7408	    Py_ssize_t width = -1;
7409	    int prec = -1;
7410	    Py_UNICODE c = '\0';
7411	    Py_UNICODE fill;
7412	    PyObject *v = NULL;
7413	    PyObject *temp = NULL;
7414	    Py_UNICODE *pbuf;
7415	    Py_UNICODE sign;
7416	    Py_ssize_t len;
7417	    Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
7418
7419	    fmt++;
7420	    if (*fmt == '(') {
7421		Py_UNICODE *keystart;
7422		Py_ssize_t keylen;
7423		PyObject *key;
7424		int pcount = 1;
7425
7426		if (dict == NULL) {
7427		    PyErr_SetString(PyExc_TypeError,
7428				    "format requires a mapping");
7429		    goto onError;
7430		}
7431		++fmt;
7432		--fmtcnt;
7433		keystart = fmt;
7434		/* Skip over balanced parentheses */
7435		while (pcount > 0 && --fmtcnt >= 0) {
7436		    if (*fmt == ')')
7437			--pcount;
7438		    else if (*fmt == '(')
7439			++pcount;
7440		    fmt++;
7441		}
7442		keylen = fmt - keystart - 1;
7443		if (fmtcnt < 0 || pcount > 0) {
7444		    PyErr_SetString(PyExc_ValueError,
7445				    "incomplete format key");
7446		    goto onError;
7447		}
7448#if 0
7449		/* keys are converted to strings using UTF-8 and
7450		   then looked up since Python uses strings to hold
7451		   variables names etc. in its namespaces and we
7452		   wouldn't want to break common idioms. */
7453		key = PyUnicode_EncodeUTF8(keystart,
7454					   keylen,
7455					   NULL);
7456#else
7457		key = PyUnicode_FromUnicode(keystart, keylen);
7458#endif
7459		if (key == NULL)
7460		    goto onError;
7461		if (args_owned) {
7462		    Py_DECREF(args);
7463		    args_owned = 0;
7464		}
7465		args = PyObject_GetItem(dict, key);
7466		Py_DECREF(key);
7467		if (args == NULL) {
7468		    goto onError;
7469		}
7470		args_owned = 1;
7471		arglen = -1;
7472		argidx = -2;
7473	    }
7474	    while (--fmtcnt >= 0) {
7475		switch (c = *fmt++) {
7476		case '-': flags |= F_LJUST; continue;
7477		case '+': flags |= F_SIGN; continue;
7478		case ' ': flags |= F_BLANK; continue;
7479		case '#': flags |= F_ALT; continue;
7480		case '0': flags |= F_ZERO; continue;
7481		}
7482		break;
7483	    }
7484	    if (c == '*') {
7485		v = getnextarg(args, arglen, &argidx);
7486		if (v == NULL)
7487		    goto onError;
7488		if (!PyInt_Check(v)) {
7489		    PyErr_SetString(PyExc_TypeError,
7490				    "* wants int");
7491		    goto onError;
7492		}
7493		width = PyInt_AsLong(v);
7494		if (width < 0) {
7495		    flags |= F_LJUST;
7496		    width = -width;
7497		}
7498		if (--fmtcnt >= 0)
7499		    c = *fmt++;
7500	    }
7501	    else if (c >= '0' && c <= '9') {
7502		width = c - '0';
7503		while (--fmtcnt >= 0) {
7504		    c = *fmt++;
7505		    if (c < '0' || c > '9')
7506			break;
7507		    if ((width*10) / 10 != width) {
7508			PyErr_SetString(PyExc_ValueError,
7509					"width too big");
7510			goto onError;
7511		    }
7512		    width = width*10 + (c - '0');
7513		}
7514	    }
7515	    if (c == '.') {
7516		prec = 0;
7517		if (--fmtcnt >= 0)
7518		    c = *fmt++;
7519		if (c == '*') {
7520		    v = getnextarg(args, arglen, &argidx);
7521		    if (v == NULL)
7522			goto onError;
7523		    if (!PyInt_Check(v)) {
7524			PyErr_SetString(PyExc_TypeError,
7525					"* wants int");
7526			goto onError;
7527		    }
7528		    prec = PyInt_AsLong(v);
7529		    if (prec < 0)
7530			prec = 0;
7531		    if (--fmtcnt >= 0)
7532			c = *fmt++;
7533		}
7534		else if (c >= '0' && c <= '9') {
7535		    prec = c - '0';
7536		    while (--fmtcnt >= 0) {
7537			c = Py_CHARMASK(*fmt++);
7538			if (c < '0' || c > '9')
7539			    break;
7540			if ((prec*10) / 10 != prec) {
7541			    PyErr_SetString(PyExc_ValueError,
7542					    "prec too big");
7543			    goto onError;
7544			}
7545			prec = prec*10 + (c - '0');
7546		    }
7547		}
7548	    } /* prec */
7549	    if (fmtcnt >= 0) {
7550		if (c == 'h' || c == 'l' || c == 'L') {
7551		    if (--fmtcnt >= 0)
7552			c = *fmt++;
7553		}
7554	    }
7555	    if (fmtcnt < 0) {
7556		PyErr_SetString(PyExc_ValueError,
7557				"incomplete format");
7558		goto onError;
7559	    }
7560	    if (c != '%') {
7561		v = getnextarg(args, arglen, &argidx);
7562		if (v == NULL)
7563		    goto onError;
7564	    }
7565	    sign = 0;
7566	    fill = ' ';
7567	    switch (c) {
7568
7569	    case '%':
7570		pbuf = formatbuf;
7571		/* presume that buffer length is at least 1 */
7572		pbuf[0] = '%';
7573		len = 1;
7574		break;
7575
7576	    case 's':
7577	    case 'r':
7578		if (PyUnicode_Check(v) && c == 's') {
7579		    temp = v;
7580		    Py_INCREF(temp);
7581		}
7582		else {
7583		    PyObject *unicode;
7584		    if (c == 's')
7585			temp = PyObject_Unicode(v);
7586		    else
7587			temp = PyObject_Repr(v);
7588		    if (temp == NULL)
7589			goto onError;
7590                    if (PyUnicode_Check(temp))
7591                        /* nothing to do */;
7592                    else if (PyString_Check(temp)) {
7593                        /* convert to string to Unicode */
7594		        unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
7595						   PyString_GET_SIZE(temp),
7596						   NULL,
7597						   "strict");
7598		        Py_DECREF(temp);
7599		        temp = unicode;
7600		        if (temp == NULL)
7601			    goto onError;
7602		    }
7603		    else {
7604			Py_DECREF(temp);
7605			PyErr_SetString(PyExc_TypeError,
7606					"%s argument has non-string str()");
7607			goto onError;
7608		    }
7609		}
7610		pbuf = PyUnicode_AS_UNICODE(temp);
7611		len = PyUnicode_GET_SIZE(temp);
7612		if (prec >= 0 && len > prec)
7613		    len = prec;
7614		break;
7615
7616	    case 'i':
7617	    case 'd':
7618	    case 'u':
7619	    case 'o':
7620	    case 'x':
7621	    case 'X':
7622		if (c == 'i')
7623		    c = 'd';
7624		if (PyLong_Check(v)) {
7625		    temp = formatlong(v, flags, prec, c);
7626		    if (!temp)
7627			goto onError;
7628		    pbuf = PyUnicode_AS_UNICODE(temp);
7629		    len = PyUnicode_GET_SIZE(temp);
7630		    sign = 1;
7631		}
7632		else {
7633		    pbuf = formatbuf;
7634		    len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7635				    flags, prec, c, v);
7636		    if (len < 0)
7637			goto onError;
7638		    sign = 1;
7639		}
7640		if (flags & F_ZERO)
7641		    fill = '0';
7642		break;
7643
7644	    case 'e':
7645	    case 'E':
7646	    case 'f':
7647	    case 'F':
7648	    case 'g':
7649	    case 'G':
7650		if (c == 'F')
7651			c = 'f';
7652		pbuf = formatbuf;
7653		len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7654			flags, prec, c, v);
7655		if (len < 0)
7656		    goto onError;
7657		sign = 1;
7658		if (flags & F_ZERO)
7659		    fill = '0';
7660		break;
7661
7662	    case 'c':
7663		pbuf = formatbuf;
7664		len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
7665		if (len < 0)
7666		    goto onError;
7667		break;
7668
7669	    default:
7670		PyErr_Format(PyExc_ValueError,
7671			     "unsupported format character '%c' (0x%x) "
7672			     "at index %i",
7673			     (31<=c && c<=126) ? (char)c : '?',
7674                             (int)c,
7675			     (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
7676		goto onError;
7677	    }
7678	    if (sign) {
7679		if (*pbuf == '-' || *pbuf == '+') {
7680		    sign = *pbuf++;
7681		    len--;
7682		}
7683		else if (flags & F_SIGN)
7684		    sign = '+';
7685		else if (flags & F_BLANK)
7686		    sign = ' ';
7687		else
7688		    sign = 0;
7689	    }
7690	    if (width < len)
7691		width = len;
7692	    if (rescnt - (sign != 0) < width) {
7693		reslen -= rescnt;
7694		rescnt = width + fmtcnt + 100;
7695		reslen += rescnt;
7696		if (reslen < 0) {
7697		    Py_XDECREF(temp);
7698		    PyErr_NoMemory();
7699		    goto onError;
7700		}
7701		if (_PyUnicode_Resize(&result, reslen) < 0) {
7702		    Py_XDECREF(temp);
7703		    goto onError;
7704		}
7705		res = PyUnicode_AS_UNICODE(result)
7706		    + reslen - rescnt;
7707	    }
7708	    if (sign) {
7709		if (fill != ' ')
7710		    *res++ = sign;
7711		rescnt--;
7712		if (width > len)
7713		    width--;
7714	    }
7715	    if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7716		assert(pbuf[0] == '0');
7717		assert(pbuf[1] == c);
7718		if (fill != ' ') {
7719		    *res++ = *pbuf++;
7720		    *res++ = *pbuf++;
7721		}
7722		rescnt -= 2;
7723		width -= 2;
7724		if (width < 0)
7725		    width = 0;
7726		len -= 2;
7727	    }
7728	    if (width > len && !(flags & F_LJUST)) {
7729		do {
7730		    --rescnt;
7731		    *res++ = fill;
7732		} while (--width > len);
7733	    }
7734	    if (fill == ' ') {
7735		if (sign)
7736		    *res++ = sign;
7737		if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7738		    assert(pbuf[0] == '0');
7739		    assert(pbuf[1] == c);
7740		    *res++ = *pbuf++;
7741		    *res++ = *pbuf++;
7742		}
7743	    }
7744	    Py_UNICODE_COPY(res, pbuf, len);
7745	    res += len;
7746	    rescnt -= len;
7747	    while (--width >= len) {
7748		--rescnt;
7749		*res++ = ' ';
7750	    }
7751	    if (dict && (argidx < arglen) && c != '%') {
7752		PyErr_SetString(PyExc_TypeError,
7753				"not all arguments converted during string formatting");
7754                Py_XDECREF(temp);
7755		goto onError;
7756	    }
7757	    Py_XDECREF(temp);
7758	} /* '%' */
7759    } /* until end */
7760    if (argidx < arglen && !dict) {
7761	PyErr_SetString(PyExc_TypeError,
7762			"not all arguments converted during string formatting");
7763	goto onError;
7764    }
7765
7766    if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
7767	goto onError;
7768    if (args_owned) {
7769	Py_DECREF(args);
7770    }
7771    Py_DECREF(uformat);
7772    return (PyObject *)result;
7773
7774 onError:
7775    Py_XDECREF(result);
7776    Py_DECREF(uformat);
7777    if (args_owned) {
7778	Py_DECREF(args);
7779    }
7780    return NULL;
7781}
7782
/* Buffer-interface function table for unicode objects: the read, write,
   segment-count and character-buffer entry points, in that order.
   Installed as the tp_as_buffer slot of PyUnicode_Type. */
7783static PyBufferProcs unicode_as_buffer = {
7784    (readbufferproc) unicode_buffer_getreadbuf,
7785    (writebufferproc) unicode_buffer_getwritebuf,
7786    (segcountproc) unicode_buffer_getsegcount,
7787    (charbufferproc) unicode_buffer_getcharbuf,
7788};
7789
7790static PyObject *
7791unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7792
7793static PyObject *
7794unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7795{
7796        PyObject *x = NULL;
7797	static char *kwlist[] = {"string", "encoding", "errors", 0};
7798	char *encoding = NULL;
7799	char *errors = NULL;
7800
7801	if (type != &PyUnicode_Type)
7802		return unicode_subtype_new(type, args, kwds);
7803	if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7804					  kwlist, &x, &encoding, &errors))
7805	    return NULL;
7806	if (x == NULL)
7807		return (PyObject *)_PyUnicode_New(0);
7808	if (encoding == NULL && errors == NULL)
7809	    return PyObject_Unicode(x);
7810	else
7811	return PyUnicode_FromEncodedObject(x, encoding, errors);
7812}
7813
7814static PyObject *
7815unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7816{
7817	PyUnicodeObject *tmp, *pnew;
7818	Py_ssize_t n;
7819
7820	assert(PyType_IsSubtype(type, &PyUnicode_Type));
7821	tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7822	if (tmp == NULL)
7823		return NULL;
7824	assert(PyUnicode_Check(tmp));
7825	pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
7826	if (pnew == NULL) {
7827		Py_DECREF(tmp);
7828		return NULL;
7829	}
7830	pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7831	if (pnew->str == NULL) {
7832		_Py_ForgetReference((PyObject *)pnew);
7833		PyObject_Del(pnew);
7834		Py_DECREF(tmp);
7835		return PyErr_NoMemory();
7836	}
7837	Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7838	pnew->length = n;
7839	pnew->hash = tmp->hash;
7840	Py_DECREF(tmp);
7841	return (PyObject *)pnew;
7842}
7843
/* Docstring of the unicode type; used as the tp_doc slot of
   PyUnicode_Type. */
7844PyDoc_STRVAR(unicode_doc,
7845"unicode(string [, encoding[, errors]]) -> object\n\
7846\n\
7847Create a new Unicode object from the given encoded string.\n\
7848encoding defaults to the current default string encoding.\n\
7849errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
7850
/* Type object for unicode.  The type derives from PyBaseString_Type, may
   itself be subclassed (Py_TPFLAGS_BASETYPE) and creates its instances
   through unicode_new.  Slots left at 0 are not provided here. */
7851PyTypeObject PyUnicode_Type = {
7852    PyObject_HEAD_INIT(&PyType_Type)
7853    0, 					/* ob_size */
7854    "unicode", 				/* tp_name */
7855    sizeof(PyUnicodeObject), 		/* tp_basicsize */
7856    0, 					/* tp_itemsize */
7857    /* Slots */
7858    (destructor)unicode_dealloc, 	/* tp_dealloc */
7859    0, 					/* tp_print */
7860    0,				 	/* tp_getattr */
7861    0, 					/* tp_setattr */
7862    (cmpfunc) unicode_compare, 		/* tp_compare */
7863    unicode_repr, 			/* tp_repr */
7864    &unicode_as_number, 		/* tp_as_number */
7865    &unicode_as_sequence, 		/* tp_as_sequence */
7866    &unicode_as_mapping, 		/* tp_as_mapping */
7867    (hashfunc) unicode_hash, 		/* tp_hash */
7868    0, 					/* tp_call */
7869    (reprfunc) unicode_str,	 	/* tp_str */
7870    PyObject_GenericGetAttr, 		/* tp_getattro */
7871    0,			 		/* tp_setattro */
7872    &unicode_as_buffer,			/* tp_as_buffer */
7873    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
7874    unicode_doc,			/* tp_doc */
7875    0,					/* tp_traverse */
7876    0,					/* tp_clear */
7877    0,					/* tp_richcompare */
7878    0,					/* tp_weaklistoffset */
7879    0,					/* tp_iter */
7880    0,					/* tp_iternext */
7881    unicode_methods,			/* tp_methods */
7882    0,					/* tp_members */
7883    0,					/* tp_getset */
7884    &PyBaseString_Type,			/* tp_base */
7885    0,					/* tp_dict */
7886    0,					/* tp_descr_get */
7887    0,					/* tp_descr_set */
7888    0,					/* tp_dictoffset */
7889    0,					/* tp_init */
7890    0,					/* tp_alloc */
7891    unicode_new,			/* tp_new */
7892    PyObject_Del,      		/* tp_free */
7893};
7894
7895/* Initialize the Unicode implementation */
7896
7897void _PyUnicode_Init(void)
7898{
7899    int i;
7900
7901    /* XXX - move this array to unicodectype.c ? */
7902    Py_UNICODE linebreak[] = {
7903        0x000A, /* LINE FEED */
7904        0x000D, /* CARRIAGE RETURN */
7905        0x001C, /* FILE SEPARATOR */
7906        0x001D, /* GROUP SEPARATOR */
7907        0x001E, /* RECORD SEPARATOR */
7908        0x0085, /* NEXT LINE */
7909        0x2028, /* LINE SEPARATOR */
7910        0x2029, /* PARAGRAPH SEPARATOR */
7911    };
7912
7913    /* Init the implementation */
7914    unicode_freelist = NULL;
7915    unicode_freelist_size = 0;
7916    unicode_empty = _PyUnicode_New(0);
7917    if (!unicode_empty)
7918	return;
7919
7920    strcpy(unicode_default_encoding, "ascii");
7921    for (i = 0; i < 256; i++)
7922	unicode_latin1[i] = NULL;
7923    if (PyType_Ready(&PyUnicode_Type) < 0)
7924	Py_FatalError("Can't initialize 'unicode'");
7925
7926    /* initialize the linebreak bloom filter */
7927    bloom_linebreak = make_bloom_mask(
7928        linebreak, sizeof(linebreak) / sizeof(linebreak[0])
7929        );
7930
7931    PyType_Ready(&EncodingMapType);
7932}
7933
7934/* Finalize the Unicode implementation */
7935
7936void
7937_PyUnicode_Fini(void)
7938{
7939    PyUnicodeObject *u;
7940    int i;
7941
7942    Py_XDECREF(unicode_empty);
7943    unicode_empty = NULL;
7944
7945    for (i = 0; i < 256; i++) {
7946	if (unicode_latin1[i]) {
7947	    Py_DECREF(unicode_latin1[i]);
7948	    unicode_latin1[i] = NULL;
7949	}
7950    }
7951
7952    for (u = unicode_freelist; u != NULL;) {
7953	PyUnicodeObject *v = u;
7954	u = *(PyUnicodeObject **)u;
7955	if (v->str)
7956	    PyMem_DEL(v->str);
7957	Py_XDECREF(v->defenc);
7958	PyObject_Del(v);
7959    }
7960    unicode_freelist = NULL;
7961    unicode_freelist_size = 0;
7962}
7963
7964#ifdef __cplusplus
7965}
7966#endif
7967
7968
7969/*
7970Local variables:
7971c-basic-offset: 4
7972indent-tabs-mode: nil
7973End:
7974*/
7975