unicodeobject.c revision ce30bc9f49dd77a9e6707eabaa1f3ceb8e6e458e
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5Unicode Integration Proposal (see file Misc/unicode.txt).
6
7Copyright (c) Corporation for National Research Initiatives.
8
9--------------------------------------------------------------------
10The original string type implementation is:
11
12    Copyright (c) 1999 by Secret Labs AB
13    Copyright (c) 1999 by Fredrik Lundh
14
15By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
38
39#include "Python.h"
40
41#include "unicodeobject.h"
42#include "ucnhash.h"
43
44#ifdef MS_WINDOWS
45#include <windows.h>
46#endif
47
/* Limit for the Unicode object free list: unicode_dealloc() only
   recycles an object while unicode_freelist_size is below this value. */

#define MAX_UNICODE_FREELIST_SIZE       1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian.  Exactly one of
   BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN is defined,
   selected by WORDS_BIGENDIAN. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
78
79/* --- Globals ------------------------------------------------------------
80
81   The globals are initialized by the _PyUnicode_Init() API and should
82   not be used before calling that API.
83
84*/
85
/* Free list for Unicode objects.  Entries are chained through the
   first pointer-sized slot of each recycled object (see
   unicode_dealloc() and _PyUnicode_New()); unicode_freelist_size
   counts the entries currently on the list. */
static PyUnicodeObject *unicode_freelist;
static int unicode_freelist_size;

/* The empty Unicode object is shared to improve performance. */
static PyUnicodeObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  Slots are filled lazily by PyUnicode_FromUnicode(). */
static PyUnicodeObject *unicode_latin1[256];

/* Default encoding to use and assume when NULL is passed as encoding
   parameter; it is initialized by _PyUnicode_Init().

   Always use the PyUnicode_SetDefaultEncoding() and
   PyUnicode_GetDefaultEncoding() APIs to access this global.

*/
static char unicode_default_encoding[100];
105
106Py_UNICODE
107PyUnicode_GetMax(void)
108{
109#ifdef Py_UNICODE_WIDE
110	return 0x10FFFF;
111#else
112	/* This is actually an illegal character, so it should
113	   not be passed to unichr. */
114	return 0xFFFF;
115#endif
116}
117
118/* --- Unicode Object ----------------------------------------------------- */
119
120static
121int unicode_resize(register PyUnicodeObject *unicode,
122                      int length)
123{
124    void *oldstr;
125
126    /* Shortcut if there's nothing much to do. */
127    if (unicode->length == length)
128	goto reset;
129
130    /* Resizing shared object (unicode_empty or single character
131       objects) in-place is not allowed. Use PyUnicode_Resize()
132       instead ! */
133    if (unicode == unicode_empty ||
134	(unicode->length == 1 &&
135	 unicode->str[0] < 256 &&
136	 unicode_latin1[unicode->str[0]] == unicode)) {
137        PyErr_SetString(PyExc_SystemError,
138                        "can't resize shared unicode objects");
139        return -1;
140    }
141
142    /* We allocate one more byte to make sure the string is
143       Ux0000 terminated -- XXX is this needed ? */
144    oldstr = unicode->str;
145    PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146    if (!unicode->str) {
147	unicode->str = oldstr;
148        PyErr_NoMemory();
149        return -1;
150    }
151    unicode->str[length] = 0;
152    unicode->length = length;
153
154 reset:
155    /* Reset the object caches */
156    if (unicode->defenc) {
157        Py_DECREF(unicode->defenc);
158        unicode->defenc = NULL;
159    }
160    unicode->hash = -1;
161
162    return 0;
163}
164
/* Create a new Unicode object able to hold `length` elements.

   One extra Py_UNICODE element (not byte) is allocated so that the
   buffer is always Ux0000 terminated -- XXX is this needed ?

   For length 0 the shared unicode_empty object is returned (with a new
   reference) once it exists.  Otherwise an object is taken from the
   free list when one is available, or freshly allocated.  The buffer
   contents other than the terminator are left uninitialized.

   Returns NULL with an exception set on failure.

   XXX This allocator could further be enhanced by assuring that the
       free list never reduces its size below 1.

*/

static
PyUnicodeObject *_PyUnicode_New(int length)
{
    register PyUnicodeObject *unicode;

    /* Optimization for empty strings */
    if (length == 0 && unicode_empty != NULL) {
        Py_INCREF(unicode_empty);
        return unicode_empty;
    }

    /* Unicode freelist & memory allocation */
    if (unicode_freelist) {
        /* Pop the head; the link to the next entry lives in the first
           pointer-sized slot of the recycled object. */
        unicode = unicode_freelist;
        unicode_freelist = *(PyUnicodeObject **)unicode;
        unicode_freelist_size--;
	if (unicode->str) {
	    /* Keep-Alive optimization: we only upsize the buffer,
	       never downsize it. */
	    if ((unicode->length < length) &&
		unicode_resize(unicode, length)) {
		/* NOTE(review): this path reaches onError before
		   PyObject_INIT has run for the recycled object --
		   confirm _Py_ForgetReference is safe here. */
		PyMem_DEL(unicode->str);
		goto onError;
	    }
	}
        else {
	    unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
        }
        PyObject_INIT(unicode, &PyUnicode_Type);
    }
    else {
        unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
        if (unicode == NULL)
            return NULL;
	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
    }

    /* Covers both PyMem_NEW calls above. */
    if (!unicode->str) {
	PyErr_NoMemory();
	goto onError;
    }
    unicode->str[length] = 0;
    unicode->length = length;
    unicode->hash = -1;
    unicode->defenc = NULL;
    return unicode;

 onError:
    _Py_ForgetReference((PyObject *)unicode);
    PyObject_Del(unicode);
    return NULL;
}
225
226static
227void unicode_dealloc(register PyUnicodeObject *unicode)
228{
229    if (PyUnicode_CheckExact(unicode) &&
230	unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
231        /* Keep-Alive optimization */
232	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
233	    PyMem_DEL(unicode->str);
234	    unicode->str = NULL;
235	    unicode->length = 0;
236	}
237	if (unicode->defenc) {
238	    Py_DECREF(unicode->defenc);
239	    unicode->defenc = NULL;
240	}
241	/* Add to free list */
242        *(PyUnicodeObject **)unicode = unicode_freelist;
243        unicode_freelist = unicode;
244        unicode_freelist_size++;
245    }
246    else {
247	PyMem_DEL(unicode->str);
248	Py_XDECREF(unicode->defenc);
249	unicode->ob_type->tp_free((PyObject *)unicode);
250    }
251}
252
253int PyUnicode_Resize(PyObject **unicode,
254		     int length)
255{
256    register PyUnicodeObject *v;
257
258    /* Argument checks */
259    if (unicode == NULL) {
260	PyErr_BadInternalCall();
261	return -1;
262    }
263    v = (PyUnicodeObject *)*unicode;
264    if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
265	PyErr_BadInternalCall();
266	return -1;
267    }
268
269    /* Resizing unicode_empty and single character objects is not
270       possible since these are being shared. We simply return a fresh
271       copy with the same Unicode content. */
272    if (v->length != length &&
273	(v == unicode_empty || v->length == 1)) {
274	PyUnicodeObject *w = _PyUnicode_New(length);
275	if (w == NULL)
276	    return -1;
277	Py_UNICODE_COPY(w->str, v->str,
278			length < v->length ? length : v->length);
279	*unicode = (PyObject *)w;
280	return 0;
281    }
282
283    /* Note that we don't have to modify *unicode for unshared Unicode
284       objects, since we can modify them in-place. */
285    return unicode_resize(v, length);
286}
287
/* Internal API for use in unicodeobject.c only !
   Convenience wrapper around PyUnicode_Resize() that casts the address
   of a (typically PyUnicodeObject *) variable to PyObject **. */
#define _PyUnicode_Resize(unicodevar, length) \
        PyUnicode_Resize(((PyObject **)(unicodevar)), length)
291
292PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
293				int size)
294{
295    PyUnicodeObject *unicode;
296
297    /* If the Unicode data is known at construction time, we can apply
298       some optimizations which share commonly used objects. */
299    if (u != NULL) {
300
301	/* Optimization for empty strings */
302	if (size == 0 && unicode_empty != NULL) {
303	    Py_INCREF(unicode_empty);
304	    return (PyObject *)unicode_empty;
305	}
306
307	/* Single character Unicode objects in the Latin-1 range are
308	   shared when using this constructor */
309	if (size == 1 && *u < 256) {
310	    unicode = unicode_latin1[*u];
311	    if (!unicode) {
312		unicode = _PyUnicode_New(1);
313		if (!unicode)
314		    return NULL;
315		unicode->str[0] = *u;
316		unicode_latin1[*u] = unicode;
317	    }
318	    Py_INCREF(unicode);
319	    return (PyObject *)unicode;
320	}
321    }
322
323    unicode = _PyUnicode_New(size);
324    if (!unicode)
325        return NULL;
326
327    /* Copy the Unicode data into the new object */
328    if (u != NULL)
329	Py_UNICODE_COPY(unicode->str, u, size);
330
331    return (PyObject *)unicode;
332}
333
334#ifdef HAVE_WCHAR_H
335
336PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
337				 int size)
338{
339    PyUnicodeObject *unicode;
340
341    if (w == NULL) {
342	PyErr_BadInternalCall();
343	return NULL;
344    }
345
346    unicode = _PyUnicode_New(size);
347    if (!unicode)
348        return NULL;
349
350    /* Copy the wchar_t data into the new object */
351#ifdef HAVE_USABLE_WCHAR_T
352    memcpy(unicode->str, w, size * sizeof(wchar_t));
353#else
354    {
355	register Py_UNICODE *u;
356	register int i;
357	u = PyUnicode_AS_UNICODE(unicode);
358	for (i = size; i >= 0; i--)
359	    *u++ = *w++;
360    }
361#endif
362
363    return (PyObject *)unicode;
364}
365
366int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
367			 register wchar_t *w,
368			 int size)
369{
370    if (unicode == NULL) {
371	PyErr_BadInternalCall();
372	return -1;
373    }
374    if (size > PyUnicode_GET_SIZE(unicode))
375	size = PyUnicode_GET_SIZE(unicode);
376#ifdef HAVE_USABLE_WCHAR_T
377    memcpy(w, unicode->str, size * sizeof(wchar_t));
378#else
379    {
380	register Py_UNICODE *u;
381	register int i;
382	u = PyUnicode_AS_UNICODE(unicode);
383	for (i = size; i >= 0; i--)
384	    *w++ = *u++;
385    }
386#endif
387
388    return size;
389}
390
391#endif
392
393PyObject *PyUnicode_FromOrdinal(int ordinal)
394{
395    Py_UNICODE s[2];
396
397#ifdef Py_UNICODE_WIDE
398    if (ordinal < 0 || ordinal > 0x10ffff) {
399	PyErr_SetString(PyExc_ValueError,
400			"unichr() arg not in range(0x110000) "
401			"(wide Python build)");
402	return NULL;
403    }
404#else
405    if (ordinal < 0 || ordinal > 0xffff) {
406	PyErr_SetString(PyExc_ValueError,
407			"unichr() arg not in range(0x10000) "
408			"(narrow Python build)");
409	return NULL;
410    }
411#endif
412
413    if (ordinal <= 0xffff) {
414	/* UCS-2 character */
415	s[0] = (Py_UNICODE) ordinal;
416	return PyUnicode_FromUnicode(s, 1);
417    }
418    else {
419#ifndef Py_UNICODE_WIDE
420	/* UCS-4 character.  store as two surrogate characters */
421	ordinal -= 0x10000L;
422	s[0] = 0xD800 + (Py_UNICODE) (ordinal >> 10);
423	s[1] = 0xDC00 + (Py_UNICODE) (ordinal & 0x03FF);
424	return PyUnicode_FromUnicode(s, 2);
425#else
426	s[0] = (Py_UNICODE)ordinal;
427	return PyUnicode_FromUnicode(s, 1);
428#endif
429    }
430}
431
432PyObject *PyUnicode_FromObject(register PyObject *obj)
433{
434    /* XXX Perhaps we should make this API an alias of
435           PyObject_Unicode() instead ?! */
436    if (PyUnicode_CheckExact(obj)) {
437	Py_INCREF(obj);
438	return obj;
439    }
440    if (PyUnicode_Check(obj)) {
441	/* For a Unicode subtype that's not a Unicode object,
442	   return a true Unicode object with the same data. */
443	return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
444				     PyUnicode_GET_SIZE(obj));
445    }
446    return PyUnicode_FromEncodedObject(obj, NULL, "strict");
447}
448
449PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
450				      const char *encoding,
451				      const char *errors)
452{
453    const char *s = NULL;
454    int len;
455    int owned = 0;
456    PyObject *v;
457
458    if (obj == NULL) {
459	PyErr_BadInternalCall();
460	return NULL;
461    }
462
463#if 0
464    /* For b/w compatibility we also accept Unicode objects provided
465       that no encodings is given and then redirect to
466       PyObject_Unicode() which then applies the additional logic for
467       Unicode subclasses.
468
469       NOTE: This API should really only be used for object which
470             represent *encoded* Unicode !
471
472    */
473	if (PyUnicode_Check(obj)) {
474	    if (encoding) {
475		PyErr_SetString(PyExc_TypeError,
476				"decoding Unicode is not supported");
477	    return NULL;
478	    }
479	return PyObject_Unicode(obj);
480	    }
481#else
482    if (PyUnicode_Check(obj)) {
483	PyErr_SetString(PyExc_TypeError,
484			"decoding Unicode is not supported");
485	return NULL;
486	}
487#endif
488
489    /* Coerce object */
490    if (PyString_Check(obj)) {
491	    s = PyString_AS_STRING(obj);
492	    len = PyString_GET_SIZE(obj);
493	    }
494    else if (PyObject_AsCharBuffer(obj, &s, &len)) {
495	/* Overwrite the error message with something more useful in
496	   case of a TypeError. */
497	if (PyErr_ExceptionMatches(PyExc_TypeError))
498	PyErr_Format(PyExc_TypeError,
499			 "coercing to Unicode: need string or buffer, "
500			 "%.80s found",
501		     obj->ob_type->tp_name);
502	goto onError;
503    }
504
505    /* Convert to Unicode */
506    if (len == 0) {
507	Py_INCREF(unicode_empty);
508	v = (PyObject *)unicode_empty;
509    }
510    else
511	v = PyUnicode_Decode(s, len, encoding, errors);
512
513    if (owned) {
514	Py_DECREF(obj);
515    }
516    return v;
517
518 onError:
519    if (owned) {
520	Py_DECREF(obj);
521    }
522    return NULL;
523}
524
525PyObject *PyUnicode_Decode(const char *s,
526			   int size,
527			   const char *encoding,
528			   const char *errors)
529{
530    PyObject *buffer = NULL, *unicode;
531
532    if (encoding == NULL)
533	encoding = PyUnicode_GetDefaultEncoding();
534
535    /* Shortcuts for common default encodings */
536    if (strcmp(encoding, "utf-8") == 0)
537        return PyUnicode_DecodeUTF8(s, size, errors);
538    else if (strcmp(encoding, "latin-1") == 0)
539        return PyUnicode_DecodeLatin1(s, size, errors);
540    else if (strcmp(encoding, "ascii") == 0)
541        return PyUnicode_DecodeASCII(s, size, errors);
542
543    /* Decode via the codec registry */
544    buffer = PyBuffer_FromMemory((void *)s, size);
545    if (buffer == NULL)
546        goto onError;
547    unicode = PyCodec_Decode(buffer, encoding, errors);
548    if (unicode == NULL)
549        goto onError;
550    if (!PyUnicode_Check(unicode)) {
551        PyErr_Format(PyExc_TypeError,
552                     "decoder did not return an unicode object (type=%.400s)",
553                     unicode->ob_type->tp_name);
554        Py_DECREF(unicode);
555        goto onError;
556    }
557    Py_DECREF(buffer);
558    return unicode;
559
560 onError:
561    Py_XDECREF(buffer);
562    return NULL;
563}
564
565PyObject *PyUnicode_Encode(const Py_UNICODE *s,
566			   int size,
567			   const char *encoding,
568			   const char *errors)
569{
570    PyObject *v, *unicode;
571
572    unicode = PyUnicode_FromUnicode(s, size);
573    if (unicode == NULL)
574	return NULL;
575    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
576    Py_DECREF(unicode);
577    return v;
578}
579
580PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
581                                    const char *encoding,
582                                    const char *errors)
583{
584    PyObject *v;
585
586    if (!PyUnicode_Check(unicode)) {
587        PyErr_BadArgument();
588        goto onError;
589    }
590
591    if (encoding == NULL)
592	encoding = PyUnicode_GetDefaultEncoding();
593
594    /* Shortcuts for common default encodings */
595    if (errors == NULL) {
596	if (strcmp(encoding, "utf-8") == 0)
597	    return PyUnicode_AsUTF8String(unicode);
598	else if (strcmp(encoding, "latin-1") == 0)
599	    return PyUnicode_AsLatin1String(unicode);
600	else if (strcmp(encoding, "ascii") == 0)
601	    return PyUnicode_AsASCIIString(unicode);
602    }
603
604    /* Encode via the codec registry */
605    v = PyCodec_Encode(unicode, encoding, errors);
606    if (v == NULL)
607        goto onError;
608    /* XXX Should we really enforce this ? */
609    if (!PyString_Check(v)) {
610        PyErr_Format(PyExc_TypeError,
611                     "encoder did not return a string object (type=%.400s)",
612                     v->ob_type->tp_name);
613        Py_DECREF(v);
614        goto onError;
615    }
616    return v;
617
618 onError:
619    return NULL;
620}
621
622PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
623					    const char *errors)
624{
625    PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
626
627    if (v)
628        return v;
629    v = PyUnicode_AsEncodedString(unicode, NULL, errors);
630    if (v && errors == NULL)
631        ((PyUnicodeObject *)unicode)->defenc = v;
632    return v;
633}
634
635Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
636{
637    if (!PyUnicode_Check(unicode)) {
638        PyErr_BadArgument();
639        goto onError;
640    }
641    return PyUnicode_AS_UNICODE(unicode);
642
643 onError:
644    return NULL;
645}
646
647int PyUnicode_GetSize(PyObject *unicode)
648{
649    if (!PyUnicode_Check(unicode)) {
650        PyErr_BadArgument();
651        goto onError;
652    }
653    return PyUnicode_GET_SIZE(unicode);
654
655 onError:
656    return -1;
657}
658
659const char *PyUnicode_GetDefaultEncoding(void)
660{
661    return unicode_default_encoding;
662}
663
664int PyUnicode_SetDefaultEncoding(const char *encoding)
665{
666    PyObject *v;
667
668    /* Make sure the encoding is valid. As side effect, this also
669       loads the encoding into the codec registry cache. */
670    v = _PyCodec_Lookup(encoding);
671    if (v == NULL)
672	goto onError;
673    Py_DECREF(v);
674    strncpy(unicode_default_encoding,
675	    encoding,
676	    sizeof(unicode_default_encoding));
677    return 0;
678
679 onError:
680    return -1;
681}
682
683/* error handling callback helper:
684   build arguments, call the callback and check the arguments,
685   if no exception occured, copy the replacement to the output
686   and adjust various state variables.
687   return 0 on success, -1 on error
688*/
689
690static
691int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
692                 const char *encoding, const char *reason,
693                 const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr,
694                 PyObject **output, int *outpos, Py_UNICODE **outptr)
695{
696    static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple";
697
698    PyObject *restuple = NULL;
699    PyObject *repunicode = NULL;
700    int outsize = PyUnicode_GET_SIZE(*output);
701    int requiredsize;
702    int newpos;
703    Py_UNICODE *repptr;
704    int repsize;
705    int res = -1;
706
707    if (*errorHandler == NULL) {
708	*errorHandler = PyCodec_LookupError(errors);
709	if (*errorHandler == NULL)
710	   goto onError;
711    }
712
713    if (*exceptionObject == NULL) {
714    	*exceptionObject = PyUnicodeDecodeError_Create(
715	    encoding, input, insize, *startinpos, *endinpos, reason);
716	if (*exceptionObject == NULL)
717	   goto onError;
718    }
719    else {
720	if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
721	    goto onError;
722	if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
723	    goto onError;
724	if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
725	    goto onError;
726    }
727
728    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
729    if (restuple == NULL)
730	goto onError;
731    if (!PyTuple_Check(restuple)) {
732	PyErr_Format(PyExc_TypeError, &argparse[4]);
733	goto onError;
734    }
735    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
736	goto onError;
737    if (newpos<0)
738	newpos = 0;
739    else if (newpos>insize)
740	newpos = insize;
741
742    /* need more space? (at least enough for what we
743       have+the replacement+the rest of the string (starting
744       at the new input position), so we won't have to check space
745       when there are no errors in the rest of the string) */
746    repptr = PyUnicode_AS_UNICODE(repunicode);
747    repsize = PyUnicode_GET_SIZE(repunicode);
748    requiredsize = *outpos + repsize + insize-newpos;
749    if (requiredsize > outsize) {
750	if (requiredsize<2*outsize)
751	    requiredsize = 2*outsize;
752	if (PyUnicode_Resize(output, requiredsize))
753	    goto onError;
754	*outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
755    }
756    *endinpos = newpos;
757    *inptr = input + newpos;
758    Py_UNICODE_COPY(*outptr, repptr, repsize);
759    *outptr += repsize;
760    *outpos += repsize;
761    /* we made it! */
762    res = 0;
763
764    onError:
765    Py_XDECREF(restuple);
766    return res;
767}
768
769/* --- UTF-7 Codec -------------------------------------------------------- */
770
771/* see RFC2152 for details */
772
/* Classification of the 128 ASCII characters for the UTF-7 codec,
   indexed by character code (16 entries per row).  Consulted through
   the SPECIAL() macro. */
static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
	   0 - not special
	   1 - special
	   2 - whitespace (optional)
	   3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,   /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x10 - 0x1f */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,   /* 0x20 - 0x2f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,   /* 0x30 - 0x3f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x40 - 0x4f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,   /* 0x50 - 0x5f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x60 - 0x6f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,   /* 0x70 - 0x7f */

};
791
/* SPECIAL(c, encodeO, encodeWS): true if character c cannot be
   written directly in UTF-7: it is above 127, is marked 1 in
   utf7_special[], is whitespace (2) while encodeWS is set, or belongs
   to Set O (3) while encodeO is set.  Note that c is evaluated more
   than once. */
#define SPECIAL(c, encodeO, encodeWS) \
	(((c)>127 || utf7_special[(c)] == 1) || \
	 (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 alphabet character for the low 6 bits of n. */
#define B64(n)  ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* B64CHAR(c): true if c is a character of the base64 alphabet.
   NOTE(review): c is handed to isalnum() unchanged and callers pass
   Py_UNICODE values -- confirm values outside the unsigned char range
   cannot reach this macro. */
#define B64CHAR(c) (isalnum(c) || (c) == '+' || (c) == '/')
/* UB64(c): inverse of B64() for a base64 character c:
   '+' -> 62, '/' -> 63, 'a'..'z' -> 26..51, 'A'..'Z' -> 0..25,
   anything else (the digits) -> c + 4, i.e. '0'..'9' -> 52..61. */
#define UB64(c)        ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                        (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* ENCODE(out, ch, bits): while at least 6 bits are pending in ch,
   write the top 6 pending bits as one base64 character to out and
   reduce bits accordingly.  Modifies out and bits. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* DECODE(out, ch, bits, surrogate): while at least 16 bits are pending
   in ch, extract the top 16 pending bits as one Py_UNICODE and store
   it through out, or flag an error for values in 0xDC00..0xDFFF.
   The expansion requires a variable `errmsg` and a label `utf7Error`
   to be in scope at the point of use.  Modifies out, bits and
   surrogate. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
		if (surrogate) { \
			/* We have already generated an error for the high surrogate
               so let's not bother seeing if the low surrogate is correct or not */\
			surrogate = 0; \
		} else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
			surrogate = 1; \
            errmsg = "code pairs are not supported"; \
	        goto utf7Error; \
		} else { \
				*out++ = outCh; \
		} \
    } \
826
827PyObject *PyUnicode_DecodeUTF7(const char *s,
828			       int size,
829			       const char *errors)
830{
831    const char *starts = s;
832    int startinpos;
833    int endinpos;
834    int outpos;
835    const char *e;
836    PyUnicodeObject *unicode;
837    Py_UNICODE *p;
838    const char *errmsg = "";
839    int inShift = 0;
840    unsigned int bitsleft = 0;
841    unsigned long charsleft = 0;
842    int surrogate = 0;
843    PyObject *errorHandler = NULL;
844    PyObject *exc = NULL;
845
846    unicode = _PyUnicode_New(size);
847    if (!unicode)
848        return NULL;
849    if (size == 0)
850        return (PyObject *)unicode;
851
852    p = unicode->str;
853    e = s + size;
854
855    while (s < e) {
856        Py_UNICODE ch;
857        restart:
858        ch = *s;
859
860        if (inShift) {
861            if ((ch == '-') || !B64CHAR(ch)) {
862                inShift = 0;
863                s++;
864
865                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
866                if (bitsleft >= 6) {
867                    /* The shift sequence has a partial character in it. If
868                       bitsleft < 6 then we could just classify it as padding
869                       but that is not the case here */
870
871                    errmsg = "partial character in shift sequence";
872                    goto utf7Error;
873                }
874                /* According to RFC2152 the remaining bits should be zero. We
875                   choose to signal an error/insert a replacement character
876                   here so indicate the potential of a misencoded character. */
877
878                /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
879                if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
880                    errmsg = "non-zero padding bits in shift sequence";
881                    goto utf7Error;
882                }
883
884                if (ch == '-') {
885                    if ((s < e) && (*(s) == '-')) {
886                        *p++ = '-';
887                        inShift = 1;
888                    }
889                } else if (SPECIAL(ch,0,0)) {
890                    errmsg = "unexpected special character";
891	                goto utf7Error;
892                } else  {
893                    *p++ = ch;
894                }
895            } else {
896                charsleft = (charsleft << 6) | UB64(ch);
897                bitsleft += 6;
898                s++;
899                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
900            }
901        }
902        else if ( ch == '+' ) {
903            startinpos = s-starts;
904            s++;
905            if (s < e && *s == '-') {
906                s++;
907                *p++ = '+';
908            } else
909            {
910                inShift = 1;
911                bitsleft = 0;
912            }
913        }
914        else if (SPECIAL(ch,0,0)) {
915            errmsg = "unexpected special character";
916            s++;
917	        goto utf7Error;
918        }
919        else {
920            *p++ = ch;
921            s++;
922        }
923        continue;
924    utf7Error:
925        outpos = p-PyUnicode_AS_UNICODE(unicode);
926        endinpos = s-starts;
927        if (unicode_decode_call_errorhandler(
928             errors, &errorHandler,
929             "utf7", errmsg,
930             starts, size, &startinpos, &endinpos, &exc, &s,
931             (PyObject **)&unicode, &outpos, &p))
932        goto onError;
933    }
934
935    if (inShift) {
936        outpos = p-PyUnicode_AS_UNICODE(unicode);
937        endinpos = size;
938        if (unicode_decode_call_errorhandler(
939             errors, &errorHandler,
940             "utf7", "unterminated shift sequence",
941             starts, size, &startinpos, &endinpos, &exc, &s,
942             (PyObject **)&unicode, &outpos, &p))
943            goto onError;
944        if (s < e)
945           goto restart;
946    }
947
948    if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)))
949        goto onError;
950
951    Py_XDECREF(errorHandler);
952    Py_XDECREF(exc);
953    return (PyObject *)unicode;
954
955onError:
956    Py_XDECREF(errorHandler);
957    Py_XDECREF(exc);
958    Py_DECREF(unicode);
959    return NULL;
960}
961
962
/* Encode the Py_UNICODE buffer `s` of `size` elements as UTF-7 and
   return the result as a string object.

   encodeSetO and encodeWhiteSpace are passed to SPECIAL() and select
   whether RFC2152 Set O characters and whitespace are base64 encoded
   instead of being written directly.  `errors` is not used by the
   code in this function.

   The output string is allocated with 5 * size bytes and shrunk to
   the number of bytes written at the end.

   Returns a new reference, or NULL if the output string could not be
   allocated. */
PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
                   int size,
                   int encodeSetO,
                   int encodeWhiteSpace,
                   const char *errors)
{
    PyObject *v;
    /* It might be possible to tighten this worst case */
    unsigned int cbAllocated = 5 * size;
    int inShift = 0;                /* inside a '+...' base64 sequence? */
    int i = 0;
    unsigned int bitsleft = 0;      /* number of pending bits in charsleft */
    unsigned long charsleft = 0;    /* bit accumulator for base64 output */
    char * out;
    char * start;

    if (size == 0)
		return PyString_FromStringAndSize(NULL, 0);

    v = PyString_FromStringAndSize(NULL, cbAllocated);
    if (v == NULL)
        return NULL;

    start = out = PyString_AS_STRING(v);
    for (;i < size; ++i) {
        Py_UNICODE ch = s[i];

        if (!inShift) {
			if (ch == '+') {
				/* A literal '+' is written as "+-" */
				*out++ = '+';
                *out++ = '-';
            } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
                /* Open a shift sequence and emit the first complete
                   6-bit groups of this character */
                charsleft = ch;
                bitsleft = 16;
                *out++ = '+';
				/* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
                inShift = bitsleft > 0;
			} else {
				*out++ = (char) ch;
			}
		} else {
            if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
                /* Flush the pending bits, padded to a full base64
                   character, and leave the shift sequence */
                *out++ = B64(charsleft << (6-bitsleft));
                charsleft = 0;
                bitsleft = 0;
                /* Characters not in the BASE64 set implicitly unshift the sequence
                   so no '-' is required, except if the character is itself a '-' */
                if (B64CHAR(ch) || ch == '-') {
                    *out++ = '-';
                }
                inShift = 0;
                *out++ = (char) ch;
            } else {
                bitsleft += 16;
                charsleft = (charsleft << 16) | ch;
                /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);

                /* If the next character is special then we don't need to terminate
                   the shift sequence. If the next character is not a BASE64 character
                   or '-' then the shift sequence will be terminated implicitly and we
                   don't have to insert a '-'. */

                if (bitsleft == 0) {
                    if (i + 1 < size) {
                        Py_UNICODE ch2 = s[i+1];

                        if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {

                        } else if (B64CHAR(ch2) || ch2 == '-') {
                            *out++ = '-';
                            inShift = 0;
                        } else {
                            inShift = 0;
                        }

                    }
                    else {
                        *out++ = '-';
                        inShift = 0;
                    }
                }
            }
        }
	}
    /* Flush bits still pending at the end of the input */
    if (bitsleft) {
        *out++= B64(charsleft << (6-bitsleft) );
        *out++ = '-';
    }

    /* NOTE(review): the return value of _PyString_Resize() is not
       checked here -- confirm that v is left in a state that is safe
       to return when resizing fails. */
    _PyString_Resize(&v, out - start);
    return v;
}
1055
1056#undef SPECIAL
1057#undef B64
1058#undef B64CHAR
1059#undef UB64
1060#undef ENCODE
1061#undef DECODE
1062
1063/* --- UTF-8 Codec -------------------------------------------------------- */
1064
/* Lookup table indexed by the first byte of a UTF-8 sequence; the entry is
   the total number of bytes in that sequence (see RFC 2279 for details):

     0x00-0x7F  1   (plain ASCII)
     0x80-0xBF  0   (illegal as a first byte)
     0xC0-0xDF  2
     0xE0-0xEF  3
     0xF0-0xF7  4
     0xF8-0xFB  5
     0xFC-0xFD  6
     0xFE-0xFF  0   (illegal as a first byte)

   The table is only ever read, so it is declared const. */
static const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1086
1087PyObject *PyUnicode_DecodeUTF8(const char *s,
1088			       int size,
1089			       const char *errors)
1090{
1091    const char *starts = s;
1092    int n;
1093    int startinpos;
1094    int endinpos;
1095    int outpos;
1096    const char *e;
1097    PyUnicodeObject *unicode;
1098    Py_UNICODE *p;
1099    const char *errmsg = "";
1100    PyObject *errorHandler = NULL;
1101    PyObject *exc = NULL;
1102
1103    /* Note: size will always be longer than the resulting Unicode
1104       character count */
1105    unicode = _PyUnicode_New(size);
1106    if (!unicode)
1107        return NULL;
1108    if (size == 0)
1109        return (PyObject *)unicode;
1110
1111    /* Unpack UTF-8 encoded data */
1112    p = unicode->str;
1113    e = s + size;
1114
1115    while (s < e) {
1116        Py_UCS4 ch = (unsigned char)*s;
1117
1118        if (ch < 0x80) {
1119            *p++ = (Py_UNICODE)ch;
1120            s++;
1121            continue;
1122        }
1123
1124        n = utf8_code_length[ch];
1125
1126        if (s + n > e) {
1127	    errmsg = "unexpected end of data";
1128	    startinpos = s-starts;
1129	    endinpos = size;
1130	    goto utf8Error;
1131	}
1132
1133        switch (n) {
1134
1135        case 0:
1136            errmsg = "unexpected code byte";
1137	    startinpos = s-starts;
1138	    endinpos = startinpos+1;
1139	    goto utf8Error;
1140
1141        case 1:
1142            errmsg = "internal error";
1143	    startinpos = s-starts;
1144	    endinpos = startinpos+1;
1145	    goto utf8Error;
1146
1147        case 2:
1148            if ((s[1] & 0xc0) != 0x80) {
1149                errmsg = "invalid data";
1150		startinpos = s-starts;
1151		endinpos = startinpos+2;
1152		goto utf8Error;
1153	    }
1154            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1155            if (ch < 0x80) {
1156		startinpos = s-starts;
1157		endinpos = startinpos+2;
1158                errmsg = "illegal encoding";
1159		goto utf8Error;
1160	    }
1161	    else
1162		*p++ = (Py_UNICODE)ch;
1163            break;
1164
1165        case 3:
1166            if ((s[1] & 0xc0) != 0x80 ||
1167                (s[2] & 0xc0) != 0x80) {
1168                errmsg = "invalid data";
1169		startinpos = s-starts;
1170		endinpos = startinpos+3;
1171		goto utf8Error;
1172	    }
1173            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
1174            if (ch < 0x0800) {
1175		/* Note: UTF-8 encodings of surrogates are considered
1176		   legal UTF-8 sequences;
1177
1178		   XXX For wide builds (UCS-4) we should probably try
1179		       to recombine the surrogates into a single code
1180		       unit.
1181		*/
1182                errmsg = "illegal encoding";
1183		startinpos = s-starts;
1184		endinpos = startinpos+3;
1185		goto utf8Error;
1186	    }
1187	    else
1188		*p++ = (Py_UNICODE)ch;
1189            break;
1190
1191        case 4:
1192            if ((s[1] & 0xc0) != 0x80 ||
1193                (s[2] & 0xc0) != 0x80 ||
1194                (s[3] & 0xc0) != 0x80) {
1195                errmsg = "invalid data";
1196		startinpos = s-starts;
1197		endinpos = startinpos+4;
1198		goto utf8Error;
1199	    }
1200            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1201                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1202            /* validate and convert to UTF-16 */
1203            if ((ch < 0x10000)        /* minimum value allowed for 4
1204					 byte encoding */
1205                || (ch > 0x10ffff))   /* maximum value allowed for
1206					 UTF-16 */
1207	    {
1208                errmsg = "illegal encoding";
1209		startinpos = s-starts;
1210		endinpos = startinpos+4;
1211		goto utf8Error;
1212	    }
1213#ifdef Py_UNICODE_WIDE
1214	    *p++ = (Py_UNICODE)ch;
1215#else
1216            /*  compute and append the two surrogates: */
1217
1218            /*  translate from 10000..10FFFF to 0..FFFF */
1219            ch -= 0x10000;
1220
1221            /*  high surrogate = top 10 bits added to D800 */
1222            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1223
1224            /*  low surrogate = bottom 10 bits added to DC00 */
1225            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
1226#endif
1227            break;
1228
1229        default:
1230            /* Other sizes are only needed for UCS-4 */
1231            errmsg = "unsupported Unicode code range";
1232	    startinpos = s-starts;
1233	    endinpos = startinpos+n;
1234	    goto utf8Error;
1235        }
1236        s += n;
1237	continue;
1238
1239    utf8Error:
1240    outpos = p-PyUnicode_AS_UNICODE(unicode);
1241    if (unicode_decode_call_errorhandler(
1242	     errors, &errorHandler,
1243	     "utf8", errmsg,
1244	     starts, size, &startinpos, &endinpos, &exc, &s,
1245	     (PyObject **)&unicode, &outpos, &p))
1246	goto onError;
1247    }
1248
1249    /* Adjust length */
1250    if (_PyUnicode_Resize(&unicode, p - unicode->str))
1251        goto onError;
1252
1253    Py_XDECREF(errorHandler);
1254    Py_XDECREF(exc);
1255    return (PyObject *)unicode;
1256
1257onError:
1258    Py_XDECREF(errorHandler);
1259    Py_XDECREF(exc);
1260    Py_DECREF(unicode);
1261    return NULL;
1262}
1263
1264/* Allocation strategy:  if the string is short, convert into a stack buffer
1265   and allocate exactly as much space needed at the end.  Else allocate the
1266   maximum possible needed (4 result bytes per Unicode character), and return
1267   the excess memory at the end.
1268*/
1269PyObject *
1270PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1271		     int size,
1272		     const char *errors)
1273{
1274#define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
1275
1276    int i;              /* index into s of next input byte */
1277    PyObject *v;        /* result string object */
1278    char *p;            /* next free byte in output buffer */
1279    int nallocated;     /* number of result bytes allocated */
1280    int nneeded;        /* number of result bytes needed */
1281    char stackbuf[MAX_SHORT_UNICHARS * 4];
1282
1283    assert(s != NULL);
1284    assert(size >= 0);
1285
1286    if (size <= MAX_SHORT_UNICHARS) {
1287        /* Write into the stack buffer; nallocated can't overflow.
1288         * At the end, we'll allocate exactly as much heap space as it
1289         * turns out we need.
1290         */
1291        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1292        v = NULL;   /* will allocate after we're done */
1293        p = stackbuf;
1294    }
1295    else {
1296        /* Overallocate on the heap, and give the excess back at the end. */
1297        nallocated = size * 4;
1298        if (nallocated / 4 != size)  /* overflow! */
1299            return PyErr_NoMemory();
1300        v = PyString_FromStringAndSize(NULL, nallocated);
1301        if (v == NULL)
1302            return NULL;
1303        p = PyString_AS_STRING(v);
1304    }
1305
1306    for (i = 0; i < size;) {
1307        Py_UCS4 ch = s[i++];
1308
1309        if (ch < 0x80)
1310            /* Encode ASCII */
1311            *p++ = (char) ch;
1312
1313        else if (ch < 0x0800) {
1314            /* Encode Latin-1 */
1315            *p++ = (char)(0xc0 | (ch >> 6));
1316            *p++ = (char)(0x80 | (ch & 0x3f));
1317        }
1318        else {
1319            /* Encode UCS2 Unicode ordinals */
1320            if (ch < 0x10000) {
1321                /* Special case: check for high surrogate */
1322                if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1323                    Py_UCS4 ch2 = s[i];
1324                    /* Check for low surrogate and combine the two to
1325                       form a UCS4 value */
1326                    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1327                        ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
1328                        i++;
1329                        goto encodeUCS4;
1330                    }
1331                    /* Fall through: handles isolated high surrogates */
1332                }
1333                *p++ = (char)(0xe0 | (ch >> 12));
1334                *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1335                *p++ = (char)(0x80 | (ch & 0x3f));
1336                continue;
1337    	    }
1338encodeUCS4:
1339            /* Encode UCS4 Unicode ordinals */
1340            *p++ = (char)(0xf0 | (ch >> 18));
1341            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1342            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1343            *p++ = (char)(0x80 | (ch & 0x3f));
1344        }
1345    }
1346
1347    if (v == NULL) {
1348        /* This was stack allocated. */
1349        nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1350        assert(nneeded <= nallocated);
1351        v = PyString_FromStringAndSize(stackbuf, nneeded);
1352    }
1353    else {
1354    	/* Cut back to size actually needed. */
1355        nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1356        assert(nneeded <= nallocated);
1357        _PyString_Resize(&v, nneeded);
1358    }
1359    return v;
1360
1361#undef MAX_SHORT_UNICHARS
1362}
1363
1364PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1365{
1366    if (!PyUnicode_Check(unicode)) {
1367        PyErr_BadArgument();
1368        return NULL;
1369    }
1370    return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1371				PyUnicode_GET_SIZE(unicode),
1372				NULL);
1373}
1374
1375/* --- UTF-16 Codec ------------------------------------------------------- */
1376
1377PyObject *
1378PyUnicode_DecodeUTF16(const char *s,
1379		      int size,
1380		      const char *errors,
1381		      int *byteorder)
1382{
1383    const char *starts = s;
1384    int startinpos;
1385    int endinpos;
1386    int outpos;
1387    PyUnicodeObject *unicode;
1388    Py_UNICODE *p;
1389    const unsigned char *q, *e;
1390    int bo = 0;       /* assume native ordering by default */
1391    const char *errmsg = "";
1392    /* Offsets from q for retrieving byte pairs in the right order. */
1393#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1394    int ihi = 1, ilo = 0;
1395#else
1396    int ihi = 0, ilo = 1;
1397#endif
1398    PyObject *errorHandler = NULL;
1399    PyObject *exc = NULL;
1400
1401    /* Note: size will always be longer than the resulting Unicode
1402       character count */
1403    unicode = _PyUnicode_New(size);
1404    if (!unicode)
1405        return NULL;
1406    if (size == 0)
1407        return (PyObject *)unicode;
1408
1409    /* Unpack UTF-16 encoded data */
1410    p = unicode->str;
1411    q = (unsigned char *)s;
1412    e = q + size;
1413
1414    if (byteorder)
1415        bo = *byteorder;
1416
1417    /* Check for BOM marks (U+FEFF) in the input and adjust current
1418       byte order setting accordingly. In native mode, the leading BOM
1419       mark is skipped, in all other modes, it is copied to the output
1420       stream as-is (giving a ZWNBSP character). */
1421    if (bo == 0) {
1422        const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
1423#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1424	if (bom == 0xFEFF) {
1425	    q += 2;
1426	    bo = -1;
1427	}
1428        else if (bom == 0xFFFE) {
1429	    q += 2;
1430	    bo = 1;
1431	}
1432#else
1433	if (bom == 0xFEFF) {
1434	    q += 2;
1435	    bo = 1;
1436	}
1437        else if (bom == 0xFFFE) {
1438	    q += 2;
1439	    bo = -1;
1440	}
1441#endif
1442    }
1443
1444    if (bo == -1) {
1445        /* force LE */
1446        ihi = 1;
1447        ilo = 0;
1448    }
1449    else if (bo == 1) {
1450        /* force BE */
1451        ihi = 0;
1452        ilo = 1;
1453    }
1454
1455    while (q < e) {
1456	Py_UNICODE ch;
1457	/* remaing bytes at the end? (size should be even) */
1458	if (e-q<2) {
1459	    errmsg = "truncated data";
1460	    startinpos = ((const char *)q)-starts;
1461	    endinpos = ((const char *)e)-starts;
1462	    goto utf16Error;
1463	    /* The remaining input chars are ignored if the callback
1464	       chooses to skip the input */
1465	}
1466	ch = (q[ihi] << 8) | q[ilo];
1467
1468	q += 2;
1469
1470	if (ch < 0xD800 || ch > 0xDFFF) {
1471	    *p++ = ch;
1472	    continue;
1473	}
1474
1475	/* UTF-16 code pair: */
1476	if (q >= e) {
1477	    errmsg = "unexpected end of data";
1478	    startinpos = (((const char *)q)-2)-starts;
1479	    endinpos = ((const char *)e)-starts;
1480	    goto utf16Error;
1481	}
1482	if (0xD800 <= ch && ch <= 0xDBFF) {
1483	    Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1484	    q += 2;
1485	    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1486#ifndef Py_UNICODE_WIDE
1487		*p++ = ch;
1488		*p++ = ch2;
1489#else
1490		*p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
1491#endif
1492		continue;
1493	    }
1494	    else {
1495                errmsg = "illegal UTF-16 surrogate";
1496		startinpos = (((const char *)q)-4)-starts;
1497		endinpos = startinpos+2;
1498		goto utf16Error;
1499	    }
1500
1501	}
1502	errmsg = "illegal encoding";
1503	startinpos = (((const char *)q)-2)-starts;
1504	endinpos = startinpos+2;
1505	/* Fall through to report the error */
1506
1507    utf16Error:
1508	outpos = p-PyUnicode_AS_UNICODE(unicode);
1509	if (unicode_decode_call_errorhandler(
1510	         errors, &errorHandler,
1511	         "utf16", errmsg,
1512	         starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1513	         (PyObject **)&unicode, &outpos, &p))
1514	    goto onError;
1515    }
1516
1517    if (byteorder)
1518        *byteorder = bo;
1519
1520    /* Adjust length */
1521    if (_PyUnicode_Resize(&unicode, p - unicode->str))
1522        goto onError;
1523
1524    Py_XDECREF(errorHandler);
1525    Py_XDECREF(exc);
1526    return (PyObject *)unicode;
1527
1528onError:
1529    Py_DECREF(unicode);
1530    Py_XDECREF(errorHandler);
1531    Py_XDECREF(exc);
1532    return NULL;
1533}
1534
1535PyObject *
1536PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1537		      int size,
1538		      const char *errors,
1539		      int byteorder)
1540{
1541    PyObject *v;
1542    unsigned char *p;
1543    int i, pairs;
1544    /* Offsets from p for storing byte pairs in the right order. */
1545#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1546    int ihi = 1, ilo = 0;
1547#else
1548    int ihi = 0, ilo = 1;
1549#endif
1550
1551#define STORECHAR(CH)                   \
1552    do {                                \
1553        p[ihi] = ((CH) >> 8) & 0xff;    \
1554        p[ilo] = (CH) & 0xff;           \
1555        p += 2;                         \
1556    } while(0)
1557
1558    for (i = pairs = 0; i < size; i++)
1559	if (s[i] >= 0x10000)
1560	    pairs++;
1561    v = PyString_FromStringAndSize(NULL,
1562		  2 * (size + pairs + (byteorder == 0)));
1563    if (v == NULL)
1564        return NULL;
1565
1566    p = (unsigned char *)PyString_AS_STRING(v);
1567    if (byteorder == 0)
1568	STORECHAR(0xFEFF);
1569    if (size == 0)
1570        return v;
1571
1572    if (byteorder == -1) {
1573        /* force LE */
1574        ihi = 1;
1575        ilo = 0;
1576    }
1577    else if (byteorder == 1) {
1578        /* force BE */
1579        ihi = 0;
1580        ilo = 1;
1581    }
1582
1583    while (size-- > 0) {
1584	Py_UNICODE ch = *s++;
1585	Py_UNICODE ch2 = 0;
1586	if (ch >= 0x10000) {
1587	    ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1588	    ch  = 0xD800 | ((ch-0x10000) >> 10);
1589	}
1590        STORECHAR(ch);
1591        if (ch2)
1592            STORECHAR(ch2);
1593    }
1594    return v;
1595#undef STORECHAR
1596}
1597
1598PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1599{
1600    if (!PyUnicode_Check(unicode)) {
1601        PyErr_BadArgument();
1602        return NULL;
1603    }
1604    return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1605				 PyUnicode_GET_SIZE(unicode),
1606				 NULL,
1607				 0);
1608}
1609
1610/* --- Unicode Escape Codec ----------------------------------------------- */
1611
1612static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
1613
1614PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1615					int size,
1616					const char *errors)
1617{
1618    const char *starts = s;
1619    int startinpos;
1620    int endinpos;
1621    int outpos;
1622    int i;
1623    PyUnicodeObject *v;
1624    Py_UNICODE *p;
1625    const char *end;
1626    char* message;
1627    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1628    PyObject *errorHandler = NULL;
1629    PyObject *exc = NULL;
1630
1631    /* Escaped strings will always be longer than the resulting
1632       Unicode string, so we start with size here and then reduce the
1633       length after conversion to the true value.
1634       (but if the error callback returns a long replacement string
1635       we'll have to allocate more space) */
1636    v = _PyUnicode_New(size);
1637    if (v == NULL)
1638        goto onError;
1639    if (size == 0)
1640        return (PyObject *)v;
1641
1642    p = PyUnicode_AS_UNICODE(v);
1643    end = s + size;
1644
1645    while (s < end) {
1646        unsigned char c;
1647        Py_UNICODE x;
1648        int digits;
1649
1650        /* Non-escape characters are interpreted as Unicode ordinals */
1651        if (*s != '\\') {
1652            *p++ = (unsigned char) *s++;
1653            continue;
1654        }
1655
1656        startinpos = s-starts;
1657        /* \ - Escapes */
1658        s++;
1659        switch (*s++) {
1660
1661        /* \x escapes */
1662        case '\n': break;
1663        case '\\': *p++ = '\\'; break;
1664        case '\'': *p++ = '\''; break;
1665        case '\"': *p++ = '\"'; break;
1666        case 'b': *p++ = '\b'; break;
1667        case 'f': *p++ = '\014'; break; /* FF */
1668        case 't': *p++ = '\t'; break;
1669        case 'n': *p++ = '\n'; break;
1670        case 'r': *p++ = '\r'; break;
1671        case 'v': *p++ = '\013'; break; /* VT */
1672        case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1673
1674        /* \OOO (octal) escapes */
1675        case '0': case '1': case '2': case '3':
1676        case '4': case '5': case '6': case '7':
1677            x = s[-1] - '0';
1678            if ('0' <= *s && *s <= '7') {
1679                x = (x<<3) + *s++ - '0';
1680                if ('0' <= *s && *s <= '7')
1681                    x = (x<<3) + *s++ - '0';
1682            }
1683            *p++ = x;
1684            break;
1685
1686        /* hex escapes */
1687        /* \xXX */
1688        case 'x':
1689            digits = 2;
1690            message = "truncated \\xXX escape";
1691            goto hexescape;
1692
1693        /* \uXXXX */
1694        case 'u':
1695            digits = 4;
1696            message = "truncated \\uXXXX escape";
1697            goto hexescape;
1698
1699        /* \UXXXXXXXX */
1700        case 'U':
1701            digits = 8;
1702            message = "truncated \\UXXXXXXXX escape";
1703        hexescape:
1704            chr = 0;
1705            outpos = p-PyUnicode_AS_UNICODE(v);
1706            if (s+digits>end) {
1707                endinpos = size;
1708                if (unicode_decode_call_errorhandler(
1709                    errors, &errorHandler,
1710                    "unicodeescape", "end of string in escape sequence",
1711                    starts, size, &startinpos, &endinpos, &exc, &s,
1712                    (PyObject **)&v, &outpos, &p))
1713                    goto onError;
1714                goto nextByte;
1715            }
1716            for (i = 0; i < digits; ++i) {
1717                c = (unsigned char) s[i];
1718                if (!isxdigit(c)) {
1719                    endinpos = (s+i+1)-starts;
1720                    if (unicode_decode_call_errorhandler(
1721                        errors, &errorHandler,
1722                        "unicodeescape", message,
1723                        starts, size, &startinpos, &endinpos, &exc, &s,
1724                        (PyObject **)&v, &outpos, &p))
1725                        goto onError;
1726                    goto nextByte;
1727                }
1728                chr = (chr<<4) & ~0xF;
1729                if (c >= '0' && c <= '9')
1730                    chr += c - '0';
1731                else if (c >= 'a' && c <= 'f')
1732                    chr += 10 + c - 'a';
1733                else
1734                    chr += 10 + c - 'A';
1735            }
1736            s += i;
1737            if (chr == 0xffffffff)
1738                /* _decoding_error will have already written into the
1739                   target buffer. */
1740                break;
1741        store:
1742            /* when we get here, chr is a 32-bit unicode character */
1743            if (chr <= 0xffff)
1744                /* UCS-2 character */
1745                *p++ = (Py_UNICODE) chr;
1746            else if (chr <= 0x10ffff) {
1747                /* UCS-4 character. Either store directly, or as
1748                   surrogate pair. */
1749#ifdef Py_UNICODE_WIDE
1750                *p++ = chr;
1751#else
1752                chr -= 0x10000L;
1753                *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1754                *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
1755#endif
1756            } else {
1757                endinpos = s-starts;
1758                outpos = p-PyUnicode_AS_UNICODE(v);
1759                if (unicode_decode_call_errorhandler(
1760                    errors, &errorHandler,
1761                    "unicodeescape", "illegal Unicode character",
1762                    starts, size, &startinpos, &endinpos, &exc, &s,
1763                    (PyObject **)&v, &outpos, &p))
1764                    goto onError;
1765            }
1766            break;
1767
1768        /* \N{name} */
1769        case 'N':
1770            message = "malformed \\N character escape";
1771            if (ucnhash_CAPI == NULL) {
1772                /* load the unicode data module */
1773                PyObject *m, *v;
1774                m = PyImport_ImportModule("unicodedata");
1775                if (m == NULL)
1776                    goto ucnhashError;
1777                v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1778                Py_DECREF(m);
1779                if (v == NULL)
1780                    goto ucnhashError;
1781                ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1782                Py_DECREF(v);
1783                if (ucnhash_CAPI == NULL)
1784                    goto ucnhashError;
1785            }
1786            if (*s == '{') {
1787                const char *start = s+1;
1788                /* look for the closing brace */
1789                while (*s != '}' && s < end)
1790                    s++;
1791                if (s > start && s < end && *s == '}') {
1792                    /* found a name.  look it up in the unicode database */
1793                    message = "unknown Unicode character name";
1794                    s++;
1795                    if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1796                        goto store;
1797                }
1798            }
1799            endinpos = s-starts;
1800            outpos = p-PyUnicode_AS_UNICODE(v);
1801            if (unicode_decode_call_errorhandler(
1802                errors, &errorHandler,
1803                "unicodeescape", message,
1804                starts, size, &startinpos, &endinpos, &exc, &s,
1805                (PyObject **)&v, &outpos, &p))
1806                goto onError;
1807            break;
1808
1809        default:
1810            if (s > end) {
1811                message = "\\ at end of string";
1812                s--;
1813                endinpos = s-starts;
1814                outpos = p-PyUnicode_AS_UNICODE(v);
1815                if (unicode_decode_call_errorhandler(
1816                    errors, &errorHandler,
1817                    "unicodeescape", message,
1818                    starts, size, &startinpos, &endinpos, &exc, &s,
1819                    (PyObject **)&v, &outpos, &p))
1820                    goto onError;
1821            }
1822            else {
1823                *p++ = '\\';
1824                *p++ = (unsigned char)s[-1];
1825            }
1826            break;
1827        }
1828        nextByte:
1829        ;
1830    }
1831    if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
1832        goto onError;
1833    return (PyObject *)v;
1834
1835ucnhashError:
1836    PyErr_SetString(
1837        PyExc_UnicodeError,
1838        "\\N escapes not supported (can't load unicodedata module)"
1839        );
1840    Py_XDECREF(errorHandler);
1841    Py_XDECREF(exc);
1842    return NULL;
1843
1844onError:
1845    Py_XDECREF(v);
1846    Py_XDECREF(errorHandler);
1847    Py_XDECREF(exc);
1848    return NULL;
1849}
1850
1851/* Return a Unicode-Escape string version of the Unicode object.
1852
1853   If quotes is true, the string is enclosed in u"" or u'' quotes as
1854   appropriate.
1855
1856*/
1857
1858static const Py_UNICODE *findchar(const Py_UNICODE *s,
1859				  int size,
1860				  Py_UNICODE ch);
1861
1862static
1863PyObject *unicodeescape_string(const Py_UNICODE *s,
1864                               int size,
1865                               int quotes)
1866{
1867    PyObject *repr;
1868    char *p;
1869
1870    static const char *hexdigit = "0123456789abcdef";
1871
1872    repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1873    if (repr == NULL)
1874        return NULL;
1875
1876    p = PyString_AS_STRING(repr);
1877
1878    if (quotes) {
1879        *p++ = 'u';
1880        *p++ = (findchar(s, size, '\'') &&
1881                !findchar(s, size, '"')) ? '"' : '\'';
1882    }
1883    while (size-- > 0) {
1884        Py_UNICODE ch = *s++;
1885
1886        /* Escape quotes */
1887        if (quotes &&
1888	    (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
1889            *p++ = '\\';
1890            *p++ = (char) ch;
1891	    continue;
1892        }
1893
1894#ifdef Py_UNICODE_WIDE
1895        /* Map 21-bit characters to '\U00xxxxxx' */
1896        else if (ch >= 0x10000) {
1897	    int offset = p - PyString_AS_STRING(repr);
1898
1899	    /* Resize the string if necessary */
1900	    if (offset + 12 > PyString_GET_SIZE(repr)) {
1901		if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
1902		    return NULL;
1903		p = PyString_AS_STRING(repr) + offset;
1904	    }
1905
1906            *p++ = '\\';
1907            *p++ = 'U';
1908            *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1909            *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1910            *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1911            *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1912            *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1913            *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1914            *p++ = hexdigit[(ch >> 4) & 0x0000000F];
1915            *p++ = hexdigit[ch & 0x0000000F];
1916	    continue;
1917        }
1918#endif
1919	/* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1920	else if (ch >= 0xD800 && ch < 0xDC00) {
1921	    Py_UNICODE ch2;
1922	    Py_UCS4 ucs;
1923
1924	    ch2 = *s++;
1925	    size--;
1926	    if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1927		ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1928		*p++ = '\\';
1929		*p++ = 'U';
1930		*p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1931		*p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1932		*p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1933		*p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1934		*p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1935		*p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1936		*p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1937		*p++ = hexdigit[ucs & 0x0000000F];
1938		continue;
1939	    }
1940	    /* Fall through: isolated surrogates are copied as-is */
1941	    s--;
1942	    size++;
1943	}
1944
1945        /* Map 16-bit characters to '\uxxxx' */
1946        if (ch >= 256) {
1947            *p++ = '\\';
1948            *p++ = 'u';
1949            *p++ = hexdigit[(ch >> 12) & 0x000F];
1950            *p++ = hexdigit[(ch >> 8) & 0x000F];
1951            *p++ = hexdigit[(ch >> 4) & 0x000F];
1952            *p++ = hexdigit[ch & 0x000F];
1953        }
1954
1955        /* Map special whitespace to '\t', \n', '\r' */
1956        else if (ch == '\t') {
1957            *p++ = '\\';
1958            *p++ = 't';
1959        }
1960        else if (ch == '\n') {
1961            *p++ = '\\';
1962            *p++ = 'n';
1963        }
1964        else if (ch == '\r') {
1965            *p++ = '\\';
1966            *p++ = 'r';
1967        }
1968
1969        /* Map non-printable US ASCII to '\xhh' */
1970        else if (ch < ' ' || ch >= 0x7F) {
1971            *p++ = '\\';
1972            *p++ = 'x';
1973            *p++ = hexdigit[(ch >> 4) & 0x000F];
1974            *p++ = hexdigit[ch & 0x000F];
1975        }
1976
1977        /* Copy everything else as-is */
1978        else
1979            *p++ = (char) ch;
1980    }
1981    if (quotes)
1982        *p++ = PyString_AS_STRING(repr)[1];
1983
1984    *p = '\0';
1985    _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
1986    return repr;
1987}
1988
1989PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1990					int size)
1991{
1992    return unicodeescape_string(s, size, 0);
1993}
1994
1995PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1996{
1997    if (!PyUnicode_Check(unicode)) {
1998        PyErr_BadArgument();
1999        return NULL;
2000    }
2001    return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2002					 PyUnicode_GET_SIZE(unicode));
2003}
2004
2005/* --- Raw Unicode Escape Codec ------------------------------------------- */
2006
2007PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2008					   int size,
2009					   const char *errors)
2010{
2011    const char *starts = s;
2012    int startinpos;
2013    int endinpos;
2014    int outpos;
2015    PyUnicodeObject *v;
2016    Py_UNICODE *p;
2017    const char *end;
2018    const char *bs;
2019    PyObject *errorHandler = NULL;
2020    PyObject *exc = NULL;
2021
2022    /* Escaped strings will always be longer than the resulting
2023       Unicode string, so we start with size here and then reduce the
2024       length after conversion to the true value. (But decoding error
2025       handler might have to resize the string) */
2026    v = _PyUnicode_New(size);
2027    if (v == NULL)
2028	goto onError;
2029    if (size == 0)
2030	return (PyObject *)v;
2031    p = PyUnicode_AS_UNICODE(v);
2032    end = s + size;
2033    while (s < end) {
2034	unsigned char c;
2035	Py_UCS4 x;
2036	int i;
2037
2038	/* Non-escape characters are interpreted as Unicode ordinals */
2039	if (*s != '\\') {
2040	    *p++ = (unsigned char)*s++;
2041	    continue;
2042	}
2043	startinpos = s-starts;
2044
2045	/* \u-escapes are only interpreted iff the number of leading
2046	   backslashes if odd */
2047	bs = s;
2048	for (;s < end;) {
2049	    if (*s != '\\')
2050		break;
2051	    *p++ = (unsigned char)*s++;
2052	}
2053	if (((s - bs) & 1) == 0 ||
2054	    s >= end ||
2055	    *s != 'u') {
2056	    continue;
2057	}
2058	p--;
2059	s++;
2060
2061	/* \uXXXX with 4 hex digits */
2062	outpos = p-PyUnicode_AS_UNICODE(v);
2063	for (x = 0, i = 0; i < 4; ++i, ++s) {
2064	    c = (unsigned char)*s;
2065	    if (!isxdigit(c)) {
2066		endinpos = s-starts;
2067		if (unicode_decode_call_errorhandler(
2068		    errors, &errorHandler,
2069		    "rawunicodeescape", "truncated \\uXXXX",
2070		    starts, size, &startinpos, &endinpos, &exc, &s,
2071		    (PyObject **)&v, &outpos, &p))
2072		    goto onError;
2073		goto nextByte;
2074	    }
2075	    x = (x<<4) & ~0xF;
2076	    if (c >= '0' && c <= '9')
2077		x += c - '0';
2078	    else if (c >= 'a' && c <= 'f')
2079		x += 10 + c - 'a';
2080	    else
2081		x += 10 + c - 'A';
2082	}
2083	*p++ = x;
2084	nextByte:
2085	;
2086    }
2087    if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2088	goto onError;
2089    Py_XDECREF(errorHandler);
2090    Py_XDECREF(exc);
2091    return (PyObject *)v;
2092
2093 onError:
2094    Py_XDECREF(v);
2095    Py_XDECREF(errorHandler);
2096    Py_XDECREF(exc);
2097    return NULL;
2098}
2099
2100PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2101					   int size)
2102{
2103    PyObject *repr;
2104    char *p;
2105    char *q;
2106
2107    static const char *hexdigit = "0123456789abcdef";
2108
2109    repr = PyString_FromStringAndSize(NULL, 6 * size);
2110    if (repr == NULL)
2111        return NULL;
2112    if (size == 0)
2113	return repr;
2114
2115    p = q = PyString_AS_STRING(repr);
2116    while (size-- > 0) {
2117        Py_UNICODE ch = *s++;
2118	/* Map 16-bit characters to '\uxxxx' */
2119	if (ch >= 256) {
2120            *p++ = '\\';
2121            *p++ = 'u';
2122            *p++ = hexdigit[(ch >> 12) & 0xf];
2123            *p++ = hexdigit[(ch >> 8) & 0xf];
2124            *p++ = hexdigit[(ch >> 4) & 0xf];
2125            *p++ = hexdigit[ch & 15];
2126        }
2127	/* Copy everything else as-is */
2128	else
2129            *p++ = (char) ch;
2130    }
2131    *p = '\0';
2132    _PyString_Resize(&repr, p - q);
2133    return repr;
2134}
2135
2136PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2137{
2138    if (!PyUnicode_Check(unicode)) {
2139	PyErr_BadArgument();
2140	return NULL;
2141    }
2142    return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2143					    PyUnicode_GET_SIZE(unicode));
2144}
2145
2146/* --- Latin-1 Codec ------------------------------------------------------ */
2147
2148PyObject *PyUnicode_DecodeLatin1(const char *s,
2149				 int size,
2150				 const char *errors)
2151{
2152    PyUnicodeObject *v;
2153    Py_UNICODE *p;
2154
2155    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
2156    if (size == 1 && *(unsigned char*)s < 256) {
2157	Py_UNICODE r = *(unsigned char*)s;
2158	return PyUnicode_FromUnicode(&r, 1);
2159    }
2160
2161    v = _PyUnicode_New(size);
2162    if (v == NULL)
2163	goto onError;
2164    if (size == 0)
2165	return (PyObject *)v;
2166    p = PyUnicode_AS_UNICODE(v);
2167    while (size-- > 0)
2168	*p++ = (unsigned char)*s++;
2169    return (PyObject *)v;
2170
2171 onError:
2172    Py_XDECREF(v);
2173    return NULL;
2174}
2175
2176/* create or adjust a UnicodeEncodeError */
2177static void make_encode_exception(PyObject **exceptionObject,
2178    const char *encoding,
2179    const Py_UNICODE *unicode, int size,
2180    int startpos, int endpos,
2181    const char *reason)
2182{
2183    if (*exceptionObject == NULL) {
2184	*exceptionObject = PyUnicodeEncodeError_Create(
2185	    encoding, unicode, size, startpos, endpos, reason);
2186    }
2187    else {
2188	if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2189	    goto onError;
2190	if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2191	    goto onError;
2192	if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2193	    goto onError;
2194	return;
2195	onError:
2196	Py_DECREF(*exceptionObject);
2197	*exceptionObject = NULL;
2198    }
2199}
2200
2201/* raises a UnicodeEncodeError */
2202static void raise_encode_exception(PyObject **exceptionObject,
2203    const char *encoding,
2204    const Py_UNICODE *unicode, int size,
2205    int startpos, int endpos,
2206    const char *reason)
2207{
2208    make_encode_exception(exceptionObject,
2209	encoding, unicode, size, startpos, endpos, reason);
2210    if (*exceptionObject != NULL)
2211	PyCodec_StrictErrors(*exceptionObject);
2212}
2213
2214/* error handling callback helper:
2215   build arguments, call the callback and check the arguments,
2216   put the result into newpos and return the replacement string, which
2217   has to be freed by the caller */
2218static PyObject *unicode_encode_call_errorhandler(const char *errors,
2219    PyObject **errorHandler,
2220    const char *encoding, const char *reason,
2221    const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
2222    int startpos, int endpos,
2223    int *newpos)
2224{
2225    static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple";
2226
2227    PyObject *restuple;
2228    PyObject *resunicode;
2229
2230    if (*errorHandler == NULL) {
2231	*errorHandler = PyCodec_LookupError(errors);
2232        if (*errorHandler == NULL)
2233	    return NULL;
2234    }
2235
2236    make_encode_exception(exceptionObject,
2237	encoding, unicode, size, startpos, endpos, reason);
2238    if (*exceptionObject == NULL)
2239	return NULL;
2240
2241    restuple = PyObject_CallFunctionObjArgs(
2242	*errorHandler, *exceptionObject, NULL);
2243    if (restuple == NULL)
2244	return NULL;
2245    if (!PyTuple_Check(restuple)) {
2246	PyErr_Format(PyExc_TypeError, &argparse[4]);
2247	Py_DECREF(restuple);
2248	return NULL;
2249    }
2250    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2251	&resunicode, newpos)) {
2252	Py_DECREF(restuple);
2253	return NULL;
2254    }
2255    if (*newpos<0)
2256	*newpos = 0;
2257    else if (*newpos>size)
2258	*newpos = size;
2259    Py_INCREF(resunicode);
2260    Py_DECREF(restuple);
2261    return resunicode;
2262}
2263
2264static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2265				 int size,
2266				 const char *errors,
2267				 int limit)
2268{
2269    /* output object */
2270    PyObject *res;
2271    /* pointers to the beginning and end+1 of input */
2272    const Py_UNICODE *startp = p;
2273    const Py_UNICODE *endp = p + size;
2274    /* pointer to the beginning of the unencodable characters */
2275    /* const Py_UNICODE *badp = NULL; */
2276    /* pointer into the output */
2277    char *str;
2278    /* current output position */
2279    int respos = 0;
2280    int ressize;
2281    char *encoding = (limit == 256) ? "latin-1" : "ascii";
2282    char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2283    PyObject *errorHandler = NULL;
2284    PyObject *exc = NULL;
2285    /* the following variable is used for caching string comparisons
2286     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2287    int known_errorHandler = -1;
2288
2289    /* allocate enough for a simple encoding without
2290       replacements, if we need more, we'll resize */
2291    res = PyString_FromStringAndSize(NULL, size);
2292    if (res == NULL)
2293        goto onError;
2294    if (size == 0)
2295	return res;
2296    str = PyString_AS_STRING(res);
2297    ressize = size;
2298
2299    while (p<endp) {
2300	Py_UNICODE c = *p;
2301
2302	/* can we encode this? */
2303	if (c<limit) {
2304	    /* no overflow check, because we know that the space is enough */
2305	    *str++ = (char)c;
2306	    ++p;
2307	}
2308	else {
2309	    int unicodepos = p-startp;
2310	    int requiredsize;
2311	    PyObject *repunicode;
2312	    int repsize;
2313	    int newpos;
2314	    int respos;
2315	    Py_UNICODE *uni2;
2316	    /* startpos for collecting unencodable chars */
2317	    const Py_UNICODE *collstart = p;
2318	    const Py_UNICODE *collend = p;
2319	    /* find all unecodable characters */
2320	    while ((collend < endp) && ((*collend)>=limit))
2321		++collend;
2322	    /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2323	    if (known_errorHandler==-1) {
2324		if ((errors==NULL) || (!strcmp(errors, "strict")))
2325		    known_errorHandler = 1;
2326		else if (!strcmp(errors, "replace"))
2327		    known_errorHandler = 2;
2328		else if (!strcmp(errors, "ignore"))
2329		    known_errorHandler = 3;
2330		else if (!strcmp(errors, "xmlcharrefreplace"))
2331		    known_errorHandler = 4;
2332		else
2333		    known_errorHandler = 0;
2334	    }
2335	    switch (known_errorHandler) {
2336		case 1: /* strict */
2337		    raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2338		    goto onError;
2339		case 2: /* replace */
2340		    while (collstart++<collend)
2341			*str++ = '?'; /* fall through */
2342		case 3: /* ignore */
2343		    p = collend;
2344		    break;
2345		case 4: /* xmlcharrefreplace */
2346		    respos = str-PyString_AS_STRING(res);
2347		    /* determine replacement size (temporarily (mis)uses p) */
2348		    for (p = collstart, repsize = 0; p < collend; ++p) {
2349			if (*p<10)
2350			    repsize += 2+1+1;
2351			else if (*p<100)
2352			    repsize += 2+2+1;
2353			else if (*p<1000)
2354			    repsize += 2+3+1;
2355			else if (*p<10000)
2356			    repsize += 2+4+1;
2357			else if (*p<100000)
2358			    repsize += 2+5+1;
2359			else if (*p<1000000)
2360			    repsize += 2+6+1;
2361			else
2362			    repsize += 2+7+1;
2363		    }
2364		    requiredsize = respos+repsize+(endp-collend);
2365		    if (requiredsize > ressize) {
2366			if (requiredsize<2*ressize)
2367			    requiredsize = 2*ressize;
2368			if (_PyString_Resize(&res, requiredsize))
2369			    goto onError;
2370			str = PyString_AS_STRING(res) + respos;
2371			ressize = requiredsize;
2372		    }
2373		    /* generate replacement (temporarily (mis)uses p) */
2374		    for (p = collstart; p < collend; ++p) {
2375			str += sprintf(str, "&#%d;", (int)*p);
2376		    }
2377		    p = collend;
2378		    break;
2379		default:
2380		    repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2381			encoding, reason, startp, size, &exc,
2382			collstart-startp, collend-startp, &newpos);
2383		    if (repunicode == NULL)
2384			goto onError;
2385		    /* need more space? (at least enough for what we
2386		       have+the replacement+the rest of the string, so
2387		       we won't have to check space for encodable characters) */
2388		    respos = str-PyString_AS_STRING(res);
2389		    repsize = PyUnicode_GET_SIZE(repunicode);
2390		    requiredsize = respos+repsize+(endp-collend);
2391		    if (requiredsize > ressize) {
2392			if (requiredsize<2*ressize)
2393			    requiredsize = 2*ressize;
2394			if (_PyString_Resize(&res, requiredsize)) {
2395			    Py_DECREF(repunicode);
2396			    goto onError;
2397			}
2398			str = PyString_AS_STRING(res) + respos;
2399			ressize = requiredsize;
2400		    }
2401		    /* check if there is anything unencodable in the replacement
2402		       and copy it to the output */
2403		    for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2404			c = *uni2;
2405			if (c >= limit) {
2406			    raise_encode_exception(&exc, encoding, startp, size,
2407				unicodepos, unicodepos+1, reason);
2408			    Py_DECREF(repunicode);
2409			    goto onError;
2410			}
2411			*str = (char)c;
2412		    }
2413		    p = startp + newpos;
2414		    Py_DECREF(repunicode);
2415	    }
2416	}
2417    }
2418    /* Resize if we allocated to much */
2419    respos = str-PyString_AS_STRING(res);
2420    if (respos<ressize)
2421       /* If this falls res will be NULL */
2422	_PyString_Resize(&res, respos);
2423    Py_XDECREF(errorHandler);
2424    Py_XDECREF(exc);
2425    return res;
2426
2427    onError:
2428    Py_XDECREF(res);
2429    Py_XDECREF(errorHandler);
2430    Py_XDECREF(exc);
2431    return NULL;
2432}
2433
2434PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2435				 int size,
2436				 const char *errors)
2437{
2438    return unicode_encode_ucs1(p, size, errors, 256);
2439}
2440
2441PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2442{
2443    if (!PyUnicode_Check(unicode)) {
2444	PyErr_BadArgument();
2445	return NULL;
2446    }
2447    return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2448				  PyUnicode_GET_SIZE(unicode),
2449				  NULL);
2450}
2451
2452/* --- 7-bit ASCII Codec -------------------------------------------------- */
2453
2454PyObject *PyUnicode_DecodeASCII(const char *s,
2455				int size,
2456				const char *errors)
2457{
2458    const char *starts = s;
2459    PyUnicodeObject *v;
2460    Py_UNICODE *p;
2461    int startinpos;
2462    int endinpos;
2463    int outpos;
2464    const char *e;
2465    PyObject *errorHandler = NULL;
2466    PyObject *exc = NULL;
2467
2468    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
2469    if (size == 1 && *(unsigned char*)s < 128) {
2470	Py_UNICODE r = *(unsigned char*)s;
2471	return PyUnicode_FromUnicode(&r, 1);
2472    }
2473
2474    v = _PyUnicode_New(size);
2475    if (v == NULL)
2476	goto onError;
2477    if (size == 0)
2478	return (PyObject *)v;
2479    p = PyUnicode_AS_UNICODE(v);
2480    e = s + size;
2481    while (s < e) {
2482	register unsigned char c = (unsigned char)*s;
2483	if (c < 128) {
2484	    *p++ = c;
2485	    ++s;
2486	}
2487	else {
2488	    startinpos = s-starts;
2489	    endinpos = startinpos + 1;
2490	    outpos = p-PyUnicode_AS_UNICODE(v);
2491	    if (unicode_decode_call_errorhandler(
2492		 errors, &errorHandler,
2493		 "ascii", "ordinal not in range(128)",
2494		 starts, size, &startinpos, &endinpos, &exc, &s,
2495		 (PyObject **)&v, &outpos, &p))
2496		goto onError;
2497	}
2498    }
2499    if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
2500	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2501	    goto onError;
2502    Py_XDECREF(errorHandler);
2503    Py_XDECREF(exc);
2504    return (PyObject *)v;
2505
2506 onError:
2507    Py_XDECREF(v);
2508    Py_XDECREF(errorHandler);
2509    Py_XDECREF(exc);
2510    return NULL;
2511}
2512
2513PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2514				int size,
2515				const char *errors)
2516{
2517    return unicode_encode_ucs1(p, size, errors, 128);
2518}
2519
2520PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2521{
2522    if (!PyUnicode_Check(unicode)) {
2523	PyErr_BadArgument();
2524	return NULL;
2525    }
2526    return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2527				 PyUnicode_GET_SIZE(unicode),
2528				 NULL);
2529}
2530
2531#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
2532
2533/* --- MBCS codecs for Windows -------------------------------------------- */
2534
2535PyObject *PyUnicode_DecodeMBCS(const char *s,
2536				int size,
2537				const char *errors)
2538{
2539    PyUnicodeObject *v;
2540    Py_UNICODE *p;
2541
2542    /* First get the size of the result */
2543    DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2544    if (size > 0 && usize==0)
2545        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2546
2547    v = _PyUnicode_New(usize);
2548    if (v == NULL)
2549        return NULL;
2550    if (usize == 0)
2551	return (PyObject *)v;
2552    p = PyUnicode_AS_UNICODE(v);
2553    if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2554        Py_DECREF(v);
2555        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2556    }
2557
2558    return (PyObject *)v;
2559}
2560
2561PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2562				int size,
2563				const char *errors)
2564{
2565    PyObject *repr;
2566    char *s;
2567    DWORD mbcssize;
2568
2569    /* If there are no characters, bail now! */
2570    if (size==0)
2571	    return PyString_FromString("");
2572
2573    /* First get the size of the result */
2574    mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
2575    if (mbcssize==0)
2576        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2577
2578    repr = PyString_FromStringAndSize(NULL, mbcssize);
2579    if (repr == NULL)
2580        return NULL;
2581    if (mbcssize == 0)
2582        return repr;
2583
2584    /* Do the conversion */
2585    s = PyString_AS_STRING(repr);
2586    if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2587        Py_DECREF(repr);
2588        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2589    }
2590    return repr;
2591}
2592
2593#endif /* MS_WINDOWS */
2594
2595/* --- Character Mapping Codec -------------------------------------------- */
2596
2597PyObject *PyUnicode_DecodeCharmap(const char *s,
2598				  int size,
2599				  PyObject *mapping,
2600				  const char *errors)
2601{
2602    const char *starts = s;
2603    int startinpos;
2604    int endinpos;
2605    int outpos;
2606    const char *e;
2607    PyUnicodeObject *v;
2608    Py_UNICODE *p;
2609    int extrachars = 0;
2610    PyObject *errorHandler = NULL;
2611    PyObject *exc = NULL;
2612
2613    /* Default to Latin-1 */
2614    if (mapping == NULL)
2615	return PyUnicode_DecodeLatin1(s, size, errors);
2616
2617    v = _PyUnicode_New(size);
2618    if (v == NULL)
2619	goto onError;
2620    if (size == 0)
2621	return (PyObject *)v;
2622    p = PyUnicode_AS_UNICODE(v);
2623    e = s + size;
2624    while (s < e) {
2625	unsigned char ch = *s;
2626	PyObject *w, *x;
2627
2628	/* Get mapping (char ordinal -> integer, Unicode char or None) */
2629	w = PyInt_FromLong((long)ch);
2630	if (w == NULL)
2631	    goto onError;
2632	x = PyObject_GetItem(mapping, w);
2633	Py_DECREF(w);
2634	if (x == NULL) {
2635	    if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2636		/* No mapping found means: mapping is undefined. */
2637		PyErr_Clear();
2638		x = Py_None;
2639		Py_INCREF(x);
2640	    } else
2641		goto onError;
2642	}
2643
2644	/* Apply mapping */
2645	if (PyInt_Check(x)) {
2646	    long value = PyInt_AS_LONG(x);
2647	    if (value < 0 || value > 65535) {
2648		PyErr_SetString(PyExc_TypeError,
2649				"character mapping must be in range(65536)");
2650		Py_DECREF(x);
2651		goto onError;
2652	    }
2653	    *p++ = (Py_UNICODE)value;
2654	}
2655	else if (x == Py_None) {
2656	    /* undefined mapping */
2657	    outpos = p-PyUnicode_AS_UNICODE(v);
2658	    startinpos = s-starts;
2659	    endinpos = startinpos+1;
2660	    if (unicode_decode_call_errorhandler(
2661		 errors, &errorHandler,
2662		 "charmap", "character maps to <undefined>",
2663		 starts, size, &startinpos, &endinpos, &exc, &s,
2664		 (PyObject **)&v, &outpos, &p)) {
2665		Py_DECREF(x);
2666		goto onError;
2667	    }
2668	    continue;
2669	}
2670	else if (PyUnicode_Check(x)) {
2671	    int targetsize = PyUnicode_GET_SIZE(x);
2672
2673	    if (targetsize == 1)
2674		/* 1-1 mapping */
2675		*p++ = *PyUnicode_AS_UNICODE(x);
2676
2677	    else if (targetsize > 1) {
2678		/* 1-n mapping */
2679		if (targetsize > extrachars) {
2680		    /* resize first */
2681		    int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2682		    int needed = (targetsize - extrachars) + \
2683			         (targetsize << 2);
2684		    extrachars += needed;
2685		    if (_PyUnicode_Resize(&v,
2686					 PyUnicode_GET_SIZE(v) + needed)) {
2687			Py_DECREF(x);
2688			goto onError;
2689		    }
2690		    p = PyUnicode_AS_UNICODE(v) + oldpos;
2691		}
2692		Py_UNICODE_COPY(p,
2693				PyUnicode_AS_UNICODE(x),
2694				targetsize);
2695		p += targetsize;
2696		extrachars -= targetsize;
2697	    }
2698	    /* 1-0 mapping: skip the character */
2699	}
2700	else {
2701	    /* wrong return value */
2702	    PyErr_SetString(PyExc_TypeError,
2703		  "character mapping must return integer, None or unicode");
2704	    Py_DECREF(x);
2705	    goto onError;
2706	}
2707	Py_DECREF(x);
2708	++s;
2709    }
2710    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2711	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2712	    goto onError;
2713    Py_XDECREF(errorHandler);
2714    Py_XDECREF(exc);
2715    return (PyObject *)v;
2716
2717 onError:
2718    Py_XDECREF(errorHandler);
2719    Py_XDECREF(exc);
2720    Py_XDECREF(v);
2721    return NULL;
2722}
2723
2724/* Lookup the character ch in the mapping. If the character
2725   can't be found, Py_None is returned (or NULL, if another
2726   error occured). */
2727static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
2728{
2729    PyObject *w = PyInt_FromLong((long)c);
2730    PyObject *x;
2731
2732    if (w == NULL)
2733	 return NULL;
2734    x = PyObject_GetItem(mapping, w);
2735    Py_DECREF(w);
2736    if (x == NULL) {
2737	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2738	    /* No mapping found means: mapping is undefined. */
2739	    PyErr_Clear();
2740	    x = Py_None;
2741	    Py_INCREF(x);
2742	    return x;
2743	} else
2744	    return NULL;
2745    }
2746    else if (PyInt_Check(x)) {
2747	long value = PyInt_AS_LONG(x);
2748	if (value < 0 || value > 255) {
2749	    PyErr_SetString(PyExc_TypeError,
2750			     "character mapping must be in range(256)");
2751	    Py_DECREF(x);
2752	    return NULL;
2753	}
2754	return x;
2755    }
2756    else if (PyString_Check(x))
2757	return x;
2758    else {
2759	/* wrong return value */
2760	PyErr_SetString(PyExc_TypeError,
2761	      "character mapping must return integer, None or str");
2762	Py_DECREF(x);
2763	return NULL;
2764    }
2765}
2766
2767/* lookup the character, put the result in the output string and adjust
2768   various state variables. Reallocate the output string if not enough
2769   space is available. Return a new reference to the object that
2770   was put in the output buffer, or Py_None, if the mapping was undefined
2771   (in which case no character was written) or NULL, if a
2772   reallocation error ocurred. The called must decref the result */
2773static
2774PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
2775    PyObject **outobj, int *outpos)
2776{
2777    PyObject *rep = charmapencode_lookup(c, mapping);
2778
2779    if (rep==NULL)
2780	return NULL;
2781    else if (rep==Py_None)
2782	return rep;
2783    else {
2784	char *outstart = PyString_AS_STRING(*outobj);
2785	int outsize = PyString_GET_SIZE(*outobj);
2786	if (PyInt_Check(rep)) {
2787	    int requiredsize = *outpos+1;
2788	    if (outsize<requiredsize) {
2789		/* exponentially overallocate to minimize reallocations */
2790		if (requiredsize < 2*outsize)
2791		    requiredsize = 2*outsize;
2792		if (_PyString_Resize(outobj, requiredsize)) {
2793		    Py_DECREF(rep);
2794		    return NULL;
2795		}
2796		outstart = PyString_AS_STRING(*outobj);
2797	    }
2798	    outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
2799	}
2800	else {
2801	    const char *repchars = PyString_AS_STRING(rep);
2802	    int repsize = PyString_GET_SIZE(rep);
2803	    int requiredsize = *outpos+repsize;
2804	    if (outsize<requiredsize) {
2805		/* exponentially overallocate to minimize reallocations */
2806		if (requiredsize < 2*outsize)
2807		    requiredsize = 2*outsize;
2808		if (_PyString_Resize(outobj, requiredsize)) {
2809		    Py_DECREF(rep);
2810		    return NULL;
2811		}
2812		outstart = PyString_AS_STRING(*outobj);
2813	    }
2814	    memcpy(outstart + *outpos, repchars, repsize);
2815	    *outpos += repsize;
2816	}
2817    }
2818    return rep;
2819}
2820
2821/* handle an error in PyUnicode_EncodeCharmap
2822   Return 0 on success, -1 on error */
2823static
2824int charmap_encoding_error(
2825    const Py_UNICODE *p, int size, int *inpos, PyObject *mapping,
2826    PyObject **exceptionObject,
2827    int *known_errorHandler, PyObject *errorHandler, const char *errors,
2828    PyObject **res, int *respos)
2829{
2830    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
2831    int repsize;
2832    int newpos;
2833    Py_UNICODE *uni2;
2834    /* startpos for collecting unencodable chars */
2835    int collstartpos = *inpos;
2836    int collendpos = *inpos+1;
2837    int collpos;
2838    char *encoding = "charmap";
2839    char *reason = "character maps to <undefined>";
2840
2841    PyObject *x;
2842    /* find all unencodable characters */
2843    while (collendpos < size) {
2844	x = charmapencode_lookup(p[collendpos], mapping);
2845	if (x==NULL)
2846	    return -1;
2847	else if (x!=Py_None) {
2848	    Py_DECREF(x);
2849	    break;
2850	}
2851	Py_DECREF(x);
2852	++collendpos;
2853    }
2854    /* cache callback name lookup
2855     * (if not done yet, i.e. it's the first error) */
2856    if (*known_errorHandler==-1) {
2857	if ((errors==NULL) || (!strcmp(errors, "strict")))
2858	    *known_errorHandler = 1;
2859	else if (!strcmp(errors, "replace"))
2860	    *known_errorHandler = 2;
2861	else if (!strcmp(errors, "ignore"))
2862	    *known_errorHandler = 3;
2863	else if (!strcmp(errors, "xmlcharrefreplace"))
2864	    *known_errorHandler = 4;
2865	else
2866	    *known_errorHandler = 0;
2867    }
2868    switch (*known_errorHandler) {
2869	case 1: /* strict */
2870	    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2871	    return -1;
2872	case 2: /* replace */
2873	    for (collpos = collstartpos; collpos<collendpos; ++collpos) {
2874		x = charmapencode_output('?', mapping, res, respos);
2875		if (x==NULL) {
2876		    return -1;
2877		}
2878		else if (x==Py_None) {
2879		    Py_DECREF(x);
2880		    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2881		    return -1;
2882		}
2883		Py_DECREF(x);
2884	    }
2885	    /* fall through */
2886	case 3: /* ignore */
2887	    *inpos = collendpos;
2888	    break;
2889	case 4: /* xmlcharrefreplace */
2890	    /* generate replacement (temporarily (mis)uses p) */
2891	    for (collpos = collstartpos; collpos < collendpos; ++collpos) {
2892		char buffer[2+29+1+1];
2893		char *cp;
2894		sprintf(buffer, "&#%d;", (int)p[collpos]);
2895		for (cp = buffer; *cp; ++cp) {
2896		    x = charmapencode_output(*cp, mapping, res, respos);
2897		    if (x==NULL)
2898			return -1;
2899		    else if (x==Py_None) {
2900			Py_DECREF(x);
2901			raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2902			return -1;
2903		    }
2904		    Py_DECREF(x);
2905		}
2906	    }
2907	    *inpos = collendpos;
2908	    break;
2909	default:
2910	    repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2911		encoding, reason, p, size, exceptionObject,
2912		collstartpos, collendpos, &newpos);
2913	    if (repunicode == NULL)
2914		return -1;
2915	    /* generate replacement  */
2916	    repsize = PyUnicode_GET_SIZE(repunicode);
2917	    for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
2918		x = charmapencode_output(*uni2, mapping, res, respos);
2919		if (x==NULL) {
2920		    Py_DECREF(repunicode);
2921		    return -1;
2922		}
2923		else if (x==Py_None) {
2924		    Py_DECREF(repunicode);
2925		    Py_DECREF(x);
2926		    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2927		    return -1;
2928		}
2929		Py_DECREF(x);
2930	    }
2931	    *inpos = newpos;
2932	    Py_DECREF(repunicode);
2933    }
2934    return 0;
2935}
2936
2937PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2938				  int size,
2939				  PyObject *mapping,
2940				  const char *errors)
2941{
2942    /* output object */
2943    PyObject *res = NULL;
2944    /* current input position */
2945    int inpos = 0;
2946    /* current output position */
2947    int respos = 0;
2948    PyObject *errorHandler = NULL;
2949    PyObject *exc = NULL;
2950    /* the following variable is used for caching string comparisons
2951     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
2952     * 3=ignore, 4=xmlcharrefreplace */
2953    int known_errorHandler = -1;
2954
2955    /* Default to Latin-1 */
2956    if (mapping == NULL)
2957	return PyUnicode_EncodeLatin1(p, size, errors);
2958
2959    /* allocate enough for a simple encoding without
2960       replacements, if we need more, we'll resize */
2961    res = PyString_FromStringAndSize(NULL, size);
2962    if (res == NULL)
2963        goto onError;
2964    if (size == 0)
2965	return res;
2966
2967    while (inpos<size) {
2968	/* try to encode it */
2969	PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
2970	if (x==NULL) /* error */
2971	    goto onError;
2972	if (x==Py_None) { /* unencodable character */
2973	    if (charmap_encoding_error(p, size, &inpos, mapping,
2974		&exc,
2975		&known_errorHandler, errorHandler, errors,
2976		&res, &respos))
2977		goto onError;
2978	}
2979	else
2980	    /* done with this character => adjust input position */
2981	    ++inpos;
2982	Py_DECREF(x);
2983    }
2984
2985    /* Resize if we allocated to much */
2986    if (respos<PyString_GET_SIZE(res)) {
2987	if (_PyString_Resize(&res, respos))
2988	    goto onError;
2989    }
2990    Py_XDECREF(exc);
2991    Py_XDECREF(errorHandler);
2992    return res;
2993
2994    onError:
2995    Py_XDECREF(res);
2996    Py_XDECREF(exc);
2997    Py_XDECREF(errorHandler);
2998    return NULL;
2999}
3000
3001PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3002				    PyObject *mapping)
3003{
3004    if (!PyUnicode_Check(unicode) || mapping == NULL) {
3005	PyErr_BadArgument();
3006	return NULL;
3007    }
3008    return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3009				   PyUnicode_GET_SIZE(unicode),
3010				   mapping,
3011				   NULL);
3012}
3013
3014/* create or adjust a UnicodeTranslateError */
3015static void make_translate_exception(PyObject **exceptionObject,
3016    const Py_UNICODE *unicode, int size,
3017    int startpos, int endpos,
3018    const char *reason)
3019{
3020    if (*exceptionObject == NULL) {
3021    	*exceptionObject = PyUnicodeTranslateError_Create(
3022	    unicode, size, startpos, endpos, reason);
3023    }
3024    else {
3025	if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3026	    goto onError;
3027	if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3028	    goto onError;
3029	if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3030	    goto onError;
3031	return;
3032	onError:
3033	Py_DECREF(*exceptionObject);
3034	*exceptionObject = NULL;
3035    }
3036}
3037
3038/* raises a UnicodeTranslateError */
3039static void raise_translate_exception(PyObject **exceptionObject,
3040    const Py_UNICODE *unicode, int size,
3041    int startpos, int endpos,
3042    const char *reason)
3043{
3044    make_translate_exception(exceptionObject,
3045	unicode, size, startpos, endpos, reason);
3046    if (*exceptionObject != NULL)
3047	PyCodec_StrictErrors(*exceptionObject);
3048}
3049
3050/* error handling callback helper:
3051   build arguments, call the callback and check the arguments,
3052   put the result into newpos and return the replacement string, which
3053   has to be freed by the caller */
3054static PyObject *unicode_translate_call_errorhandler(const char *errors,
3055    PyObject **errorHandler,
3056    const char *reason,
3057    const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
3058    int startpos, int endpos,
3059    int *newpos)
3060{
3061    static char *argparse = "O!i;translating error handler must return (unicode, int) tuple";
3062
3063    PyObject *restuple;
3064    PyObject *resunicode;
3065
3066    if (*errorHandler == NULL) {
3067	*errorHandler = PyCodec_LookupError(errors);
3068        if (*errorHandler == NULL)
3069	    return NULL;
3070    }
3071
3072    make_translate_exception(exceptionObject,
3073	unicode, size, startpos, endpos, reason);
3074    if (*exceptionObject == NULL)
3075	return NULL;
3076
3077    restuple = PyObject_CallFunctionObjArgs(
3078	*errorHandler, *exceptionObject, NULL);
3079    if (restuple == NULL)
3080	return NULL;
3081    if (!PyTuple_Check(restuple)) {
3082	PyErr_Format(PyExc_TypeError, &argparse[4]);
3083	Py_DECREF(restuple);
3084	return NULL;
3085    }
3086    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3087	&resunicode, newpos)) {
3088	Py_DECREF(restuple);
3089	return NULL;
3090    }
3091    if (*newpos<0)
3092	*newpos = 0;
3093    else if (*newpos>size)
3094	*newpos = size;
3095    Py_INCREF(resunicode);
3096    Py_DECREF(restuple);
3097    return resunicode;
3098}
3099
3100/* Lookup the character ch in the mapping and put the result in result,
3101   which must be decrefed by the caller.
3102   Return 0 on success, -1 on error */
3103static
3104int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3105{
3106    PyObject *w = PyInt_FromLong((long)c);
3107    PyObject *x;
3108
3109    if (w == NULL)
3110	 return -1;
3111    x = PyObject_GetItem(mapping, w);
3112    Py_DECREF(w);
3113    if (x == NULL) {
3114	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3115	    /* No mapping found means: use 1:1 mapping. */
3116	    PyErr_Clear();
3117	    *result = NULL;
3118	    return 0;
3119	} else
3120	    return -1;
3121    }
3122    else if (x == Py_None) {
3123	*result = x;
3124	return 0;
3125    }
3126    else if (PyInt_Check(x)) {
3127	long value = PyInt_AS_LONG(x);
3128	long max = PyUnicode_GetMax();
3129	if (value < 0 || value > max) {
3130	    PyErr_Format(PyExc_TypeError,
3131			     "character mapping must be in range(0x%lx)", max+1);
3132	    Py_DECREF(x);
3133	    return -1;
3134	}
3135	*result = x;
3136	return 0;
3137    }
3138    else if (PyUnicode_Check(x)) {
3139	*result = x;
3140	return 0;
3141    }
3142    else {
3143	/* wrong return value */
3144	PyErr_SetString(PyExc_TypeError,
3145	      "character mapping must return integer, None or unicode");
3146	return -1;
3147    }
3148}
3149/* ensure that *outobj is at least requiredsize characters long,
3150if not reallocate and adjust various state variables.
3151Return 0 on success, -1 on error */
3152static
3153int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, int *outsize,
3154    int requiredsize)
3155{
3156    if (requiredsize > *outsize) {
3157	/* remember old output position */
3158	int outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
3159	/* exponentially overallocate to minimize reallocations */
3160	if (requiredsize < 2 * *outsize)
3161	    requiredsize = 2 * *outsize;
3162	if (_PyUnicode_Resize(outobj, requiredsize))
3163	    return -1;
3164	*outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
3165	*outsize = requiredsize;
3166    }
3167    return 0;
3168}
3169/* lookup the character, put the result in the output string and adjust
3170   various state variables. Return a new reference to the object that
3171   was put in the output buffer in *result, or Py_None, if the mapping was
3172   undefined (in which case no character was written).
3173   The called must decref result.
3174   Return 0 on success, -1 on error. */
3175static
3176int charmaptranslate_output(Py_UNICODE c, PyObject *mapping,
3177    PyObject **outobj, int *outsize, Py_UNICODE **outp, PyObject **res)
3178{
3179    if (charmaptranslate_lookup(c, mapping, res))
3180	return -1;
3181    if (*res==NULL) {
3182	/* not found => default to 1:1 mapping */
3183	*(*outp)++ = (Py_UNICODE)c;
3184    }
3185    else if (*res==Py_None)
3186	;
3187    else if (PyInt_Check(*res)) {
3188	/* no overflow check, because we know that the space is enough */
3189	*(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3190    }
3191    else if (PyUnicode_Check(*res)) {
3192	int repsize = PyUnicode_GET_SIZE(*res);
3193	if (repsize==1) {
3194	    /* no overflow check, because we know that the space is enough */
3195	    *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3196	}
3197	else if (repsize!=0) {
3198	    /* more than one character */
3199	    int requiredsize = *outsize + repsize - 1;
3200	    if (charmaptranslate_makespace(outobj, outp, outsize, requiredsize))
3201		return -1;
3202	    memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3203	    *outp += repsize;
3204	}
3205    }
3206    else
3207	return -1;
3208    return 0;
3209}
3210
3211PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
3212				     int size,
3213				     PyObject *mapping,
3214				     const char *errors)
3215{
3216    /* output object */
3217    PyObject *res = NULL;
3218    /* pointers to the beginning and end+1 of input */
3219    const Py_UNICODE *startp = p;
3220    const Py_UNICODE *endp = p + size;
3221    /* pointer into the output */
3222    Py_UNICODE *str;
3223    /* current output position */
3224    int respos = 0;
3225    int ressize;
3226    char *reason = "character maps to <undefined>";
3227    PyObject *errorHandler = NULL;
3228    PyObject *exc = NULL;
3229    /* the following variable is used for caching string comparisons
3230     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3231     * 3=ignore, 4=xmlcharrefreplace */
3232    int known_errorHandler = -1;
3233
3234    if (mapping == NULL) {
3235	PyErr_BadArgument();
3236	return NULL;
3237    }
3238
3239    /* allocate enough for a simple 1:1 translation without
3240       replacements, if we need more, we'll resize */
3241    res = PyUnicode_FromUnicode(NULL, size);
3242    if (res == NULL)
3243        goto onError;
3244    if (size == 0)
3245	return res;
3246    str = PyUnicode_AS_UNICODE(res);
3247    ressize = size;
3248
3249    while (p<endp) {
3250	/* try to encode it */
3251	PyObject *x = NULL;
3252	if (charmaptranslate_output(*p, mapping, &res, &ressize, &str, &x)) {
3253	    Py_XDECREF(x);
3254	    goto onError;
3255	}
3256	if (x!=Py_None) /* it worked => adjust input pointer */
3257	    ++p;
3258	else { /* untranslatable character */
3259	    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3260	    int repsize;
3261	    int newpos;
3262	    Py_UNICODE *uni2;
3263	    /* startpos for collecting untranslatable chars */
3264	    const Py_UNICODE *collstart = p;
3265	    const Py_UNICODE *collend = p+1;
3266	    const Py_UNICODE *coll;
3267
3268	    Py_XDECREF(x);
3269	    /* find all untranslatable characters */
3270	    while (collend < endp) {
3271	    	if (charmaptranslate_lookup(*collend, mapping, &x))
3272		    goto onError;
3273		Py_XDECREF(x);
3274		if (x!=Py_None)
3275		    break;
3276		++collend;
3277	    }
3278	    /* cache callback name lookup
3279	     * (if not done yet, i.e. it's the first error) */
3280	    if (known_errorHandler==-1) {
3281		if ((errors==NULL) || (!strcmp(errors, "strict")))
3282		    known_errorHandler = 1;
3283		else if (!strcmp(errors, "replace"))
3284		    known_errorHandler = 2;
3285		else if (!strcmp(errors, "ignore"))
3286		    known_errorHandler = 3;
3287		else if (!strcmp(errors, "xmlcharrefreplace"))
3288		    known_errorHandler = 4;
3289		else
3290		    known_errorHandler = 0;
3291	    }
3292	    switch (known_errorHandler) {
3293		case 1: /* strict */
3294		    raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3295		    goto onError;
3296		case 2: /* replace */
3297		    /* No need to check for space, this is a 1:1 replacement */
3298		    for (coll = collstart; coll<collend; ++coll)
3299			*str++ = '?';
3300		    /* fall through */
3301		case 3: /* ignore */
3302		    p = collend;
3303		    break;
3304		case 4: /* xmlcharrefreplace */
3305		    /* generate replacement (temporarily (mis)uses p) */
3306		    for (p = collstart; p < collend; ++p) {
3307			char buffer[2+29+1+1];
3308			char *cp;
3309			sprintf(buffer, "&#%d;", (int)*p);
3310			if (charmaptranslate_makespace(&res, &str, &ressize,
3311			    (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3312			    goto onError;
3313			for (cp = buffer; *cp; ++cp)
3314			    *str++ = *cp;
3315		    }
3316		    p = collend;
3317		    break;
3318		default:
3319		    repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3320			reason, startp, size, &exc,
3321			collstart-startp, collend-startp, &newpos);
3322		    if (repunicode == NULL)
3323			goto onError;
3324		    /* generate replacement  */
3325		    repsize = PyUnicode_GET_SIZE(repunicode);
3326		    if (charmaptranslate_makespace(&res, &str, &ressize,
3327			(str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3328			Py_DECREF(repunicode);
3329			goto onError;
3330		    }
3331		    for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3332			*str++ = *uni2;
3333		    p = startp + newpos;
3334		    Py_DECREF(repunicode);
3335	    }
3336	}
3337    }
3338    /* Resize if we allocated to much */
3339    respos = str-PyUnicode_AS_UNICODE(res);
3340    if (respos<ressize) {
3341	if (_PyUnicode_Resize(&res, respos))
3342	    goto onError;
3343    }
3344    Py_XDECREF(exc);
3345    Py_XDECREF(errorHandler);
3346    return res;
3347
3348    onError:
3349    Py_XDECREF(res);
3350    Py_XDECREF(exc);
3351    Py_XDECREF(errorHandler);
3352    return NULL;
3353}
3354
3355PyObject *PyUnicode_Translate(PyObject *str,
3356			      PyObject *mapping,
3357			      const char *errors)
3358{
3359    PyObject *result;
3360
3361    str = PyUnicode_FromObject(str);
3362    if (str == NULL)
3363	goto onError;
3364    result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3365					PyUnicode_GET_SIZE(str),
3366					mapping,
3367					errors);
3368    Py_DECREF(str);
3369    return result;
3370
3371 onError:
3372    Py_XDECREF(str);
3373    return NULL;
3374}
3375
3376/* --- Decimal Encoder ---------------------------------------------------- */
3377
3378int PyUnicode_EncodeDecimal(Py_UNICODE *s,
3379			    int length,
3380			    char *output,
3381			    const char *errors)
3382{
3383    Py_UNICODE *p, *end;
3384    PyObject *errorHandler = NULL;
3385    PyObject *exc = NULL;
3386    const char *encoding = "decimal";
3387    const char *reason = "invalid decimal Unicode string";
3388    /* the following variable is used for caching string comparisons
3389     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3390    int known_errorHandler = -1;
3391
3392    if (output == NULL) {
3393	PyErr_BadArgument();
3394	return -1;
3395    }
3396
3397    p = s;
3398    end = s + length;
3399    while (p < end) {
3400	register Py_UNICODE ch = *p;
3401	int decimal;
3402	PyObject *repunicode;
3403	int repsize;
3404	int newpos;
3405	Py_UNICODE *uni2;
3406	Py_UNICODE *collstart;
3407	Py_UNICODE *collend;
3408
3409	if (Py_UNICODE_ISSPACE(ch)) {
3410	    *output++ = ' ';
3411	    ++p;
3412	    continue;
3413	}
3414	decimal = Py_UNICODE_TODECIMAL(ch);
3415	if (decimal >= 0) {
3416	    *output++ = '0' + decimal;
3417	    ++p;
3418	    continue;
3419	}
3420	if (0 < ch && ch < 256) {
3421	    *output++ = (char)ch;
3422	    ++p;
3423	    continue;
3424	}
3425	/* All other characters are considered unencodable */
3426	collstart = p;
3427	collend = p+1;
3428	while (collend < end) {
3429	    if ((0 < *collend && *collend < 256) ||
3430	        !Py_UNICODE_ISSPACE(*collend) ||
3431	        Py_UNICODE_TODECIMAL(*collend))
3432		break;
3433	}
3434	/* cache callback name lookup
3435	 * (if not done yet, i.e. it's the first error) */
3436	if (known_errorHandler==-1) {
3437	    if ((errors==NULL) || (!strcmp(errors, "strict")))
3438		known_errorHandler = 1;
3439	    else if (!strcmp(errors, "replace"))
3440		known_errorHandler = 2;
3441	    else if (!strcmp(errors, "ignore"))
3442		known_errorHandler = 3;
3443	    else if (!strcmp(errors, "xmlcharrefreplace"))
3444		known_errorHandler = 4;
3445	    else
3446		known_errorHandler = 0;
3447	}
3448	switch (known_errorHandler) {
3449	    case 1: /* strict */
3450		raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3451		goto onError;
3452	    case 2: /* replace */
3453		for (p = collstart; p < collend; ++p)
3454		    *output++ = '?';
3455		/* fall through */
3456	    case 3: /* ignore */
3457		p = collend;
3458		break;
3459	    case 4: /* xmlcharrefreplace */
3460		/* generate replacement (temporarily (mis)uses p) */
3461		for (p = collstart; p < collend; ++p)
3462		    output += sprintf(output, "&#%d;", (int)*p);
3463		p = collend;
3464		break;
3465	    default:
3466		repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3467		    encoding, reason, s, length, &exc,
3468		    collstart-s, collend-s, &newpos);
3469		if (repunicode == NULL)
3470		    goto onError;
3471		/* generate replacement  */
3472		repsize = PyUnicode_GET_SIZE(repunicode);
3473		for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3474		    Py_UNICODE ch = *uni2;
3475		    if (Py_UNICODE_ISSPACE(ch))
3476			*output++ = ' ';
3477		    else {
3478			decimal = Py_UNICODE_TODECIMAL(ch);
3479			if (decimal >= 0)
3480			    *output++ = '0' + decimal;
3481			else if (0 < ch && ch < 256)
3482			    *output++ = (char)ch;
3483			else {
3484			    Py_DECREF(repunicode);
3485			    raise_encode_exception(&exc, encoding,
3486				s, length, collstart-s, collend-s, reason);
3487			    goto onError;
3488			}
3489		    }
3490		}
3491		p = s + newpos;
3492		Py_DECREF(repunicode);
3493	}
3494    }
3495    /* 0-terminate the output string */
3496    *output++ = '\0';
3497    Py_XDECREF(exc);
3498    Py_XDECREF(errorHandler);
3499    return 0;
3500
3501 onError:
3502    Py_XDECREF(exc);
3503    Py_XDECREF(errorHandler);
3504    return -1;
3505}
3506
3507/* --- Helpers ------------------------------------------------------------ */
3508
3509static
3510int count(PyUnicodeObject *self,
3511	  int start,
3512	  int end,
3513	  PyUnicodeObject *substring)
3514{
3515    int count = 0;
3516
3517    if (start < 0)
3518        start += self->length;
3519    if (start < 0)
3520        start = 0;
3521    if (end > self->length)
3522        end = self->length;
3523    if (end < 0)
3524        end += self->length;
3525    if (end < 0)
3526        end = 0;
3527
3528    if (substring->length == 0)
3529	return (end - start + 1);
3530
3531    end -= substring->length;
3532
3533    while (start <= end)
3534        if (Py_UNICODE_MATCH(self, start, substring)) {
3535            count++;
3536            start += substring->length;
3537        } else
3538            start++;
3539
3540    return count;
3541}
3542
3543int PyUnicode_Count(PyObject *str,
3544		    PyObject *substr,
3545		    int start,
3546		    int end)
3547{
3548    int result;
3549
3550    str = PyUnicode_FromObject(str);
3551    if (str == NULL)
3552	return -1;
3553    substr = PyUnicode_FromObject(substr);
3554    if (substr == NULL) {
3555	Py_DECREF(str);
3556	return -1;
3557    }
3558
3559    result = count((PyUnicodeObject *)str,
3560		   start, end,
3561		   (PyUnicodeObject *)substr);
3562
3563    Py_DECREF(str);
3564    Py_DECREF(substr);
3565    return result;
3566}
3567
3568static
3569int findstring(PyUnicodeObject *self,
3570	       PyUnicodeObject *substring,
3571	       int start,
3572	       int end,
3573	       int direction)
3574{
3575    if (start < 0)
3576        start += self->length;
3577    if (start < 0)
3578        start = 0;
3579
3580    if (end > self->length)
3581        end = self->length;
3582    if (end < 0)
3583        end += self->length;
3584    if (end < 0)
3585        end = 0;
3586
3587    if (substring->length == 0)
3588	return (direction > 0) ? start : end;
3589
3590    end -= substring->length;
3591
3592    if (direction < 0) {
3593        for (; end >= start; end--)
3594            if (Py_UNICODE_MATCH(self, end, substring))
3595                return end;
3596    } else {
3597        for (; start <= end; start++)
3598            if (Py_UNICODE_MATCH(self, start, substring))
3599                return start;
3600    }
3601
3602    return -1;
3603}
3604
3605int PyUnicode_Find(PyObject *str,
3606		   PyObject *substr,
3607		   int start,
3608		   int end,
3609		   int direction)
3610{
3611    int result;
3612
3613    str = PyUnicode_FromObject(str);
3614    if (str == NULL)
3615	return -2;
3616    substr = PyUnicode_FromObject(substr);
3617    if (substr == NULL) {
3618	Py_DECREF(str);
3619	return -2;
3620    }
3621
3622    result = findstring((PyUnicodeObject *)str,
3623			(PyUnicodeObject *)substr,
3624			start, end, direction);
3625    Py_DECREF(str);
3626    Py_DECREF(substr);
3627    return result;
3628}
3629
3630static
3631int tailmatch(PyUnicodeObject *self,
3632	      PyUnicodeObject *substring,
3633	      int start,
3634	      int end,
3635	      int direction)
3636{
3637    if (start < 0)
3638        start += self->length;
3639    if (start < 0)
3640        start = 0;
3641
3642    if (substring->length == 0)
3643        return 1;
3644
3645    if (end > self->length)
3646        end = self->length;
3647    if (end < 0)
3648        end += self->length;
3649    if (end < 0)
3650        end = 0;
3651
3652    end -= substring->length;
3653    if (end < start)
3654	return 0;
3655
3656    if (direction > 0) {
3657	if (Py_UNICODE_MATCH(self, end, substring))
3658	    return 1;
3659    } else {
3660        if (Py_UNICODE_MATCH(self, start, substring))
3661	    return 1;
3662    }
3663
3664    return 0;
3665}
3666
3667int PyUnicode_Tailmatch(PyObject *str,
3668			PyObject *substr,
3669			int start,
3670			int end,
3671			int direction)
3672{
3673    int result;
3674
3675    str = PyUnicode_FromObject(str);
3676    if (str == NULL)
3677	return -1;
3678    substr = PyUnicode_FromObject(substr);
3679    if (substr == NULL) {
3680	Py_DECREF(substr);
3681	return -1;
3682    }
3683
3684    result = tailmatch((PyUnicodeObject *)str,
3685		       (PyUnicodeObject *)substr,
3686		       start, end, direction);
3687    Py_DECREF(str);
3688    Py_DECREF(substr);
3689    return result;
3690}
3691
3692static
3693const Py_UNICODE *findchar(const Py_UNICODE *s,
3694		     int size,
3695		     Py_UNICODE ch)
3696{
3697    /* like wcschr, but doesn't stop at NULL characters */
3698
3699    while (size-- > 0) {
3700        if (*s == ch)
3701            return s;
3702        s++;
3703    }
3704
3705    return NULL;
3706}
3707
3708/* Apply fixfct filter to the Unicode object self and return a
3709   reference to the modified object */
3710
3711static
3712PyObject *fixup(PyUnicodeObject *self,
3713		int (*fixfct)(PyUnicodeObject *s))
3714{
3715
3716    PyUnicodeObject *u;
3717
3718    u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
3719    if (u == NULL)
3720	return NULL;
3721
3722    Py_UNICODE_COPY(u->str, self->str, self->length);
3723
3724    if (!fixfct(u) && PyUnicode_CheckExact(self)) {
3725	/* fixfct should return TRUE if it modified the buffer. If
3726	   FALSE, return a reference to the original buffer instead
3727	   (to save space, not time) */
3728	Py_INCREF(self);
3729	Py_DECREF(u);
3730	return (PyObject*) self;
3731    }
3732    return (PyObject*) u;
3733}
3734
3735static
3736int fixupper(PyUnicodeObject *self)
3737{
3738    int len = self->length;
3739    Py_UNICODE *s = self->str;
3740    int status = 0;
3741
3742    while (len-- > 0) {
3743	register Py_UNICODE ch;
3744
3745	ch = Py_UNICODE_TOUPPER(*s);
3746	if (ch != *s) {
3747            status = 1;
3748	    *s = ch;
3749	}
3750        s++;
3751    }
3752
3753    return status;
3754}
3755
3756static
3757int fixlower(PyUnicodeObject *self)
3758{
3759    int len = self->length;
3760    Py_UNICODE *s = self->str;
3761    int status = 0;
3762
3763    while (len-- > 0) {
3764	register Py_UNICODE ch;
3765
3766	ch = Py_UNICODE_TOLOWER(*s);
3767	if (ch != *s) {
3768            status = 1;
3769	    *s = ch;
3770	}
3771        s++;
3772    }
3773
3774    return status;
3775}
3776
3777static
3778int fixswapcase(PyUnicodeObject *self)
3779{
3780    int len = self->length;
3781    Py_UNICODE *s = self->str;
3782    int status = 0;
3783
3784    while (len-- > 0) {
3785        if (Py_UNICODE_ISUPPER(*s)) {
3786            *s = Py_UNICODE_TOLOWER(*s);
3787            status = 1;
3788        } else if (Py_UNICODE_ISLOWER(*s)) {
3789            *s = Py_UNICODE_TOUPPER(*s);
3790            status = 1;
3791        }
3792        s++;
3793    }
3794
3795    return status;
3796}
3797
3798static
3799int fixcapitalize(PyUnicodeObject *self)
3800{
3801    int len = self->length;
3802    Py_UNICODE *s = self->str;
3803    int status = 0;
3804
3805    if (len == 0)
3806	return 0;
3807    if (Py_UNICODE_ISLOWER(*s)) {
3808	*s = Py_UNICODE_TOUPPER(*s);
3809	status = 1;
3810    }
3811    s++;
3812    while (--len > 0) {
3813        if (Py_UNICODE_ISUPPER(*s)) {
3814            *s = Py_UNICODE_TOLOWER(*s);
3815            status = 1;
3816        }
3817        s++;
3818    }
3819    return status;
3820}
3821
3822static
3823int fixtitle(PyUnicodeObject *self)
3824{
3825    register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3826    register Py_UNICODE *e;
3827    int previous_is_cased;
3828
3829    /* Shortcut for single character strings */
3830    if (PyUnicode_GET_SIZE(self) == 1) {
3831	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3832	if (*p != ch) {
3833	    *p = ch;
3834	    return 1;
3835	}
3836	else
3837	    return 0;
3838    }
3839
3840    e = p + PyUnicode_GET_SIZE(self);
3841    previous_is_cased = 0;
3842    for (; p < e; p++) {
3843	register const Py_UNICODE ch = *p;
3844
3845	if (previous_is_cased)
3846	    *p = Py_UNICODE_TOLOWER(ch);
3847	else
3848	    *p = Py_UNICODE_TOTITLE(ch);
3849
3850	if (Py_UNICODE_ISLOWER(ch) ||
3851	    Py_UNICODE_ISUPPER(ch) ||
3852	    Py_UNICODE_ISTITLE(ch))
3853	    previous_is_cased = 1;
3854	else
3855	    previous_is_cased = 0;
3856    }
3857    return 1;
3858}
3859
3860PyObject *PyUnicode_Join(PyObject *separator,
3861			 PyObject *seq)
3862{
3863    Py_UNICODE *sep;
3864    int seplen;
3865    PyUnicodeObject *res = NULL;
3866    int reslen = 0;
3867    Py_UNICODE *p;
3868    int sz = 100;
3869    int i;
3870    PyObject *it;
3871
3872    it = PyObject_GetIter(seq);
3873    if (it == NULL)
3874        return NULL;
3875
3876    if (separator == NULL) {
3877	Py_UNICODE blank = ' ';
3878	sep = &blank;
3879	seplen = 1;
3880    }
3881    else {
3882	separator = PyUnicode_FromObject(separator);
3883	if (separator == NULL)
3884	    goto onError;
3885	sep = PyUnicode_AS_UNICODE(separator);
3886	seplen = PyUnicode_GET_SIZE(separator);
3887    }
3888
3889    res = _PyUnicode_New(sz);
3890    if (res == NULL)
3891	goto onError;
3892    p = PyUnicode_AS_UNICODE(res);
3893    reslen = 0;
3894
3895    for (i = 0; ; ++i) {
3896	int itemlen;
3897	PyObject *item = PyIter_Next(it);
3898	if (item == NULL) {
3899	    if (PyErr_Occurred())
3900		goto onError;
3901	    break;
3902	}
3903	if (!PyUnicode_Check(item)) {
3904	    PyObject *v;
3905	    if (!PyString_Check(item)) {
3906		PyErr_Format(PyExc_TypeError,
3907			     "sequence item %i: expected string or Unicode,"
3908			     " %.80s found",
3909			     i, item->ob_type->tp_name);
3910		Py_DECREF(item);
3911		goto onError;
3912	    }
3913	    v = PyUnicode_FromObject(item);
3914	    Py_DECREF(item);
3915	    item = v;
3916	    if (item == NULL)
3917		goto onError;
3918	}
3919	itemlen = PyUnicode_GET_SIZE(item);
3920	while (reslen + itemlen + seplen >= sz) {
3921	    if (_PyUnicode_Resize(&res, sz*2)) {
3922		Py_DECREF(item);
3923		goto onError;
3924	    }
3925	    sz *= 2;
3926	    p = PyUnicode_AS_UNICODE(res) + reslen;
3927	}
3928	if (i > 0) {
3929	    Py_UNICODE_COPY(p, sep, seplen);
3930	    p += seplen;
3931	    reslen += seplen;
3932	}
3933	Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
3934	p += itemlen;
3935	reslen += itemlen;
3936	Py_DECREF(item);
3937    }
3938    if (_PyUnicode_Resize(&res, reslen))
3939	goto onError;
3940
3941    Py_XDECREF(separator);
3942    Py_DECREF(it);
3943    return (PyObject *)res;
3944
3945 onError:
3946    Py_XDECREF(separator);
3947    Py_XDECREF(res);
3948    Py_DECREF(it);
3949    return NULL;
3950}
3951
3952static
3953PyUnicodeObject *pad(PyUnicodeObject *self,
3954		     int left,
3955		     int right,
3956		     Py_UNICODE fill)
3957{
3958    PyUnicodeObject *u;
3959
3960    if (left < 0)
3961        left = 0;
3962    if (right < 0)
3963        right = 0;
3964
3965    if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
3966        Py_INCREF(self);
3967        return self;
3968    }
3969
3970    u = _PyUnicode_New(left + self->length + right);
3971    if (u) {
3972        if (left)
3973            Py_UNICODE_FILL(u->str, fill, left);
3974        Py_UNICODE_COPY(u->str + left, self->str, self->length);
3975        if (right)
3976            Py_UNICODE_FILL(u->str + left + self->length, fill, right);
3977    }
3978
3979    return u;
3980}
3981
/* Append the slice data[left:right] to list as a new Unicode object.
   The macro relies on the enclosing function providing a
   `PyObject *str` scratch variable, the `list` object and an
   `onError` label, which is jumped to if the slice cannot be created
   or appended.  The arguments are parenthesized and the body is
   wrapped in do { } while (0) so that the macro expands to a single
   statement. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        /* the list holds its own reference now */                      \
        Py_DECREF(str);                                                 \
    } while (0)
3992
3993static
3994PyObject *split_whitespace(PyUnicodeObject *self,
3995			   PyObject *list,
3996			   int maxcount)
3997{
3998    register int i;
3999    register int j;
4000    int len = self->length;
4001    PyObject *str;
4002
4003    for (i = j = 0; i < len; ) {
4004	/* find a token */
4005	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4006	    i++;
4007	j = i;
4008	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4009	    i++;
4010	if (j < i) {
4011	    if (maxcount-- <= 0)
4012		break;
4013	    SPLIT_APPEND(self->str, j, i);
4014	    while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4015		i++;
4016	    j = i;
4017	}
4018    }
4019    if (j < len) {
4020	SPLIT_APPEND(self->str, j, len);
4021    }
4022    return list;
4023
4024 onError:
4025    Py_DECREF(list);
4026    return NULL;
4027}
4028
4029PyObject *PyUnicode_Splitlines(PyObject *string,
4030			       int keepends)
4031{
4032    register int i;
4033    register int j;
4034    int len;
4035    PyObject *list;
4036    PyObject *str;
4037    Py_UNICODE *data;
4038
4039    string = PyUnicode_FromObject(string);
4040    if (string == NULL)
4041	return NULL;
4042    data = PyUnicode_AS_UNICODE(string);
4043    len = PyUnicode_GET_SIZE(string);
4044
4045    list = PyList_New(0);
4046    if (!list)
4047        goto onError;
4048
4049    for (i = j = 0; i < len; ) {
4050	int eol;
4051
4052	/* Find a line and append it */
4053	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
4054	    i++;
4055
4056	/* Skip the line break reading CRLF as one line break */
4057	eol = i;
4058	if (i < len) {
4059	    if (data[i] == '\r' && i + 1 < len &&
4060		data[i+1] == '\n')
4061		i += 2;
4062	    else
4063		i++;
4064	    if (keepends)
4065		eol = i;
4066	}
4067	SPLIT_APPEND(data, j, eol);
4068	j = i;
4069    }
4070    if (j < len) {
4071	SPLIT_APPEND(data, j, len);
4072    }
4073
4074    Py_DECREF(string);
4075    return list;
4076
4077 onError:
4078    Py_DECREF(list);
4079    Py_DECREF(string);
4080    return NULL;
4081}
4082
4083static
4084PyObject *split_char(PyUnicodeObject *self,
4085		     PyObject *list,
4086		     Py_UNICODE ch,
4087		     int maxcount)
4088{
4089    register int i;
4090    register int j;
4091    int len = self->length;
4092    PyObject *str;
4093
4094    for (i = j = 0; i < len; ) {
4095	if (self->str[i] == ch) {
4096	    if (maxcount-- <= 0)
4097		break;
4098	    SPLIT_APPEND(self->str, j, i);
4099	    i = j = i + 1;
4100	} else
4101	    i++;
4102    }
4103    if (j <= len) {
4104	SPLIT_APPEND(self->str, j, len);
4105    }
4106    return list;
4107
4108 onError:
4109    Py_DECREF(list);
4110    return NULL;
4111}
4112
4113static
4114PyObject *split_substring(PyUnicodeObject *self,
4115			  PyObject *list,
4116			  PyUnicodeObject *substring,
4117			  int maxcount)
4118{
4119    register int i;
4120    register int j;
4121    int len = self->length;
4122    int sublen = substring->length;
4123    PyObject *str;
4124
4125    for (i = j = 0; i <= len - sublen; ) {
4126	if (Py_UNICODE_MATCH(self, i, substring)) {
4127	    if (maxcount-- <= 0)
4128		break;
4129	    SPLIT_APPEND(self->str, j, i);
4130	    i = j = i + sublen;
4131	} else
4132	    i++;
4133    }
4134    if (j <= len) {
4135	SPLIT_APPEND(self->str, j, len);
4136    }
4137    return list;
4138
4139 onError:
4140    Py_DECREF(list);
4141    return NULL;
4142}
4143
4144#undef SPLIT_APPEND
4145
4146static
4147PyObject *split(PyUnicodeObject *self,
4148		PyUnicodeObject *substring,
4149		int maxcount)
4150{
4151    PyObject *list;
4152
4153    if (maxcount < 0)
4154        maxcount = INT_MAX;
4155
4156    list = PyList_New(0);
4157    if (!list)
4158        return NULL;
4159
4160    if (substring == NULL)
4161	return split_whitespace(self,list,maxcount);
4162
4163    else if (substring->length == 1)
4164	return split_char(self,list,substring->str[0],maxcount);
4165
4166    else if (substring->length == 0) {
4167	Py_DECREF(list);
4168	PyErr_SetString(PyExc_ValueError, "empty separator");
4169	return NULL;
4170    }
4171    else
4172	return split_substring(self,list,substring,maxcount);
4173}
4174
4175static
4176PyObject *replace(PyUnicodeObject *self,
4177		  PyUnicodeObject *str1,
4178		  PyUnicodeObject *str2,
4179		  int maxcount)
4180{
4181    PyUnicodeObject *u;
4182
4183    if (maxcount < 0)
4184	maxcount = INT_MAX;
4185
4186    if (str1->length == 1 && str2->length == 1) {
4187        int i;
4188
4189        /* replace characters */
4190        if (!findchar(self->str, self->length, str1->str[0]) &&
4191            PyUnicode_CheckExact(self)) {
4192            /* nothing to replace, return original string */
4193            Py_INCREF(self);
4194            u = self;
4195        } else {
4196	    Py_UNICODE u1 = str1->str[0];
4197	    Py_UNICODE u2 = str2->str[0];
4198
4199            u = (PyUnicodeObject*) PyUnicode_FromUnicode(
4200                NULL,
4201                self->length
4202                );
4203            if (u != NULL) {
4204		Py_UNICODE_COPY(u->str, self->str,
4205				self->length);
4206                for (i = 0; i < u->length; i++)
4207                    if (u->str[i] == u1) {
4208                        if (--maxcount < 0)
4209                            break;
4210                        u->str[i] = u2;
4211                    }
4212        }
4213        }
4214
4215    } else {
4216        int n, i;
4217        Py_UNICODE *p;
4218
4219        /* replace strings */
4220        n = count(self, 0, self->length, str1);
4221        if (n > maxcount)
4222            n = maxcount;
4223        if (n == 0) {
4224            /* nothing to replace, return original string */
4225            if (PyUnicode_CheckExact(self)) {
4226                Py_INCREF(self);
4227                u = self;
4228            }
4229            else {
4230                u = (PyUnicodeObject *)
4231                    PyUnicode_FromUnicode(self->str, self->length);
4232	    }
4233        } else {
4234            u = _PyUnicode_New(
4235                self->length + n * (str2->length - str1->length));
4236            if (u) {
4237                i = 0;
4238                p = u->str;
4239                if (str1->length > 0) {
4240                    while (i <= self->length - str1->length)
4241                        if (Py_UNICODE_MATCH(self, i, str1)) {
4242                            /* replace string segment */
4243                            Py_UNICODE_COPY(p, str2->str, str2->length);
4244                            p += str2->length;
4245                            i += str1->length;
4246                            if (--n <= 0) {
4247                                /* copy remaining part */
4248                                Py_UNICODE_COPY(p, self->str+i, self->length-i);
4249                                break;
4250                            }
4251                        } else
4252                            *p++ = self->str[i++];
4253                } else {
4254                    while (n > 0) {
4255                        Py_UNICODE_COPY(p, str2->str, str2->length);
4256                        p += str2->length;
4257                        if (--n <= 0)
4258                            break;
4259                        *p++ = self->str[i++];
4260                    }
4261                    Py_UNICODE_COPY(p, self->str+i, self->length-i);
4262                }
4263            }
4264        }
4265    }
4266
4267    return (PyObject *) u;
4268}
4269
4270/* --- Unicode Object Methods --------------------------------------------- */
4271
4272PyDoc_STRVAR(title__doc__,
4273"S.title() -> unicode\n\
4274\n\
4275Return a titlecased version of S, i.e. words start with title case\n\
4276characters, all remaining cased characters have lower case.");
4277
4278static PyObject*
4279unicode_title(PyUnicodeObject *self)
4280{
4281    return fixup(self, fixtitle);
4282}
4283
4284PyDoc_STRVAR(capitalize__doc__,
4285"S.capitalize() -> unicode\n\
4286\n\
4287Return a capitalized version of S, i.e. make the first character\n\
4288have upper case.");
4289
4290static PyObject*
4291unicode_capitalize(PyUnicodeObject *self)
4292{
4293    return fixup(self, fixcapitalize);
4294}
4295
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Compiled out.  Splits self into words, replaces each list entry by its
   fixcapitalize'd version and joins the list again.  Returns NULL if the
   split, any capitalization or the join fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined = NULL;
    int pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *old_word = PyList_GET_ITEM(words, pos);
        PyObject *new_word = fixup((PyUnicodeObject *)old_word,
                                   fixcapitalize);

        if (new_word == NULL)
            goto done;          /* joined is still NULL */
        Py_DECREF(old_word);
        PyList_SET_ITEM(words, pos, new_word);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return joined;
}
#endif
4333
4334PyDoc_STRVAR(center__doc__,
4335"S.center(width) -> unicode\n\
4336\n\
4337Return S centered in a Unicode string of length width. Padding is done\n\
4338using spaces.");
4339
4340static PyObject *
4341unicode_center(PyUnicodeObject *self, PyObject *args)
4342{
4343    int marg, left;
4344    int width;
4345
4346    if (!PyArg_ParseTuple(args, "i:center", &width))
4347        return NULL;
4348
4349    if (self->length >= width && PyUnicode_CheckExact(self)) {
4350        Py_INCREF(self);
4351        return (PyObject*) self;
4352    }
4353
4354    marg = width - self->length;
4355    left = marg / 2 + (marg & width & 1);
4356
4357    return (PyObject*) pad(self, left, marg - left, ' ');
4358}
4359
4360#if 0
4361
4362/* This code should go into some future Unicode collation support
4363   module. The basic comparison should compare ordinals on a naive
4364   basis (this is what Java does and thus JPython too). */
4365
4366/* speedy UTF-16 code point order comparison */
4367/* gleaned from: */
4368/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4369
4370static short utf16Fixup[32] =
4371{
4372    0, 0, 0, 0, 0, 0, 0, 0,
4373    0, 0, 0, 0, 0, 0, 0, 0,
4374    0, 0, 0, 0, 0, 0, 0, 0,
4375    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
4376};
4377
4378static int
4379unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4380{
4381    int len1, len2;
4382
4383    Py_UNICODE *s1 = str1->str;
4384    Py_UNICODE *s2 = str2->str;
4385
4386    len1 = str1->length;
4387    len2 = str2->length;
4388
4389    while (len1 > 0 && len2 > 0) {
4390        Py_UNICODE c1, c2;
4391
4392        c1 = *s1++;
4393        c2 = *s2++;
4394
4395	if (c1 > (1<<11) * 26)
4396	    c1 += utf16Fixup[c1>>11];
4397	if (c2 > (1<<11) * 26)
4398            c2 += utf16Fixup[c2>>11];
4399        /* now c1 and c2 are in UTF-32-compatible order */
4400
4401        if (c1 != c2)
4402            return (c1 < c2) ? -1 : 1;
4403
4404        len1--; len2--;
4405    }
4406
4407    return (len1 < len2) ? -1 : (len1 != len2);
4408}
4409
4410#else
4411
4412static int
4413unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4414{
4415    register int len1, len2;
4416
4417    Py_UNICODE *s1 = str1->str;
4418    Py_UNICODE *s2 = str2->str;
4419
4420    len1 = str1->length;
4421    len2 = str2->length;
4422
4423    while (len1 > 0 && len2 > 0) {
4424        Py_UNICODE c1, c2;
4425
4426        c1 = *s1++;
4427        c2 = *s2++;
4428
4429        if (c1 != c2)
4430            return (c1 < c2) ? -1 : 1;
4431
4432        len1--; len2--;
4433    }
4434
4435    return (len1 < len2) ? -1 : (len1 != len2);
4436}
4437
4438#endif
4439
4440int PyUnicode_Compare(PyObject *left,
4441		      PyObject *right)
4442{
4443    PyUnicodeObject *u = NULL, *v = NULL;
4444    int result;
4445
4446    /* Coerce the two arguments */
4447    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4448    if (u == NULL)
4449	goto onError;
4450    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4451    if (v == NULL)
4452	goto onError;
4453
4454    /* Shortcut for empty or interned objects */
4455    if (v == u) {
4456	Py_DECREF(u);
4457	Py_DECREF(v);
4458	return 0;
4459    }
4460
4461    result = unicode_compare(u, v);
4462
4463    Py_DECREF(u);
4464    Py_DECREF(v);
4465    return result;
4466
4467onError:
4468    Py_XDECREF(u);
4469    Py_XDECREF(v);
4470    return -1;
4471}
4472
4473int PyUnicode_Contains(PyObject *container,
4474		       PyObject *element)
4475{
4476    PyUnicodeObject *u = NULL, *v = NULL;
4477    int result, size;
4478    register const Py_UNICODE *lhs, *end, *rhs;
4479
4480    /* Coerce the two arguments */
4481    v = (PyUnicodeObject *)PyUnicode_FromObject(element);
4482    if (v == NULL) {
4483	PyErr_SetString(PyExc_TypeError,
4484	    "'in <string>' requires string as left operand");
4485	goto onError;
4486    }
4487    u = (PyUnicodeObject *)PyUnicode_FromObject(container);
4488    if (u == NULL)
4489	goto onError;
4490
4491    size = PyUnicode_GET_SIZE(v);
4492    rhs = PyUnicode_AS_UNICODE(v);
4493    lhs = PyUnicode_AS_UNICODE(u);
4494
4495    result = 0;
4496    if (size == 1) {
4497	end = lhs + PyUnicode_GET_SIZE(u);
4498	while (lhs < end) {
4499	    if (*lhs++ == *rhs) {
4500		result = 1;
4501		break;
4502	    }
4503	}
4504    }
4505    else {
4506	end = lhs + (PyUnicode_GET_SIZE(u) - size);
4507	while (lhs <= end) {
4508	    if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
4509		result = 1;
4510		break;
4511	    }
4512	}
4513    }
4514
4515    Py_DECREF(u);
4516    Py_DECREF(v);
4517    return result;
4518
4519onError:
4520    Py_XDECREF(u);
4521    Py_XDECREF(v);
4522    return -1;
4523}
4524
4525/* Concat to string or Unicode object giving a new Unicode object. */
4526
4527PyObject *PyUnicode_Concat(PyObject *left,
4528			   PyObject *right)
4529{
4530    PyUnicodeObject *u = NULL, *v = NULL, *w;
4531
4532    /* Coerce the two arguments */
4533    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4534    if (u == NULL)
4535	goto onError;
4536    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4537    if (v == NULL)
4538	goto onError;
4539
4540    /* Shortcuts */
4541    if (v == unicode_empty) {
4542	Py_DECREF(v);
4543	return (PyObject *)u;
4544    }
4545    if (u == unicode_empty) {
4546	Py_DECREF(u);
4547	return (PyObject *)v;
4548    }
4549
4550    /* Concat the two Unicode strings */
4551    w = _PyUnicode_New(u->length + v->length);
4552    if (w == NULL)
4553	goto onError;
4554    Py_UNICODE_COPY(w->str, u->str, u->length);
4555    Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
4556
4557    Py_DECREF(u);
4558    Py_DECREF(v);
4559    return (PyObject *)w;
4560
4561onError:
4562    Py_XDECREF(u);
4563    Py_XDECREF(v);
4564    return NULL;
4565}
4566
4567PyDoc_STRVAR(count__doc__,
4568"S.count(sub[, start[, end]]) -> int\n\
4569\n\
4570Return the number of occurrences of substring sub in Unicode string\n\
4571S[start:end].  Optional arguments start and end are\n\
4572interpreted as in slice notation.");
4573
4574static PyObject *
4575unicode_count(PyUnicodeObject *self, PyObject *args)
4576{
4577    PyUnicodeObject *substring;
4578    int start = 0;
4579    int end = INT_MAX;
4580    PyObject *result;
4581
4582    if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
4583		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4584        return NULL;
4585
4586    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4587						(PyObject *)substring);
4588    if (substring == NULL)
4589	return NULL;
4590
4591    if (start < 0)
4592        start += self->length;
4593    if (start < 0)
4594        start = 0;
4595    if (end > self->length)
4596        end = self->length;
4597    if (end < 0)
4598        end += self->length;
4599    if (end < 0)
4600        end = 0;
4601
4602    result = PyInt_FromLong((long) count(self, start, end, substring));
4603
4604    Py_DECREF(substring);
4605    return result;
4606}
4607
4608PyDoc_STRVAR(encode__doc__,
4609"S.encode([encoding[,errors]]) -> string\n\
4610\n\
4611Return an encoded string version of S. Default encoding is the current\n\
4612default string encoding. errors may be given to set a different error\n\
4613handling scheme. Default is 'strict' meaning that encoding errors raise\n\
4614a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
4615'xmlcharrefreplace' as well as any other name registered with\n\
4616codecs.register_error that can handle UnicodeEncodeErrors.");
4617
4618static PyObject *
4619unicode_encode(PyUnicodeObject *self, PyObject *args)
4620{
4621    char *encoding = NULL;
4622    char *errors = NULL;
4623    if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
4624        return NULL;
4625    return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
4626}
4627
4628PyDoc_STRVAR(expandtabs__doc__,
4629"S.expandtabs([tabsize]) -> unicode\n\
4630\n\
4631Return a copy of S where all tab characters are expanded using spaces.\n\
4632If tabsize is not given, a tab size of 8 characters is assumed.");
4633
4634static PyObject*
4635unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
4636{
4637    Py_UNICODE *e;
4638    Py_UNICODE *p;
4639    Py_UNICODE *q;
4640    int i, j;
4641    PyUnicodeObject *u;
4642    int tabsize = 8;
4643
4644    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
4645	return NULL;
4646
4647    /* First pass: determine size of output string */
4648    i = j = 0;
4649    e = self->str + self->length;
4650    for (p = self->str; p < e; p++)
4651        if (*p == '\t') {
4652	    if (tabsize > 0)
4653		j += tabsize - (j % tabsize);
4654	}
4655        else {
4656            j++;
4657            if (*p == '\n' || *p == '\r') {
4658                i += j;
4659                j = 0;
4660            }
4661        }
4662
4663    /* Second pass: create output string and fill it */
4664    u = _PyUnicode_New(i + j);
4665    if (!u)
4666        return NULL;
4667
4668    j = 0;
4669    q = u->str;
4670
4671    for (p = self->str; p < e; p++)
4672        if (*p == '\t') {
4673	    if (tabsize > 0) {
4674		i = tabsize - (j % tabsize);
4675		j += i;
4676		while (i--)
4677		    *q++ = ' ';
4678	    }
4679	}
4680	else {
4681            j++;
4682	    *q++ = *p;
4683            if (*p == '\n' || *p == '\r')
4684                j = 0;
4685        }
4686
4687    return (PyObject*) u;
4688}
4689
4690PyDoc_STRVAR(find__doc__,
4691"S.find(sub [,start [,end]]) -> int\n\
4692\n\
4693Return the lowest index in S where substring sub is found,\n\
4694such that sub is contained within s[start,end].  Optional\n\
4695arguments start and end are interpreted as in slice notation.\n\
4696\n\
4697Return -1 on failure.");
4698
4699static PyObject *
4700unicode_find(PyUnicodeObject *self, PyObject *args)
4701{
4702    PyUnicodeObject *substring;
4703    int start = 0;
4704    int end = INT_MAX;
4705    PyObject *result;
4706
4707    if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4708		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4709        return NULL;
4710    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4711						(PyObject *)substring);
4712    if (substring == NULL)
4713	return NULL;
4714
4715    result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4716
4717    Py_DECREF(substring);
4718    return result;
4719}
4720
4721static PyObject *
4722unicode_getitem(PyUnicodeObject *self, int index)
4723{
4724    if (index < 0 || index >= self->length) {
4725        PyErr_SetString(PyExc_IndexError, "string index out of range");
4726        return NULL;
4727    }
4728
4729    return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4730}
4731
4732static long
4733unicode_hash(PyUnicodeObject *self)
4734{
4735    /* Since Unicode objects compare equal to their ASCII string
4736       counterparts, they should use the individual character values
4737       as basis for their hash value.  This is needed to assure that
4738       strings and Unicode objects behave in the same way as
4739       dictionary keys. */
4740
4741    register int len;
4742    register Py_UNICODE *p;
4743    register long x;
4744
4745    if (self->hash != -1)
4746	return self->hash;
4747    len = PyUnicode_GET_SIZE(self);
4748    p = PyUnicode_AS_UNICODE(self);
4749    x = *p << 7;
4750    while (--len >= 0)
4751	x = (1000003*x) ^ *p++;
4752    x ^= PyUnicode_GET_SIZE(self);
4753    if (x == -1)
4754	x = -2;
4755    self->hash = x;
4756    return x;
4757}
4758
4759PyDoc_STRVAR(index__doc__,
4760"S.index(sub [,start [,end]]) -> int\n\
4761\n\
4762Like S.find() but raise ValueError when the substring is not found.");
4763
4764static PyObject *
4765unicode_index(PyUnicodeObject *self, PyObject *args)
4766{
4767    int result;
4768    PyUnicodeObject *substring;
4769    int start = 0;
4770    int end = INT_MAX;
4771
4772    if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4773		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4774        return NULL;
4775
4776    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4777						(PyObject *)substring);
4778    if (substring == NULL)
4779	return NULL;
4780
4781    result = findstring(self, substring, start, end, 1);
4782
4783    Py_DECREF(substring);
4784    if (result < 0) {
4785        PyErr_SetString(PyExc_ValueError, "substring not found");
4786        return NULL;
4787    }
4788    return PyInt_FromLong(result);
4789}
4790
4791PyDoc_STRVAR(islower__doc__,
4792"S.islower() -> bool\n\
4793\n\
4794Return True if all cased characters in S are lowercase and there is\n\
4795at least one cased character in S, False otherwise.");
4796
4797static PyObject*
4798unicode_islower(PyUnicodeObject *self)
4799{
4800    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4801    register const Py_UNICODE *e;
4802    int cased;
4803
4804    /* Shortcut for single character strings */
4805    if (PyUnicode_GET_SIZE(self) == 1)
4806	return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
4807
4808    /* Special case for empty strings */
4809    if (PyString_GET_SIZE(self) == 0)
4810	return PyBool_FromLong(0);
4811
4812    e = p + PyUnicode_GET_SIZE(self);
4813    cased = 0;
4814    for (; p < e; p++) {
4815	register const Py_UNICODE ch = *p;
4816
4817	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
4818	    return PyBool_FromLong(0);
4819	else if (!cased && Py_UNICODE_ISLOWER(ch))
4820	    cased = 1;
4821    }
4822    return PyBool_FromLong(cased);
4823}
4824
4825PyDoc_STRVAR(isupper__doc__,
4826"S.isupper() -> bool\n\
4827\n\
4828Return True if  all cased characters in S are uppercase and there is\n\
4829at least one cased character in S, False otherwise.");
4830
4831static PyObject*
4832unicode_isupper(PyUnicodeObject *self)
4833{
4834    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4835    register const Py_UNICODE *e;
4836    int cased;
4837
4838    /* Shortcut for single character strings */
4839    if (PyUnicode_GET_SIZE(self) == 1)
4840	return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
4841
4842    /* Special case for empty strings */
4843    if (PyString_GET_SIZE(self) == 0)
4844	return PyBool_FromLong(0);
4845
4846    e = p + PyUnicode_GET_SIZE(self);
4847    cased = 0;
4848    for (; p < e; p++) {
4849	register const Py_UNICODE ch = *p;
4850
4851	if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
4852	    return PyBool_FromLong(0);
4853	else if (!cased && Py_UNICODE_ISUPPER(ch))
4854	    cased = 1;
4855    }
4856    return PyBool_FromLong(cased);
4857}
4858
4859PyDoc_STRVAR(istitle__doc__,
4860"S.istitle() -> bool\n\
4861\n\
4862Return True if S is a titlecased string, i.e. upper- and titlecase\n\
4863characters may only follow uncased characters and lowercase characters\n\
4864only cased ones. Return False otherwise.");
4865
4866static PyObject*
4867unicode_istitle(PyUnicodeObject *self)
4868{
4869    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4870    register const Py_UNICODE *e;
4871    int cased, previous_is_cased;
4872
4873    /* Shortcut for single character strings */
4874    if (PyUnicode_GET_SIZE(self) == 1)
4875	return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4876			       (Py_UNICODE_ISUPPER(*p) != 0));
4877
4878    /* Special case for empty strings */
4879    if (PyString_GET_SIZE(self) == 0)
4880	return PyBool_FromLong(0);
4881
4882    e = p + PyUnicode_GET_SIZE(self);
4883    cased = 0;
4884    previous_is_cased = 0;
4885    for (; p < e; p++) {
4886	register const Py_UNICODE ch = *p;
4887
4888	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4889	    if (previous_is_cased)
4890		return PyBool_FromLong(0);
4891	    previous_is_cased = 1;
4892	    cased = 1;
4893	}
4894	else if (Py_UNICODE_ISLOWER(ch)) {
4895	    if (!previous_is_cased)
4896		return PyBool_FromLong(0);
4897	    previous_is_cased = 1;
4898	    cased = 1;
4899	}
4900	else
4901	    previous_is_cased = 0;
4902    }
4903    return PyBool_FromLong(cased);
4904}
4905
4906PyDoc_STRVAR(isspace__doc__,
4907"S.isspace() -> bool\n\
4908\n\
4909Return True if there are only whitespace characters in S,\n\
4910False otherwise.");
4911
4912static PyObject*
4913unicode_isspace(PyUnicodeObject *self)
4914{
4915    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4916    register const Py_UNICODE *e;
4917
4918    /* Shortcut for single character strings */
4919    if (PyUnicode_GET_SIZE(self) == 1 &&
4920	Py_UNICODE_ISSPACE(*p))
4921	return PyBool_FromLong(1);
4922
4923    /* Special case for empty strings */
4924    if (PyString_GET_SIZE(self) == 0)
4925	return PyBool_FromLong(0);
4926
4927    e = p + PyUnicode_GET_SIZE(self);
4928    for (; p < e; p++) {
4929	if (!Py_UNICODE_ISSPACE(*p))
4930	    return PyBool_FromLong(0);
4931    }
4932    return PyBool_FromLong(1);
4933}
4934
4935PyDoc_STRVAR(isalpha__doc__,
4936"S.isalpha() -> bool\n\
4937\n\
4938Return True if  all characters in S are alphabetic\n\
4939and there is at least one character in S, False otherwise.");
4940
4941static PyObject*
4942unicode_isalpha(PyUnicodeObject *self)
4943{
4944    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4945    register const Py_UNICODE *e;
4946
4947    /* Shortcut for single character strings */
4948    if (PyUnicode_GET_SIZE(self) == 1 &&
4949	Py_UNICODE_ISALPHA(*p))
4950	return PyBool_FromLong(1);
4951
4952    /* Special case for empty strings */
4953    if (PyString_GET_SIZE(self) == 0)
4954	return PyBool_FromLong(0);
4955
4956    e = p + PyUnicode_GET_SIZE(self);
4957    for (; p < e; p++) {
4958	if (!Py_UNICODE_ISALPHA(*p))
4959	    return PyBool_FromLong(0);
4960    }
4961    return PyBool_FromLong(1);
4962}
4963
4964PyDoc_STRVAR(isalnum__doc__,
4965"S.isalnum() -> bool\n\
4966\n\
4967Return True if  all characters in S are alphanumeric\n\
4968and there is at least one character in S, False otherwise.");
4969
4970static PyObject*
4971unicode_isalnum(PyUnicodeObject *self)
4972{
4973    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4974    register const Py_UNICODE *e;
4975
4976    /* Shortcut for single character strings */
4977    if (PyUnicode_GET_SIZE(self) == 1 &&
4978	Py_UNICODE_ISALNUM(*p))
4979	return PyBool_FromLong(1);
4980
4981    /* Special case for empty strings */
4982    if (PyString_GET_SIZE(self) == 0)
4983	return PyBool_FromLong(0);
4984
4985    e = p + PyUnicode_GET_SIZE(self);
4986    for (; p < e; p++) {
4987	if (!Py_UNICODE_ISALNUM(*p))
4988	    return PyBool_FromLong(0);
4989    }
4990    return PyBool_FromLong(1);
4991}
4992
4993PyDoc_STRVAR(isdecimal__doc__,
4994"S.isdecimal() -> bool\n\
4995\n\
4996Return True if there are only decimal characters in S,\n\
4997False otherwise.");
4998
4999static PyObject*
5000unicode_isdecimal(PyUnicodeObject *self)
5001{
5002    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5003    register const Py_UNICODE *e;
5004
5005    /* Shortcut for single character strings */
5006    if (PyUnicode_GET_SIZE(self) == 1 &&
5007	Py_UNICODE_ISDECIMAL(*p))
5008	return PyBool_FromLong(1);
5009
5010    /* Special case for empty strings */
5011    if (PyString_GET_SIZE(self) == 0)
5012	return PyBool_FromLong(0);
5013
5014    e = p + PyUnicode_GET_SIZE(self);
5015    for (; p < e; p++) {
5016	if (!Py_UNICODE_ISDECIMAL(*p))
5017	    return PyBool_FromLong(0);
5018    }
5019    return PyBool_FromLong(1);
5020}
5021
5022PyDoc_STRVAR(isdigit__doc__,
5023"S.isdigit() -> bool\n\
5024\n\
5025Return True if there are only digit characters in S,\n\
5026False otherwise.");
5027
5028static PyObject*
5029unicode_isdigit(PyUnicodeObject *self)
5030{
5031    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5032    register const Py_UNICODE *e;
5033
5034    /* Shortcut for single character strings */
5035    if (PyUnicode_GET_SIZE(self) == 1 &&
5036	Py_UNICODE_ISDIGIT(*p))
5037	return PyBool_FromLong(1);
5038
5039    /* Special case for empty strings */
5040    if (PyString_GET_SIZE(self) == 0)
5041	return PyBool_FromLong(0);
5042
5043    e = p + PyUnicode_GET_SIZE(self);
5044    for (; p < e; p++) {
5045	if (!Py_UNICODE_ISDIGIT(*p))
5046	    return PyBool_FromLong(0);
5047    }
5048    return PyBool_FromLong(1);
5049}
5050
5051PyDoc_STRVAR(isnumeric__doc__,
5052"S.isnumeric() -> bool\n\
5053\n\
5054Return True if there are only numeric characters in S,\n\
5055False otherwise.");
5056
5057static PyObject*
5058unicode_isnumeric(PyUnicodeObject *self)
5059{
5060    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5061    register const Py_UNICODE *e;
5062
5063    /* Shortcut for single character strings */
5064    if (PyUnicode_GET_SIZE(self) == 1 &&
5065	Py_UNICODE_ISNUMERIC(*p))
5066	return PyBool_FromLong(1);
5067
5068    /* Special case for empty strings */
5069    if (PyString_GET_SIZE(self) == 0)
5070	return PyBool_FromLong(0);
5071
5072    e = p + PyUnicode_GET_SIZE(self);
5073    for (; p < e; p++) {
5074	if (!Py_UNICODE_ISNUMERIC(*p))
5075	    return PyBool_FromLong(0);
5076    }
5077    return PyBool_FromLong(1);
5078}
5079
5080PyDoc_STRVAR(join__doc__,
5081"S.join(sequence) -> unicode\n\
5082\n\
5083Return a string which is the concatenation of the strings in the\n\
5084sequence.  The separator between elements is S.");
5085
5086static PyObject*
5087unicode_join(PyObject *self, PyObject *data)
5088{
5089    return PyUnicode_Join(self, data);
5090}
5091
5092static int
5093unicode_length(PyUnicodeObject *self)
5094{
5095    return self->length;
5096}
5097
5098PyDoc_STRVAR(ljust__doc__,
5099"S.ljust(width) -> unicode\n\
5100\n\
5101Return S left justified in a Unicode string of length width. Padding is\n\
5102done using spaces.");
5103
5104static PyObject *
5105unicode_ljust(PyUnicodeObject *self, PyObject *args)
5106{
5107    int width;
5108    if (!PyArg_ParseTuple(args, "i:ljust", &width))
5109        return NULL;
5110
5111    if (self->length >= width && PyUnicode_CheckExact(self)) {
5112        Py_INCREF(self);
5113        return (PyObject*) self;
5114    }
5115
5116    return (PyObject*) pad(self, 0, width - self->length, ' ');
5117}
5118
5119PyDoc_STRVAR(lower__doc__,
5120"S.lower() -> unicode\n\
5121\n\
5122Return a copy of the string S converted to lowercase.");
5123
5124static PyObject*
5125unicode_lower(PyUnicodeObject *self)
5126{
5127    return fixup(self, fixlower);
5128}
5129
/* Strip modes passed as the striptype argument of do_strip(),
   do_argstrip() and _PyUnicode_XStrip(); they double as indices into
   stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
/* PyArg_ParseTuple() format strings for lstrip/rstrip/strip, one per
   strip mode. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: skips the three-character "|O:" prefix
   of the matching format string. */
#define STRIPNAME(i) (stripformat[i]+3)
5138
5139static const Py_UNICODE *
5140unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
5141{
5142	size_t i;
5143	for (i = 0; i < n; ++i)
5144		if (s[i] == c)
5145			return s+i;
5146	return NULL;
5147}
5148
5149/* externally visible for str.strip(unicode) */
5150PyObject *
5151_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5152{
5153	Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5154	int len = PyUnicode_GET_SIZE(self);
5155	Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
5156	int seplen = PyUnicode_GET_SIZE(sepobj);
5157	int i, j;
5158
5159	i = 0;
5160	if (striptype != RIGHTSTRIP) {
5161		while (i < len && unicode_memchr(sep, s[i], seplen)) {
5162			i++;
5163		}
5164	}
5165
5166	j = len;
5167	if (striptype != LEFTSTRIP) {
5168		do {
5169			j--;
5170		} while (j >= i && unicode_memchr(sep, s[j], seplen));
5171		j++;
5172	}
5173
5174	if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5175		Py_INCREF(self);
5176		return (PyObject*)self;
5177	}
5178	else
5179		return PyUnicode_FromUnicode(s+i, j-i);
5180}
5181
5182
5183static PyObject *
5184do_strip(PyUnicodeObject *self, int striptype)
5185{
5186	Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5187	int len = PyUnicode_GET_SIZE(self), i, j;
5188
5189	i = 0;
5190	if (striptype != RIGHTSTRIP) {
5191		while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5192			i++;
5193		}
5194	}
5195
5196	j = len;
5197	if (striptype != LEFTSTRIP) {
5198		do {
5199			j--;
5200		} while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5201		j++;
5202	}
5203
5204	if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5205		Py_INCREF(self);
5206		return (PyObject*)self;
5207	}
5208	else
5209		return PyUnicode_FromUnicode(s+i, j-i);
5210}
5211
5212
5213static PyObject *
5214do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5215{
5216	PyObject *sep = NULL;
5217
5218	if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5219		return NULL;
5220
5221	if (sep != NULL && sep != Py_None) {
5222		if (PyUnicode_Check(sep))
5223			return _PyUnicode_XStrip(self, striptype, sep);
5224		else if (PyString_Check(sep)) {
5225			PyObject *res;
5226			sep = PyUnicode_FromObject(sep);
5227			if (sep==NULL)
5228				return NULL;
5229			res = _PyUnicode_XStrip(self, striptype, sep);
5230			Py_DECREF(sep);
5231			return res;
5232		}
5233		else {
5234			PyErr_Format(PyExc_TypeError,
5235				     "%s arg must be None, unicode or str",
5236				     STRIPNAME(striptype));
5237			return NULL;
5238		}
5239	}
5240
5241	return do_strip(self, striptype);
5242}
5243
5244
5245PyDoc_STRVAR(strip__doc__,
5246"S.strip([sep]) -> unicode\n\
5247\n\
5248Return a copy of the string S with leading and trailing\n\
5249whitespace removed.\n\
5250If sep is given and not None, remove characters in sep instead.\n\
5251If sep is a str, it will be converted to unicode before stripping");
5252
5253static PyObject *
5254unicode_strip(PyUnicodeObject *self, PyObject *args)
5255{
5256	if (PyTuple_GET_SIZE(args) == 0)
5257		return do_strip(self, BOTHSTRIP); /* Common case */
5258	else
5259		return do_argstrip(self, BOTHSTRIP, args);
5260}
5261
5262
5263PyDoc_STRVAR(lstrip__doc__,
5264"S.lstrip([sep]) -> unicode\n\
5265\n\
5266Return a copy of the string S with leading whitespace removed.\n\
5267If sep is given and not None, remove characters in sep instead.\n\
5268If sep is a str, it will be converted to unicode before stripping");
5269
5270static PyObject *
5271unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5272{
5273	if (PyTuple_GET_SIZE(args) == 0)
5274		return do_strip(self, LEFTSTRIP); /* Common case */
5275	else
5276		return do_argstrip(self, LEFTSTRIP, args);
5277}
5278
5279
5280PyDoc_STRVAR(rstrip__doc__,
5281"S.rstrip([sep]) -> unicode\n\
5282\n\
5283Return a copy of the string S with trailing whitespace removed.\n\
5284If sep is given and not None, remove characters in sep instead.\n\
5285If sep is a str, it will be converted to unicode before stripping");
5286
5287static PyObject *
5288unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5289{
5290	if (PyTuple_GET_SIZE(args) == 0)
5291		return do_strip(self, RIGHTSTRIP); /* Common case */
5292	else
5293		return do_argstrip(self, RIGHTSTRIP, args);
5294}
5295
5296
5297static PyObject*
5298unicode_repeat(PyUnicodeObject *str, int len)
5299{
5300    PyUnicodeObject *u;
5301    Py_UNICODE *p;
5302    int nchars;
5303    size_t nbytes;
5304
5305    if (len < 0)
5306        len = 0;
5307
5308    if (len == 1 && PyUnicode_CheckExact(str)) {
5309        /* no repeat, return original string */
5310        Py_INCREF(str);
5311        return (PyObject*) str;
5312    }
5313
5314    /* ensure # of chars needed doesn't overflow int and # of bytes
5315     * needed doesn't overflow size_t
5316     */
5317    nchars = len * str->length;
5318    if (len && nchars / len != str->length) {
5319        PyErr_SetString(PyExc_OverflowError,
5320                        "repeated string is too long");
5321        return NULL;
5322    }
5323    nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5324    if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5325        PyErr_SetString(PyExc_OverflowError,
5326                        "repeated string is too long");
5327        return NULL;
5328    }
5329    u = _PyUnicode_New(nchars);
5330    if (!u)
5331        return NULL;
5332
5333    p = u->str;
5334
5335    while (len-- > 0) {
5336        Py_UNICODE_COPY(p, str->str, str->length);
5337        p += str->length;
5338    }
5339
5340    return (PyObject*) u;
5341}
5342
5343PyObject *PyUnicode_Replace(PyObject *obj,
5344			    PyObject *subobj,
5345			    PyObject *replobj,
5346			    int maxcount)
5347{
5348    PyObject *self;
5349    PyObject *str1;
5350    PyObject *str2;
5351    PyObject *result;
5352
5353    self = PyUnicode_FromObject(obj);
5354    if (self == NULL)
5355	return NULL;
5356    str1 = PyUnicode_FromObject(subobj);
5357    if (str1 == NULL) {
5358	Py_DECREF(self);
5359	return NULL;
5360    }
5361    str2 = PyUnicode_FromObject(replobj);
5362    if (str2 == NULL) {
5363	Py_DECREF(self);
5364	Py_DECREF(str1);
5365	return NULL;
5366    }
5367    result = replace((PyUnicodeObject *)self,
5368		     (PyUnicodeObject *)str1,
5369		     (PyUnicodeObject *)str2,
5370		     maxcount);
5371    Py_DECREF(self);
5372    Py_DECREF(str1);
5373    Py_DECREF(str2);
5374    return result;
5375}
5376
5377PyDoc_STRVAR(replace__doc__,
5378"S.replace (old, new[, maxsplit]) -> unicode\n\
5379\n\
5380Return a copy of S with all occurrences of substring\n\
5381old replaced by new.  If the optional argument maxsplit is\n\
5382given, only the first maxsplit occurrences are replaced.");
5383
5384static PyObject*
5385unicode_replace(PyUnicodeObject *self, PyObject *args)
5386{
5387    PyUnicodeObject *str1;
5388    PyUnicodeObject *str2;
5389    int maxcount = -1;
5390    PyObject *result;
5391
5392    if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
5393        return NULL;
5394    str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5395    if (str1 == NULL)
5396	return NULL;
5397    str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
5398    if (str2 == NULL)
5399	return NULL;
5400
5401    result = replace(self, str1, str2, maxcount);
5402
5403    Py_DECREF(str1);
5404    Py_DECREF(str2);
5405    return result;
5406}
5407
5408static
5409PyObject *unicode_repr(PyObject *unicode)
5410{
5411    return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5412				PyUnicode_GET_SIZE(unicode),
5413				1);
5414}
5415
5416PyDoc_STRVAR(rfind__doc__,
5417"S.rfind(sub [,start [,end]]) -> int\n\
5418\n\
5419Return the highest index in S where substring sub is found,\n\
5420such that sub is contained within s[start,end].  Optional\n\
5421arguments start and end are interpreted as in slice notation.\n\
5422\n\
5423Return -1 on failure.");
5424
5425static PyObject *
5426unicode_rfind(PyUnicodeObject *self, PyObject *args)
5427{
5428    PyUnicodeObject *substring;
5429    int start = 0;
5430    int end = INT_MAX;
5431    PyObject *result;
5432
5433    if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
5434		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5435        return NULL;
5436    substring = (PyUnicodeObject *)PyUnicode_FromObject(
5437						(PyObject *)substring);
5438    if (substring == NULL)
5439	return NULL;
5440
5441    result = PyInt_FromLong(findstring(self, substring, start, end, -1));
5442
5443    Py_DECREF(substring);
5444    return result;
5445}
5446
5447PyDoc_STRVAR(rindex__doc__,
5448"S.rindex(sub [,start [,end]]) -> int\n\
5449\n\
5450Like S.rfind() but raise ValueError when the substring is not found.");
5451
5452static PyObject *
5453unicode_rindex(PyUnicodeObject *self, PyObject *args)
5454{
5455    int result;
5456    PyUnicodeObject *substring;
5457    int start = 0;
5458    int end = INT_MAX;
5459
5460    if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
5461		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5462        return NULL;
5463    substring = (PyUnicodeObject *)PyUnicode_FromObject(
5464						(PyObject *)substring);
5465    if (substring == NULL)
5466	return NULL;
5467
5468    result = findstring(self, substring, start, end, -1);
5469
5470    Py_DECREF(substring);
5471    if (result < 0) {
5472        PyErr_SetString(PyExc_ValueError, "substring not found");
5473        return NULL;
5474    }
5475    return PyInt_FromLong(result);
5476}
5477
5478PyDoc_STRVAR(rjust__doc__,
5479"S.rjust(width) -> unicode\n\
5480\n\
5481Return S right justified in a Unicode string of length width. Padding is\n\
5482done using spaces.");
5483
5484static PyObject *
5485unicode_rjust(PyUnicodeObject *self, PyObject *args)
5486{
5487    int width;
5488    if (!PyArg_ParseTuple(args, "i:rjust", &width))
5489        return NULL;
5490
5491    if (self->length >= width && PyUnicode_CheckExact(self)) {
5492        Py_INCREF(self);
5493        return (PyObject*) self;
5494    }
5495
5496    return (PyObject*) pad(self, width - self->length, 0, ' ');
5497}
5498
5499static PyObject*
5500unicode_slice(PyUnicodeObject *self, int start, int end)
5501{
5502    /* standard clamping */
5503    if (start < 0)
5504        start = 0;
5505    if (end < 0)
5506        end = 0;
5507    if (end > self->length)
5508        end = self->length;
5509    if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
5510        /* full slice, return original string */
5511        Py_INCREF(self);
5512        return (PyObject*) self;
5513    }
5514    if (start > end)
5515        start = end;
5516    /* copy slice */
5517    return (PyObject*) PyUnicode_FromUnicode(self->str + start,
5518					     end - start);
5519}
5520
5521PyObject *PyUnicode_Split(PyObject *s,
5522			  PyObject *sep,
5523			  int maxsplit)
5524{
5525    PyObject *result;
5526
5527    s = PyUnicode_FromObject(s);
5528    if (s == NULL)
5529	return NULL;
5530    if (sep != NULL) {
5531	sep = PyUnicode_FromObject(sep);
5532	if (sep == NULL) {
5533	    Py_DECREF(s);
5534	    return NULL;
5535	}
5536    }
5537
5538    result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
5539
5540    Py_DECREF(s);
5541    Py_XDECREF(sep);
5542    return result;
5543}
5544
5545PyDoc_STRVAR(split__doc__,
5546"S.split([sep [,maxsplit]]) -> list of strings\n\
5547\n\
5548Return a list of the words in S, using sep as the\n\
5549delimiter string.  If maxsplit is given, at most maxsplit\n\
5550splits are done. If sep is not specified, any whitespace string\n\
5551is a separator.");
5552
5553static PyObject*
5554unicode_split(PyUnicodeObject *self, PyObject *args)
5555{
5556    PyObject *substring = Py_None;
5557    int maxcount = -1;
5558
5559    if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
5560        return NULL;
5561
5562    if (substring == Py_None)
5563	return split(self, NULL, maxcount);
5564    else if (PyUnicode_Check(substring))
5565	return split(self, (PyUnicodeObject *)substring, maxcount);
5566    else
5567	return PyUnicode_Split((PyObject *)self, substring, maxcount);
5568}
5569
5570PyDoc_STRVAR(splitlines__doc__,
5571"S.splitlines([keepends]]) -> list of strings\n\
5572\n\
5573Return a list of the lines in S, breaking at line boundaries.\n\
5574Line breaks are not included in the resulting list unless keepends\n\
5575is given and true.");
5576
5577static PyObject*
5578unicode_splitlines(PyUnicodeObject *self, PyObject *args)
5579{
5580    int keepends = 0;
5581
5582    if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
5583        return NULL;
5584
5585    return PyUnicode_Splitlines((PyObject *)self, keepends);
5586}
5587
5588static
5589PyObject *unicode_str(PyUnicodeObject *self)
5590{
5591    return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
5592}
5593
5594PyDoc_STRVAR(swapcase__doc__,
5595"S.swapcase() -> unicode\n\
5596\n\
5597Return a copy of S with uppercase characters converted to lowercase\n\
5598and vice versa.");
5599
5600static PyObject*
5601unicode_swapcase(PyUnicodeObject *self)
5602{
5603    return fixup(self, fixswapcase);
5604}
5605
5606PyDoc_STRVAR(translate__doc__,
5607"S.translate(table) -> unicode\n\
5608\n\
5609Return a copy of the string S, where all characters have been mapped\n\
5610through the given translation table, which must be a mapping of\n\
5611Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
5612Unmapped characters are left untouched. Characters mapped to None\n\
5613are deleted.");
5614
5615static PyObject*
5616unicode_translate(PyUnicodeObject *self, PyObject *table)
5617{
5618    return PyUnicode_TranslateCharmap(self->str,
5619				      self->length,
5620				      table,
5621				      "ignore");
5622}
5623
5624PyDoc_STRVAR(upper__doc__,
5625"S.upper() -> unicode\n\
5626\n\
5627Return a copy of S converted to uppercase.");
5628
5629static PyObject*
5630unicode_upper(PyUnicodeObject *self)
5631{
5632    return fixup(self, fixupper);
5633}
5634
5635PyDoc_STRVAR(zfill__doc__,
5636"S.zfill(width) -> unicode\n\
5637\n\
5638Pad a numeric string x with zeros on the left, to fill a field\n\
5639of the specified width. The string x is never truncated.");
5640
5641static PyObject *
5642unicode_zfill(PyUnicodeObject *self, PyObject *args)
5643{
5644    int fill;
5645    PyUnicodeObject *u;
5646
5647    int width;
5648    if (!PyArg_ParseTuple(args, "i:zfill", &width))
5649        return NULL;
5650
5651    if (self->length >= width) {
5652        if (PyUnicode_CheckExact(self)) {
5653            Py_INCREF(self);
5654            return (PyObject*) self;
5655        }
5656        else
5657            return PyUnicode_FromUnicode(
5658                PyUnicode_AS_UNICODE(self),
5659                PyUnicode_GET_SIZE(self)
5660            );
5661    }
5662
5663    fill = width - self->length;
5664
5665    u = pad(self, fill, 0, '0');
5666
5667    if (u == NULL)
5668        return NULL;
5669
5670    if (u->str[fill] == '+' || u->str[fill] == '-') {
5671        /* move sign to beginning of string */
5672        u->str[0] = u->str[fill];
5673        u->str[fill] = '0';
5674    }
5675
5676    return (PyObject*) u;
5677}
5678
#if 0
/* Compiled out.  Returns the current value of unicode_freelist_size as a
   Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
5686
5687PyDoc_STRVAR(startswith__doc__,
5688"S.startswith(prefix[, start[, end]]) -> bool\n\
5689\n\
5690Return True if S starts with the specified prefix, False otherwise.  With\n\
5691optional start, test S beginning at that position.  With optional end, stop\n\
5692comparing S at that position.");
5693
5694static PyObject *
5695unicode_startswith(PyUnicodeObject *self,
5696		   PyObject *args)
5697{
5698    PyUnicodeObject *substring;
5699    int start = 0;
5700    int end = INT_MAX;
5701    PyObject *result;
5702
5703    if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
5704		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5705	return NULL;
5706    substring = (PyUnicodeObject *)PyUnicode_FromObject(
5707						(PyObject *)substring);
5708    if (substring == NULL)
5709	return NULL;
5710
5711    result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
5712
5713    Py_DECREF(substring);
5714    return result;
5715}
5716
5717
5718PyDoc_STRVAR(endswith__doc__,
5719"S.endswith(suffix[, start[, end]]) -> bool\n\
5720\n\
5721Return True if S ends with the specified suffix, False otherwise.  With\n\
5722optional start, test S beginning at that position.  With optional end, stop\n\
5723comparing S at that position.");
5724
5725static PyObject *
5726unicode_endswith(PyUnicodeObject *self,
5727		 PyObject *args)
5728{
5729    PyUnicodeObject *substring;
5730    int start = 0;
5731    int end = INT_MAX;
5732    PyObject *result;
5733
5734    if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
5735		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5736	return NULL;
5737    substring = (PyUnicodeObject *)PyUnicode_FromObject(
5738						(PyObject *)substring);
5739    if (substring == NULL)
5740	return NULL;
5741
5742    result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
5743
5744    Py_DECREF(substring);
5745    return result;
5746}
5747
5748
5749static PyMethodDef unicode_methods[] = {
5750
5751    /* Order is according to common usage: often used methods should
5752       appear first, since lookup is done sequentially. */
5753
5754    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
5755    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
5756    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
5757    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
5758    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
5759    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
5760    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
5761    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
5762    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
5763    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
5764    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
5765    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
5766    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
5767    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
5768/*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
5769    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
5770    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
5771    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
5772    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
5773    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
5774    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
5775    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
5776    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
5777    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
5778    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
5779    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
5780    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
5781    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
5782    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
5783    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
5784    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
5785    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
5786    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
5787    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
5788    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
5789    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
5790#if 0
5791    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
5792#endif
5793
5794#if 0
5795    /* This one is just used for debugging the implementation. */
5796    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
5797#endif
5798
5799    {NULL, NULL}
5800};
5801
5802static PyObject *
5803unicode_mod(PyObject *v, PyObject *w)
5804{
5805       if (!PyUnicode_Check(v)) {
5806               Py_INCREF(Py_NotImplemented);
5807               return Py_NotImplemented;
5808       }
5809       return PyUnicode_Format(v, w);
5810}
5811
5812static PyNumberMethods unicode_as_number = {
5813	0,				/*nb_add*/
5814	0,				/*nb_subtract*/
5815	0,				/*nb_multiply*/
5816	0,				/*nb_divide*/
5817	unicode_mod,			/*nb_remainder*/
5818};
5819
5820static PySequenceMethods unicode_as_sequence = {
5821    (inquiry) unicode_length, 		/* sq_length */
5822    (binaryfunc) PyUnicode_Concat, 	/* sq_concat */
5823    (intargfunc) unicode_repeat, 	/* sq_repeat */
5824    (intargfunc) unicode_getitem, 	/* sq_item */
5825    (intintargfunc) unicode_slice, 	/* sq_slice */
5826    0, 					/* sq_ass_item */
5827    0, 					/* sq_ass_slice */
5828    (objobjproc)PyUnicode_Contains, 	/*sq_contains*/
5829};
5830
5831static PyObject*
5832unicode_subscript(PyUnicodeObject* self, PyObject* item)
5833{
5834    if (PyInt_Check(item)) {
5835        long i = PyInt_AS_LONG(item);
5836        if (i < 0)
5837            i += PyString_GET_SIZE(self);
5838        return unicode_getitem(self, i);
5839    } else if (PyLong_Check(item)) {
5840        long i = PyLong_AsLong(item);
5841        if (i == -1 && PyErr_Occurred())
5842            return NULL;
5843        if (i < 0)
5844            i += PyString_GET_SIZE(self);
5845        return unicode_getitem(self, i);
5846    } else if (PySlice_Check(item)) {
5847        int start, stop, step, slicelength, cur, i;
5848        Py_UNICODE* source_buf;
5849        Py_UNICODE* result_buf;
5850        PyObject* result;
5851
5852        if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self),
5853				 &start, &stop, &step, &slicelength) < 0) {
5854            return NULL;
5855        }
5856
5857        if (slicelength <= 0) {
5858            return PyUnicode_FromUnicode(NULL, 0);
5859        } else {
5860            source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
5861            result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
5862
5863            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
5864                result_buf[i] = source_buf[cur];
5865            }
5866
5867            result = PyUnicode_FromUnicode(result_buf, slicelength);
5868            PyMem_FREE(result_buf);
5869            return result;
5870        }
5871    } else {
5872        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
5873        return NULL;
5874    }
5875}
5876
5877static PyMappingMethods unicode_as_mapping = {
5878    (inquiry)unicode_length,		/* mp_length */
5879    (binaryfunc)unicode_subscript,	/* mp_subscript */
5880    (objobjargproc)0,			/* mp_ass_subscript */
5881};
5882
5883static int
5884unicode_buffer_getreadbuf(PyUnicodeObject *self,
5885			  int index,
5886			  const void **ptr)
5887{
5888    if (index != 0) {
5889        PyErr_SetString(PyExc_SystemError,
5890			"accessing non-existent unicode segment");
5891        return -1;
5892    }
5893    *ptr = (void *) self->str;
5894    return PyUnicode_GET_DATA_SIZE(self);
5895}
5896
5897static int
5898unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
5899			   const void **ptr)
5900{
5901    PyErr_SetString(PyExc_TypeError,
5902		    "cannot use unicode as modifiable buffer");
5903    return -1;
5904}
5905
5906static int
5907unicode_buffer_getsegcount(PyUnicodeObject *self,
5908			   int *lenp)
5909{
5910    if (lenp)
5911        *lenp = PyUnicode_GET_DATA_SIZE(self);
5912    return 1;
5913}
5914
5915static int
5916unicode_buffer_getcharbuf(PyUnicodeObject *self,
5917			  int index,
5918			  const void **ptr)
5919{
5920    PyObject *str;
5921
5922    if (index != 0) {
5923        PyErr_SetString(PyExc_SystemError,
5924			"accessing non-existent unicode segment");
5925        return -1;
5926    }
5927    str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
5928    if (str == NULL)
5929	return -1;
5930    *ptr = (void *) PyString_AS_STRING(str);
5931    return PyString_GET_SIZE(str);
5932}
5933
5934/* Helpers for PyUnicode_Format() */
5935
5936static PyObject *
5937getnextarg(PyObject *args, int arglen, int *p_argidx)
5938{
5939    int argidx = *p_argidx;
5940    if (argidx < arglen) {
5941	(*p_argidx)++;
5942	if (arglen < 0)
5943	    return args;
5944	else
5945	    return PyTuple_GetItem(args, argidx);
5946    }
5947    PyErr_SetString(PyExc_TypeError,
5948		    "not enough arguments for format string");
5949    return NULL;
5950}
5951
/* Bit flags collected from the flag characters of a format specifier. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
5957
5958static
5959int usprintf(register Py_UNICODE *buffer, char *format, ...)
5960{
5961    register int i;
5962    int len;
5963    va_list va;
5964    char *charbuffer;
5965    va_start(va, format);
5966
5967    /* First, format the string as char array, then expand to Py_UNICODE
5968       array. */
5969    charbuffer = (char *)buffer;
5970    len = vsprintf(charbuffer, format, va);
5971    for (i = len - 1; i >= 0; i--)
5972	buffer[i] = (Py_UNICODE) charbuffer[i];
5973
5974    va_end(va);
5975    return len;
5976}
5977
5978/* XXX To save some code duplication, formatfloat/long/int could have been
5979   shared with stringobject.c, converting from 8-bit to Unicode after the
5980   formatting is done. */
5981
5982static int
5983formatfloat(Py_UNICODE *buf,
5984	    size_t buflen,
5985	    int flags,
5986	    int prec,
5987	    int type,
5988	    PyObject *v)
5989{
5990    /* fmt = '%#.' + `prec` + `type`
5991       worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
5992    char fmt[20];
5993    double x;
5994
5995    x = PyFloat_AsDouble(v);
5996    if (x == -1.0 && PyErr_Occurred())
5997	return -1;
5998    if (prec < 0)
5999	prec = 6;
6000    if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6001	type = 'g';
6002    PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6003		  (flags & F_ALT) ? "#" : "", prec, type);
6004    /* worst case length calc to ensure no buffer overrun:
6005         fmt = %#.<prec>g
6006         buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6007            for any double rep.)
6008         len = 1 + prec + 1 + 2 + 5 = 9 + prec
6009       If prec=0 the effective precision is 1 (the leading digit is
6010       always given), therefore increase by one to 10+prec. */
6011    if (buflen <= (size_t)10 + (size_t)prec) {
6012	PyErr_SetString(PyExc_OverflowError,
6013	    "formatted float is too long (precision too long?)");
6014	return -1;
6015    }
6016    return usprintf(buf, fmt, x);
6017}
6018
6019static PyObject*
6020formatlong(PyObject *val, int flags, int prec, int type)
6021{
6022	char *buf;
6023	int i, len;
6024	PyObject *str; /* temporary string object. */
6025	PyUnicodeObject *result;
6026
6027	str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6028	if (!str)
6029		return NULL;
6030	result = _PyUnicode_New(len);
6031	for (i = 0; i < len; i++)
6032		result->str[i] = buf[i];
6033	result->str[len] = 0;
6034	Py_DECREF(str);
6035	return (PyObject*)result;
6036}
6037
6038static int
6039formatint(Py_UNICODE *buf,
6040	  size_t buflen,
6041	  int flags,
6042	  int prec,
6043	  int type,
6044	  PyObject *v)
6045{
6046    /* fmt = '%#.' + `prec` + 'l' + `type`
6047     * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6048     *                     + 1 + 1
6049     *                   = 24
6050     */
6051    char fmt[64]; /* plenty big enough! */
6052    long x;
6053
6054    x = PyInt_AsLong(v);
6055    if (x == -1 && PyErr_Occurred())
6056        return -1;
6057    if (x < 0 && type != 'd' && type != 'i') {
6058	if (PyErr_Warn(PyExc_FutureWarning,
6059		       "%u/%o/%x/%X of negative int will return "
6060		       "a signed string in Python 2.4 and up") < 0)
6061	    return -1;
6062    }
6063    if (prec < 0)
6064        prec = 1;
6065
6066    /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
6067     * worst case buf = '0x' + [0-9]*prec, where prec >= 11
6068     */
6069    if (buflen <= 13 || buflen <= (size_t)2 + (size_t)prec) {
6070        PyErr_SetString(PyExc_OverflowError,
6071    	        "formatted integer is too long (precision too large?)");
6072        return -1;
6073    }
6074
6075    if ((flags & F_ALT) &&
6076        (type == 'x' || type == 'X')) {
6077        /* When converting under %#x or %#X, there are a number
6078         * of issues that cause pain:
6079         * - when 0 is being converted, the C standard leaves off
6080         *   the '0x' or '0X', which is inconsistent with other
6081         *   %#x/%#X conversions and inconsistent with Python's
6082         *   hex() function
6083         * - there are platforms that violate the standard and
6084         *   convert 0 with the '0x' or '0X'
6085         *   (Metrowerks, Compaq Tru64)
6086         * - there are platforms that give '0x' when converting
6087         *   under %#X, but convert 0 in accordance with the
6088         *   standard (OS/2 EMX)
6089         *
6090         * We can achieve the desired consistency by inserting our
6091         * own '0x' or '0X' prefix, and substituting %x/%X in place
6092         * of %#x/%#X.
6093         *
6094         * Note that this is the same approach as used in
6095         * formatint() in stringobject.c
6096         */
6097        PyOS_snprintf(fmt, sizeof(fmt), "0%c%%.%dl%c",
6098                      type, prec, type);
6099    }
6100    else {
6101        PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c",
6102                      (flags&F_ALT) ? "#" : "",
6103                      prec, type);
6104    }
6105    return usprintf(buf, fmt, x);
6106}
6107
6108static int
6109formatchar(Py_UNICODE *buf,
6110           size_t buflen,
6111           PyObject *v)
6112{
6113    /* presume that the buffer is at least 2 characters long */
6114    if (PyUnicode_Check(v)) {
6115	if (PyUnicode_GET_SIZE(v) != 1)
6116	    goto onError;
6117	buf[0] = PyUnicode_AS_UNICODE(v)[0];
6118    }
6119
6120    else if (PyString_Check(v)) {
6121	if (PyString_GET_SIZE(v) != 1)
6122	    goto onError;
6123	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6124    }
6125
6126    else {
6127	/* Integer input truncated to a character */
6128        long x;
6129	x = PyInt_AsLong(v);
6130	if (x == -1 && PyErr_Occurred())
6131	    goto onError;
6132#ifdef Py_UNICODE_WIDE
6133	if (x < 0 || x > 0x10ffff) {
6134	    PyErr_SetString(PyExc_ValueError,
6135			    "%c arg not in range(0x110000) "
6136			    "(wide Python build)");
6137	    return -1;
6138	}
6139#else
6140	if (x < 0 || x > 0xffff) {
6141	    PyErr_SetString(PyExc_ValueError,
6142			    "%c arg not in range(0x10000) "
6143			    "(narrow Python build)");
6144	    return -1;
6145	}
6146#endif
6147	buf[0] = (Py_UNICODE) x;
6148    }
6149    buf[1] = '\0';
6150    return 1;
6151
6152 onError:
6153    PyErr_SetString(PyExc_TypeError,
6154		    "%c requires int or char");
6155    return -1;
6156}
6157
6158/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
6159
6160   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
6161   chars are formatted. XXX This is a magic number. Each formatting
6162   routine does bounds checking to ensure no overflow, but a better
6163   solution may be to malloc a buffer of appropriate size for each
6164   format. For now, the current solution is sufficient.
6165*/
/* The expansion is parenthesized so the cast stays a single operand
   wherever the macro is used (e.g. as the operand of sizeof). */
#define FORMATBUFLEN ((size_t)120)
6167
6168PyObject *PyUnicode_Format(PyObject *format,
6169			   PyObject *args)
6170{
6171    Py_UNICODE *fmt, *res;
6172    int fmtcnt, rescnt, reslen, arglen, argidx;
6173    int args_owned = 0;
6174    PyUnicodeObject *result = NULL;
6175    PyObject *dict = NULL;
6176    PyObject *uformat;
6177
6178    if (format == NULL || args == NULL) {
6179	PyErr_BadInternalCall();
6180	return NULL;
6181    }
6182    uformat = PyUnicode_FromObject(format);
6183    if (uformat == NULL)
6184	return NULL;
6185    fmt = PyUnicode_AS_UNICODE(uformat);
6186    fmtcnt = PyUnicode_GET_SIZE(uformat);
6187
6188    reslen = rescnt = fmtcnt + 100;
6189    result = _PyUnicode_New(reslen);
6190    if (result == NULL)
6191	goto onError;
6192    res = PyUnicode_AS_UNICODE(result);
6193
6194    if (PyTuple_Check(args)) {
6195	arglen = PyTuple_Size(args);
6196	argidx = 0;
6197    }
6198    else {
6199	arglen = -1;
6200	argidx = -2;
6201    }
6202    if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6203        !PyObject_TypeCheck(args, &PyBaseString_Type))
6204	dict = args;
6205
6206    while (--fmtcnt >= 0) {
6207	if (*fmt != '%') {
6208	    if (--rescnt < 0) {
6209		rescnt = fmtcnt + 100;
6210		reslen += rescnt;
6211		if (_PyUnicode_Resize(&result, reslen) < 0)
6212		    return NULL;
6213		res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6214		--rescnt;
6215	    }
6216	    *res++ = *fmt++;
6217	}
6218	else {
6219	    /* Got a format specifier */
6220	    int flags = 0;
6221	    int width = -1;
6222	    int prec = -1;
6223	    Py_UNICODE c = '\0';
6224	    Py_UNICODE fill;
6225	    PyObject *v = NULL;
6226	    PyObject *temp = NULL;
6227	    Py_UNICODE *pbuf;
6228	    Py_UNICODE sign;
6229	    int len;
6230	    Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
6231
6232	    fmt++;
6233	    if (*fmt == '(') {
6234		Py_UNICODE *keystart;
6235		int keylen;
6236		PyObject *key;
6237		int pcount = 1;
6238
6239		if (dict == NULL) {
6240		    PyErr_SetString(PyExc_TypeError,
6241				    "format requires a mapping");
6242		    goto onError;
6243		}
6244		++fmt;
6245		--fmtcnt;
6246		keystart = fmt;
6247		/* Skip over balanced parentheses */
6248		while (pcount > 0 && --fmtcnt >= 0) {
6249		    if (*fmt == ')')
6250			--pcount;
6251		    else if (*fmt == '(')
6252			++pcount;
6253		    fmt++;
6254		}
6255		keylen = fmt - keystart - 1;
6256		if (fmtcnt < 0 || pcount > 0) {
6257		    PyErr_SetString(PyExc_ValueError,
6258				    "incomplete format key");
6259		    goto onError;
6260		}
6261#if 0
6262		/* keys are converted to strings using UTF-8 and
6263		   then looked up since Python uses strings to hold
6264		   variables names etc. in its namespaces and we
6265		   wouldn't want to break common idioms. */
6266		key = PyUnicode_EncodeUTF8(keystart,
6267					   keylen,
6268					   NULL);
6269#else
6270		key = PyUnicode_FromUnicode(keystart, keylen);
6271#endif
6272		if (key == NULL)
6273		    goto onError;
6274		if (args_owned) {
6275		    Py_DECREF(args);
6276		    args_owned = 0;
6277		}
6278		args = PyObject_GetItem(dict, key);
6279		Py_DECREF(key);
6280		if (args == NULL) {
6281		    goto onError;
6282		}
6283		args_owned = 1;
6284		arglen = -1;
6285		argidx = -2;
6286	    }
6287	    while (--fmtcnt >= 0) {
6288		switch (c = *fmt++) {
6289		case '-': flags |= F_LJUST; continue;
6290		case '+': flags |= F_SIGN; continue;
6291		case ' ': flags |= F_BLANK; continue;
6292		case '#': flags |= F_ALT; continue;
6293		case '0': flags |= F_ZERO; continue;
6294		}
6295		break;
6296	    }
6297	    if (c == '*') {
6298		v = getnextarg(args, arglen, &argidx);
6299		if (v == NULL)
6300		    goto onError;
6301		if (!PyInt_Check(v)) {
6302		    PyErr_SetString(PyExc_TypeError,
6303				    "* wants int");
6304		    goto onError;
6305		}
6306		width = PyInt_AsLong(v);
6307		if (width < 0) {
6308		    flags |= F_LJUST;
6309		    width = -width;
6310		}
6311		if (--fmtcnt >= 0)
6312		    c = *fmt++;
6313	    }
6314	    else if (c >= '0' && c <= '9') {
6315		width = c - '0';
6316		while (--fmtcnt >= 0) {
6317		    c = *fmt++;
6318		    if (c < '0' || c > '9')
6319			break;
6320		    if ((width*10) / 10 != width) {
6321			PyErr_SetString(PyExc_ValueError,
6322					"width too big");
6323			goto onError;
6324		    }
6325		    width = width*10 + (c - '0');
6326		}
6327	    }
6328	    if (c == '.') {
6329		prec = 0;
6330		if (--fmtcnt >= 0)
6331		    c = *fmt++;
6332		if (c == '*') {
6333		    v = getnextarg(args, arglen, &argidx);
6334		    if (v == NULL)
6335			goto onError;
6336		    if (!PyInt_Check(v)) {
6337			PyErr_SetString(PyExc_TypeError,
6338					"* wants int");
6339			goto onError;
6340		    }
6341		    prec = PyInt_AsLong(v);
6342		    if (prec < 0)
6343			prec = 0;
6344		    if (--fmtcnt >= 0)
6345			c = *fmt++;
6346		}
6347		else if (c >= '0' && c <= '9') {
6348		    prec = c - '0';
6349		    while (--fmtcnt >= 0) {
6350			c = Py_CHARMASK(*fmt++);
6351			if (c < '0' || c > '9')
6352			    break;
6353			if ((prec*10) / 10 != prec) {
6354			    PyErr_SetString(PyExc_ValueError,
6355					    "prec too big");
6356			    goto onError;
6357			}
6358			prec = prec*10 + (c - '0');
6359		    }
6360		}
6361	    } /* prec */
6362	    if (fmtcnt >= 0) {
6363		if (c == 'h' || c == 'l' || c == 'L') {
6364		    if (--fmtcnt >= 0)
6365			c = *fmt++;
6366		}
6367	    }
6368	    if (fmtcnt < 0) {
6369		PyErr_SetString(PyExc_ValueError,
6370				"incomplete format");
6371		goto onError;
6372	    }
6373	    if (c != '%') {
6374		v = getnextarg(args, arglen, &argidx);
6375		if (v == NULL)
6376		    goto onError;
6377	    }
6378	    sign = 0;
6379	    fill = ' ';
6380	    switch (c) {
6381
6382	    case '%':
6383		pbuf = formatbuf;
6384		/* presume that buffer length is at least 1 */
6385		pbuf[0] = '%';
6386		len = 1;
6387		break;
6388
6389	    case 's':
6390	    case 'r':
6391		if (PyUnicode_Check(v) && c == 's') {
6392		    temp = v;
6393		    Py_INCREF(temp);
6394		}
6395		else {
6396		    PyObject *unicode;
6397		    if (c == 's')
6398			temp = PyObject_Str(v);
6399		    else
6400			temp = PyObject_Repr(v);
6401		    if (temp == NULL)
6402			goto onError;
6403		    if (!PyString_Check(temp)) {
6404			/* XXX Note: this should never happen, since
6405   			       PyObject_Repr() and PyObject_Str() assure
6406			       this */
6407			Py_DECREF(temp);
6408			PyErr_SetString(PyExc_TypeError,
6409					"%s argument has non-string str()");
6410			goto onError;
6411		    }
6412		    unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
6413						   PyString_GET_SIZE(temp),
6414					       NULL,
6415						   "strict");
6416		    Py_DECREF(temp);
6417		    temp = unicode;
6418		    if (temp == NULL)
6419			goto onError;
6420		}
6421		pbuf = PyUnicode_AS_UNICODE(temp);
6422		len = PyUnicode_GET_SIZE(temp);
6423		if (prec >= 0 && len > prec)
6424		    len = prec;
6425		break;
6426
6427	    case 'i':
6428	    case 'd':
6429	    case 'u':
6430	    case 'o':
6431	    case 'x':
6432	    case 'X':
6433		if (c == 'i')
6434		    c = 'd';
6435		if (PyLong_Check(v)) {
6436		    temp = formatlong(v, flags, prec, c);
6437		    if (!temp)
6438			goto onError;
6439		    pbuf = PyUnicode_AS_UNICODE(temp);
6440		    len = PyUnicode_GET_SIZE(temp);
6441		    /* unbounded ints can always produce
6442		       a sign character! */
6443		    sign = 1;
6444		}
6445		else {
6446		    pbuf = formatbuf;
6447		    len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6448				    flags, prec, c, v);
6449		    if (len < 0)
6450			goto onError;
6451		    /* only d conversion is signed */
6452		    sign = c == 'd';
6453		}
6454		if (flags & F_ZERO)
6455		    fill = '0';
6456		break;
6457
6458	    case 'e':
6459	    case 'E':
6460	    case 'f':
6461	    case 'g':
6462	    case 'G':
6463		pbuf = formatbuf;
6464		len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6465			flags, prec, c, v);
6466		if (len < 0)
6467		    goto onError;
6468		sign = 1;
6469		if (flags & F_ZERO)
6470		    fill = '0';
6471		break;
6472
6473	    case 'c':
6474		pbuf = formatbuf;
6475		len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
6476		if (len < 0)
6477		    goto onError;
6478		break;
6479
6480	    default:
6481		PyErr_Format(PyExc_ValueError,
6482			     "unsupported format character '%c' (0x%x) "
6483			     "at index %i",
6484			     (31<=c && c<=126) ? (char)c : '?',
6485                             (int)c,
6486			     (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
6487		goto onError;
6488	    }
6489	    if (sign) {
6490		if (*pbuf == '-' || *pbuf == '+') {
6491		    sign = *pbuf++;
6492		    len--;
6493		}
6494		else if (flags & F_SIGN)
6495		    sign = '+';
6496		else if (flags & F_BLANK)
6497		    sign = ' ';
6498		else
6499		    sign = 0;
6500	    }
6501	    if (width < len)
6502		width = len;
6503	    if (rescnt - (sign != 0) < width) {
6504		reslen -= rescnt;
6505		rescnt = width + fmtcnt + 100;
6506		reslen += rescnt;
6507		if (reslen < 0) {
6508		    Py_DECREF(result);
6509		    return PyErr_NoMemory();
6510		}
6511		if (_PyUnicode_Resize(&result, reslen) < 0)
6512		    return NULL;
6513		res = PyUnicode_AS_UNICODE(result)
6514		    + reslen - rescnt;
6515	    }
6516	    if (sign) {
6517		if (fill != ' ')
6518		    *res++ = sign;
6519		rescnt--;
6520		if (width > len)
6521		    width--;
6522	    }
6523	    if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
6524		assert(pbuf[0] == '0');
6525		assert(pbuf[1] == c);
6526		if (fill != ' ') {
6527		    *res++ = *pbuf++;
6528		    *res++ = *pbuf++;
6529		}
6530		rescnt -= 2;
6531		width -= 2;
6532		if (width < 0)
6533		    width = 0;
6534		len -= 2;
6535	    }
6536	    if (width > len && !(flags & F_LJUST)) {
6537		do {
6538		    --rescnt;
6539		    *res++ = fill;
6540		} while (--width > len);
6541	    }
6542	    if (fill == ' ') {
6543		if (sign)
6544		    *res++ = sign;
6545		if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
6546		    assert(pbuf[0] == '0');
6547		    assert(pbuf[1] == c);
6548		    *res++ = *pbuf++;
6549		    *res++ = *pbuf++;
6550		}
6551	    }
6552	    Py_UNICODE_COPY(res, pbuf, len);
6553	    res += len;
6554	    rescnt -= len;
6555	    while (--width >= len) {
6556		--rescnt;
6557		*res++ = ' ';
6558	    }
6559	    if (dict && (argidx < arglen) && c != '%') {
6560		PyErr_SetString(PyExc_TypeError,
6561				"not all arguments converted during string formatting");
6562		goto onError;
6563	    }
6564	    Py_XDECREF(temp);
6565	} /* '%' */
6566    } /* until end */
6567    if (argidx < arglen && !dict) {
6568	PyErr_SetString(PyExc_TypeError,
6569			"not all arguments converted during string formatting");
6570	goto onError;
6571    }
6572
6573    if (args_owned) {
6574	Py_DECREF(args);
6575    }
6576    Py_DECREF(uformat);
6577    if (_PyUnicode_Resize(&result, reslen - rescnt))
6578	goto onError;
6579    return (PyObject *)result;
6580
6581 onError:
6582    Py_XDECREF(result);
6583    Py_DECREF(uformat);
6584    if (args_owned) {
6585	Py_DECREF(args);
6586    }
6587    return NULL;
6588}
6589
/* Buffer-procedure table for unicode objects.  It is referenced from the
   tp_as_buffer slot of PyUnicode_Type and lists, in order, the read-buffer,
   write-buffer, segment-count and char-buffer handlers. */
static PyBufferProcs unicode_as_buffer = {
    (getreadbufferproc) unicode_buffer_getreadbuf,
    (getwritebufferproc) unicode_buffer_getwritebuf,
    (getsegcountproc) unicode_buffer_getsegcount,
    (getcharbufferproc) unicode_buffer_getcharbuf,
};
6596
/* Forward declaration: unicode_new() hands construction of subtype
   instances to unicode_subtype_new(), which in turn calls unicode_new()
   to build the base unicode value, so one of them must be declared
   before it is defined. */
static PyObject *
unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
6599
6600static PyObject *
6601unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
6602{
6603        PyObject *x = NULL;
6604	static char *kwlist[] = {"string", "encoding", "errors", 0};
6605	char *encoding = NULL;
6606	char *errors = NULL;
6607
6608	if (type != &PyUnicode_Type)
6609		return unicode_subtype_new(type, args, kwds);
6610	if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
6611					  kwlist, &x, &encoding, &errors))
6612	    return NULL;
6613	if (x == NULL)
6614		return (PyObject *)_PyUnicode_New(0);
6615	if (encoding == NULL && errors == NULL)
6616	    return PyObject_Unicode(x);
6617	else
6618	return PyUnicode_FromEncodedObject(x, encoding, errors);
6619}
6620
6621static PyObject *
6622unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
6623{
6624	PyUnicodeObject *tmp, *pnew;
6625	int n;
6626
6627	assert(PyType_IsSubtype(type, &PyUnicode_Type));
6628	tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
6629	if (tmp == NULL)
6630		return NULL;
6631	assert(PyUnicode_Check(tmp));
6632	pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
6633	if (pnew == NULL)
6634		return NULL;
6635	pnew->str = PyMem_NEW(Py_UNICODE, n+1);
6636	if (pnew->str == NULL) {
6637		_Py_ForgetReference((PyObject *)pnew);
6638		PyObject_Del(pnew);
6639		return NULL;
6640	}
6641	Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
6642	pnew->length = n;
6643	pnew->hash = tmp->hash;
6644	Py_DECREF(tmp);
6645	return (PyObject *)pnew;
6646}
6647
/* Docstring for the unicode type; used as the tp_doc slot of
   PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"unicode(string [, encoding[, errors]]) -> object\n\
\n\
Create a new Unicode object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
6654
/* Type object for the unicode type.

   The type derives from PyBaseString_Type (tp_base), may itself be
   subclassed (Py_TPFLAGS_BASETYPE), creates instances through
   unicode_new (tp_new) and releases object memory with PyObject_Del
   (tp_free).  Slots set to 0 are left unset in this table. */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0, 					/* ob_size */
    "unicode", 				/* tp_name */
    sizeof(PyUnicodeObject), 		/* tp_basicsize */
    0, 					/* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc, 	/* tp_dealloc */
    0, 					/* tp_print */
    0,				 	/* tp_getattr */
    0, 					/* tp_setattr */
    (cmpfunc) unicode_compare, 		/* tp_compare */
    (reprfunc) unicode_repr, 		/* tp_repr */
    &unicode_as_number, 		/* tp_as_number */
    &unicode_as_sequence, 		/* tp_as_sequence */
    &unicode_as_mapping, 		/* tp_as_mapping */
    (hashfunc) unicode_hash, 		/* tp_hash */
    0, 					/* tp_call */
    (reprfunc) unicode_str,	 	/* tp_str */
    PyObject_GenericGetAttr, 		/* tp_getattro */
    0,			 		/* tp_setattro */
    &unicode_as_buffer,			/* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
	    Py_TPFLAGS_BASETYPE,	/* tp_flags */
    unicode_doc,			/* tp_doc */
    0,					/* tp_traverse */
    0,					/* tp_clear */
    0,					/* tp_richcompare */
    0,					/* tp_weaklistoffset */
    0,					/* tp_iter */
    0,					/* tp_iternext */
    unicode_methods,			/* tp_methods */
    0,					/* tp_members */
    0,					/* tp_getset */
    &PyBaseString_Type,			/* tp_base */
    0,					/* tp_dict */
    0,					/* tp_descr_get */
    0,					/* tp_descr_set */
    0,					/* tp_dictoffset */
    0,					/* tp_init */
    0,					/* tp_alloc */
    unicode_new,			/* tp_new */
    PyObject_Del,      		/* tp_free */
};
6699
6700/* Initialize the Unicode implementation */
6701
6702void _PyUnicode_Init(void)
6703{
6704    int i;
6705
6706    /* Init the implementation */
6707    unicode_freelist = NULL;
6708    unicode_freelist_size = 0;
6709    unicode_empty = _PyUnicode_New(0);
6710    strcpy(unicode_default_encoding, "ascii");
6711    for (i = 0; i < 256; i++)
6712	unicode_latin1[i] = NULL;
6713    if (PyType_Ready(&PyUnicode_Type) < 0)
6714	Py_FatalError("Can't initialize 'unicode'");
6715}
6716
6717/* Finalize the Unicode implementation */
6718
6719void
6720_PyUnicode_Fini(void)
6721{
6722    PyUnicodeObject *u;
6723    int i;
6724
6725    Py_XDECREF(unicode_empty);
6726    unicode_empty = NULL;
6727
6728    for (i = 0; i < 256; i++) {
6729	if (unicode_latin1[i]) {
6730	    Py_DECREF(unicode_latin1[i]);
6731	    unicode_latin1[i] = NULL;
6732	}
6733    }
6734
6735    for (u = unicode_freelist; u != NULL;) {
6736	PyUnicodeObject *v = u;
6737	u = *(PyUnicodeObject **)u;
6738	if (v->str)
6739	    PyMem_DEL(v->str);
6740	Py_XDECREF(v->defenc);
6741	PyObject_Del(v);
6742    }
6743    unicode_freelist = NULL;
6744    unicode_freelist_size = 0;
6745}
6746