unicodeobject.c revision f6b56aecad067f730d7fc6ae76cca94a26c3c896
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5Unicode Integration Proposal (see file Misc/unicode.txt).
6
7Copyright (c) Corporation for National Research Initiatives.
8
9--------------------------------------------------------------------
10The original string type implementation is:
11
12    Copyright (c) 1999 by Secret Labs AB
13    Copyright (c) 1999 by Fredrik Lundh
14
15By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
38
39#include "Python.h"
40
41#include "unicodeobject.h"
42#include "ucnhash.h"
43
44#ifdef MS_WINDOWS
45#include <windows.h>
46#endif
47
/* Upper bound on the number of deallocated Unicode objects that are
   parked on the free list for later reuse. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Keep-alive threshold for objects placed on the free list.

   When an object is put on the free list, its character buffer is
   kept allocated as long as the object's length is below this limit;
   buffers of longer objects are released right away.  Reusing the
   small buffers saves malloc() calls for short Unicode strings.

   The price is unused memory: up to MAX_UNICODE_FREELIST_SIZE parked
   objects, each still holding a small buffer plus malloc() overhead.

   A limit of 0 effectively switches the optimization off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order selection: little endian is assumed unless
   WORDS_BIGENDIAN is defined. */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
78
79/* --- Globals ------------------------------------------------------------
80
81   The globals are initialized by the _PyUnicode_Init() API and should
82   not be used before calling that API.
83
84*/
85
86/* Free list for Unicode objects */
87static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
89
90/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94   shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
97/* Default encoding to use and assume when NULL is passed as encoding
98   parameter; it is initialized by _PyUnicode_Init().
99
100   Always use the PyUnicode_SetDefaultEncoding() and
101   PyUnicode_GetDefaultEncoding() APIs to access this global.
102
103*/
104static char unicode_default_encoding[100];
105
106Py_UNICODE
107PyUnicode_GetMax(void)
108{
109#ifdef Py_UNICODE_WIDE
110	return 0x10FFFF;
111#else
112	/* This is actually an illegal character, so it should
113	   not be passed to unichr. */
114	return 0xFFFF;
115#endif
116}
117
118/* --- Unicode Object ----------------------------------------------------- */
119
120static
121int unicode_resize(register PyUnicodeObject *unicode,
122                      int length)
123{
124    void *oldstr;
125
126    /* Shortcut if there's nothing much to do. */
127    if (unicode->length == length)
128	goto reset;
129
130    /* Resizing shared object (unicode_empty or single character
131       objects) in-place is not allowed. Use PyUnicode_Resize()
132       instead ! */
133    if (unicode == unicode_empty ||
134	(unicode->length == 1 &&
135	 unicode->str[0] < 256 &&
136	 unicode_latin1[unicode->str[0]] == unicode)) {
137        PyErr_SetString(PyExc_SystemError,
138                        "can't resize shared unicode objects");
139        return -1;
140    }
141
142    /* We allocate one more byte to make sure the string is
143       Ux0000 terminated -- XXX is this needed ? */
144    oldstr = unicode->str;
145    PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146    if (!unicode->str) {
147	unicode->str = oldstr;
148        PyErr_NoMemory();
149        return -1;
150    }
151    unicode->str[length] = 0;
152    unicode->length = length;
153
154 reset:
155    /* Reset the object caches */
156    if (unicode->defenc) {
157        Py_DECREF(unicode->defenc);
158        unicode->defenc = NULL;
159    }
160    unicode->hash = -1;
161
162    return 0;
163}
164
165/* We allocate one more byte to make sure the string is
166   Ux0000 terminated -- XXX is this needed ?
167
168   XXX This allocator could further be enhanced by assuring that the
169       free list never reduces its size below 1.
170
171*/
172
173static
174PyUnicodeObject *_PyUnicode_New(int length)
175{
176    register PyUnicodeObject *unicode;
177
178    /* Optimization for empty strings */
179    if (length == 0 && unicode_empty != NULL) {
180        Py_INCREF(unicode_empty);
181        return unicode_empty;
182    }
183
184    /* Unicode freelist & memory allocation */
185    if (unicode_freelist) {
186        unicode = unicode_freelist;
187        unicode_freelist = *(PyUnicodeObject **)unicode;
188        unicode_freelist_size--;
189	if (unicode->str) {
190	    /* Keep-Alive optimization: we only upsize the buffer,
191	       never downsize it. */
192	    if ((unicode->length < length) &&
193		unicode_resize(unicode, length)) {
194		PyMem_DEL(unicode->str);
195		goto onError;
196	    }
197	}
198        else {
199	    unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
200        }
201        PyObject_INIT(unicode, &PyUnicode_Type);
202    }
203    else {
204        unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
205        if (unicode == NULL)
206            return NULL;
207	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
208    }
209
210    if (!unicode->str) {
211	PyErr_NoMemory();
212	goto onError;
213    }
214    unicode->str[length] = 0;
215    unicode->length = length;
216    unicode->hash = -1;
217    unicode->defenc = NULL;
218    return unicode;
219
220 onError:
221    _Py_ForgetReference((PyObject *)unicode);
222    PyObject_Del(unicode);
223    return NULL;
224}
225
226static
227void unicode_dealloc(register PyUnicodeObject *unicode)
228{
229    if (PyUnicode_CheckExact(unicode) &&
230	unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
231        /* Keep-Alive optimization */
232	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
233	    PyMem_DEL(unicode->str);
234	    unicode->str = NULL;
235	    unicode->length = 0;
236	}
237	if (unicode->defenc) {
238	    Py_DECREF(unicode->defenc);
239	    unicode->defenc = NULL;
240	}
241	/* Add to free list */
242        *(PyUnicodeObject **)unicode = unicode_freelist;
243        unicode_freelist = unicode;
244        unicode_freelist_size++;
245    }
246    else {
247	PyMem_DEL(unicode->str);
248	Py_XDECREF(unicode->defenc);
249	unicode->ob_type->tp_free((PyObject *)unicode);
250    }
251}
252
253int PyUnicode_Resize(PyObject **unicode,
254		     int length)
255{
256    register PyUnicodeObject *v;
257
258    /* Argument checks */
259    if (unicode == NULL) {
260	PyErr_BadInternalCall();
261	return -1;
262    }
263    v = (PyUnicodeObject *)*unicode;
264    if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
265	PyErr_BadInternalCall();
266	return -1;
267    }
268
269    /* Resizing unicode_empty and single character objects is not
270       possible since these are being shared. We simply return a fresh
271       copy with the same Unicode content. */
272    if (v->length != length &&
273	(v == unicode_empty || v->length == 1)) {
274	PyUnicodeObject *w = _PyUnicode_New(length);
275	if (w == NULL)
276	    return -1;
277	Py_UNICODE_COPY(w->str, v->str,
278			length < v->length ? length : v->length);
279	*unicode = (PyObject *)w;
280	return 0;
281    }
282
283    /* Note that we don't have to modify *unicode for unshared Unicode
284       objects, since we can modify them in-place. */
285    return unicode_resize(v, length);
286}
287
/* File-local convenience wrapper around PyUnicode_Resize(): accepts
   the address of any object pointer variable and casts it to the
   PyObject ** the public API expects.  Not for use outside
   unicodeobject.c ! */
#define _PyUnicode_Resize(unicodevar, length) \
        PyUnicode_Resize((PyObject **)(unicodevar), (length))
291
292PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
293				int size)
294{
295    PyUnicodeObject *unicode;
296
297    /* If the Unicode data is known at construction time, we can apply
298       some optimizations which share commonly used objects. */
299    if (u != NULL) {
300
301	/* Optimization for empty strings */
302	if (size == 0 && unicode_empty != NULL) {
303	    Py_INCREF(unicode_empty);
304	    return (PyObject *)unicode_empty;
305	}
306
307	/* Single character Unicode objects in the Latin-1 range are
308	   shared when using this constructor */
309	if (size == 1 && *u < 256) {
310	    unicode = unicode_latin1[*u];
311	    if (!unicode) {
312		unicode = _PyUnicode_New(1);
313		if (!unicode)
314		    return NULL;
315		unicode->str[0] = *u;
316		unicode_latin1[*u] = unicode;
317	    }
318	    Py_INCREF(unicode);
319	    return (PyObject *)unicode;
320	}
321    }
322
323    unicode = _PyUnicode_New(size);
324    if (!unicode)
325        return NULL;
326
327    /* Copy the Unicode data into the new object */
328    if (u != NULL)
329	Py_UNICODE_COPY(unicode->str, u, size);
330
331    return (PyObject *)unicode;
332}
333
334#ifdef HAVE_WCHAR_H
335
336PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
337				 int size)
338{
339    PyUnicodeObject *unicode;
340
341    if (w == NULL) {
342	PyErr_BadInternalCall();
343	return NULL;
344    }
345
346    unicode = _PyUnicode_New(size);
347    if (!unicode)
348        return NULL;
349
350    /* Copy the wchar_t data into the new object */
351#ifdef HAVE_USABLE_WCHAR_T
352    memcpy(unicode->str, w, size * sizeof(wchar_t));
353#else
354    {
355	register Py_UNICODE *u;
356	register int i;
357	u = PyUnicode_AS_UNICODE(unicode);
358	for (i = size; i >= 0; i--)
359	    *u++ = *w++;
360    }
361#endif
362
363    return (PyObject *)unicode;
364}
365
366int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
367			 register wchar_t *w,
368			 int size)
369{
370    if (unicode == NULL) {
371	PyErr_BadInternalCall();
372	return -1;
373    }
374    if (size > PyUnicode_GET_SIZE(unicode))
375	size = PyUnicode_GET_SIZE(unicode);
376#ifdef HAVE_USABLE_WCHAR_T
377    memcpy(w, unicode->str, size * sizeof(wchar_t));
378#else
379    {
380	register Py_UNICODE *u;
381	register int i;
382	u = PyUnicode_AS_UNICODE(unicode);
383	for (i = size; i >= 0; i--)
384	    *w++ = *u++;
385    }
386#endif
387
388    return size;
389}
390
391#endif
392
393PyObject *PyUnicode_FromOrdinal(int ordinal)
394{
395    Py_UNICODE s[2];
396
397#ifdef Py_UNICODE_WIDE
398    if (ordinal < 0 || ordinal > 0x10ffff) {
399	PyErr_SetString(PyExc_ValueError,
400			"unichr() arg not in range(0x110000) "
401			"(wide Python build)");
402	return NULL;
403    }
404#else
405    if (ordinal < 0 || ordinal > 0xffff) {
406	PyErr_SetString(PyExc_ValueError,
407			"unichr() arg not in range(0x10000) "
408			"(narrow Python build)");
409	return NULL;
410    }
411#endif
412
413    if (ordinal <= 0xffff) {
414	/* UCS-2 character */
415	s[0] = (Py_UNICODE) ordinal;
416	return PyUnicode_FromUnicode(s, 1);
417    }
418    else {
419#ifndef Py_UNICODE_WIDE
420	/* UCS-4 character.  store as two surrogate characters */
421	ordinal -= 0x10000L;
422	s[0] = 0xD800 + (Py_UNICODE) (ordinal >> 10);
423	s[1] = 0xDC00 + (Py_UNICODE) (ordinal & 0x03FF);
424	return PyUnicode_FromUnicode(s, 2);
425#else
426	s[0] = (Py_UNICODE)ordinal;
427	return PyUnicode_FromUnicode(s, 1);
428#endif
429    }
430}
431
432PyObject *PyUnicode_FromObject(register PyObject *obj)
433{
434    /* XXX Perhaps we should make this API an alias of
435           PyObject_Unicode() instead ?! */
436    if (PyUnicode_CheckExact(obj)) {
437	Py_INCREF(obj);
438	return obj;
439    }
440    if (PyUnicode_Check(obj)) {
441	/* For a Unicode subtype that's not a Unicode object,
442	   return a true Unicode object with the same data. */
443	return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
444				     PyUnicode_GET_SIZE(obj));
445    }
446    return PyUnicode_FromEncodedObject(obj, NULL, "strict");
447}
448
449PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
450				      const char *encoding,
451				      const char *errors)
452{
453    const char *s = NULL;
454    int len;
455    PyObject *v;
456
457    if (obj == NULL) {
458	PyErr_BadInternalCall();
459	return NULL;
460    }
461
462#if 0
463    /* For b/w compatibility we also accept Unicode objects provided
464       that no encodings is given and then redirect to
465       PyObject_Unicode() which then applies the additional logic for
466       Unicode subclasses.
467
468       NOTE: This API should really only be used for object which
469             represent *encoded* Unicode !
470
471    */
472	if (PyUnicode_Check(obj)) {
473	    if (encoding) {
474		PyErr_SetString(PyExc_TypeError,
475				"decoding Unicode is not supported");
476	    return NULL;
477	    }
478	return PyObject_Unicode(obj);
479	    }
480#else
481    if (PyUnicode_Check(obj)) {
482	PyErr_SetString(PyExc_TypeError,
483			"decoding Unicode is not supported");
484	return NULL;
485	}
486#endif
487
488    /* Coerce object */
489    if (PyString_Check(obj)) {
490	    s = PyString_AS_STRING(obj);
491	    len = PyString_GET_SIZE(obj);
492	    }
493    else if (PyObject_AsCharBuffer(obj, &s, &len)) {
494	/* Overwrite the error message with something more useful in
495	   case of a TypeError. */
496	if (PyErr_ExceptionMatches(PyExc_TypeError))
497	PyErr_Format(PyExc_TypeError,
498			 "coercing to Unicode: need string or buffer, "
499			 "%.80s found",
500		     obj->ob_type->tp_name);
501	goto onError;
502    }
503
504    /* Convert to Unicode */
505    if (len == 0) {
506	Py_INCREF(unicode_empty);
507	v = (PyObject *)unicode_empty;
508    }
509    else
510	v = PyUnicode_Decode(s, len, encoding, errors);
511
512    return v;
513
514 onError:
515    return NULL;
516}
517
518PyObject *PyUnicode_Decode(const char *s,
519			   int size,
520			   const char *encoding,
521			   const char *errors)
522{
523    PyObject *buffer = NULL, *unicode;
524
525    if (encoding == NULL)
526	encoding = PyUnicode_GetDefaultEncoding();
527
528    /* Shortcuts for common default encodings */
529    if (strcmp(encoding, "utf-8") == 0)
530        return PyUnicode_DecodeUTF8(s, size, errors);
531    else if (strcmp(encoding, "latin-1") == 0)
532        return PyUnicode_DecodeLatin1(s, size, errors);
533    else if (strcmp(encoding, "ascii") == 0)
534        return PyUnicode_DecodeASCII(s, size, errors);
535
536    /* Decode via the codec registry */
537    buffer = PyBuffer_FromMemory((void *)s, size);
538    if (buffer == NULL)
539        goto onError;
540    unicode = PyCodec_Decode(buffer, encoding, errors);
541    if (unicode == NULL)
542        goto onError;
543    if (!PyUnicode_Check(unicode)) {
544        PyErr_Format(PyExc_TypeError,
545                     "decoder did not return an unicode object (type=%.400s)",
546                     unicode->ob_type->tp_name);
547        Py_DECREF(unicode);
548        goto onError;
549    }
550    Py_DECREF(buffer);
551    return unicode;
552
553 onError:
554    Py_XDECREF(buffer);
555    return NULL;
556}
557
558PyObject *PyUnicode_Encode(const Py_UNICODE *s,
559			   int size,
560			   const char *encoding,
561			   const char *errors)
562{
563    PyObject *v, *unicode;
564
565    unicode = PyUnicode_FromUnicode(s, size);
566    if (unicode == NULL)
567	return NULL;
568    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
569    Py_DECREF(unicode);
570    return v;
571}
572
573PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
574                                    const char *encoding,
575                                    const char *errors)
576{
577    PyObject *v;
578
579    if (!PyUnicode_Check(unicode)) {
580        PyErr_BadArgument();
581        goto onError;
582    }
583
584    if (encoding == NULL)
585	encoding = PyUnicode_GetDefaultEncoding();
586
587    /* Shortcuts for common default encodings */
588    if (errors == NULL) {
589	if (strcmp(encoding, "utf-8") == 0)
590	    return PyUnicode_AsUTF8String(unicode);
591	else if (strcmp(encoding, "latin-1") == 0)
592	    return PyUnicode_AsLatin1String(unicode);
593	else if (strcmp(encoding, "ascii") == 0)
594	    return PyUnicode_AsASCIIString(unicode);
595    }
596
597    /* Encode via the codec registry */
598    v = PyCodec_Encode(unicode, encoding, errors);
599    if (v == NULL)
600        goto onError;
601    /* XXX Should we really enforce this ? */
602    if (!PyString_Check(v)) {
603        PyErr_Format(PyExc_TypeError,
604                     "encoder did not return a string object (type=%.400s)",
605                     v->ob_type->tp_name);
606        Py_DECREF(v);
607        goto onError;
608    }
609    return v;
610
611 onError:
612    return NULL;
613}
614
615PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
616					    const char *errors)
617{
618    PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
619
620    if (v)
621        return v;
622    v = PyUnicode_AsEncodedString(unicode, NULL, errors);
623    if (v && errors == NULL)
624        ((PyUnicodeObject *)unicode)->defenc = v;
625    return v;
626}
627
628Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
629{
630    if (!PyUnicode_Check(unicode)) {
631        PyErr_BadArgument();
632        goto onError;
633    }
634    return PyUnicode_AS_UNICODE(unicode);
635
636 onError:
637    return NULL;
638}
639
640int PyUnicode_GetSize(PyObject *unicode)
641{
642    if (!PyUnicode_Check(unicode)) {
643        PyErr_BadArgument();
644        goto onError;
645    }
646    return PyUnicode_GET_SIZE(unicode);
647
648 onError:
649    return -1;
650}
651
652const char *PyUnicode_GetDefaultEncoding(void)
653{
654    return unicode_default_encoding;
655}
656
657int PyUnicode_SetDefaultEncoding(const char *encoding)
658{
659    PyObject *v;
660
661    /* Make sure the encoding is valid. As side effect, this also
662       loads the encoding into the codec registry cache. */
663    v = _PyCodec_Lookup(encoding);
664    if (v == NULL)
665	goto onError;
666    Py_DECREF(v);
667    strncpy(unicode_default_encoding,
668	    encoding,
669	    sizeof(unicode_default_encoding));
670    return 0;
671
672 onError:
673    return -1;
674}
675
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error
*/
682
683static
684int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
685                 const char *encoding, const char *reason,
686                 const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr,
687                 PyObject **output, int *outpos, Py_UNICODE **outptr)
688{
689    static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple";
690
691    PyObject *restuple = NULL;
692    PyObject *repunicode = NULL;
693    int outsize = PyUnicode_GET_SIZE(*output);
694    int requiredsize;
695    int newpos;
696    Py_UNICODE *repptr;
697    int repsize;
698    int res = -1;
699
700    if (*errorHandler == NULL) {
701	*errorHandler = PyCodec_LookupError(errors);
702	if (*errorHandler == NULL)
703	   goto onError;
704    }
705
706    if (*exceptionObject == NULL) {
707    	*exceptionObject = PyUnicodeDecodeError_Create(
708	    encoding, input, insize, *startinpos, *endinpos, reason);
709	if (*exceptionObject == NULL)
710	   goto onError;
711    }
712    else {
713	if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
714	    goto onError;
715	if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
716	    goto onError;
717	if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
718	    goto onError;
719    }
720
721    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
722    if (restuple == NULL)
723	goto onError;
724    if (!PyTuple_Check(restuple)) {
725	PyErr_Format(PyExc_TypeError, &argparse[4]);
726	goto onError;
727    }
728    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
729	goto onError;
730    if (newpos<0)
731	newpos = insize+newpos;
732    if (newpos<0 || newpos>insize) {
733	PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", newpos);
734	goto onError;
735    }
736
737    /* need more space? (at least enough for what we
738       have+the replacement+the rest of the string (starting
739       at the new input position), so we won't have to check space
740       when there are no errors in the rest of the string) */
741    repptr = PyUnicode_AS_UNICODE(repunicode);
742    repsize = PyUnicode_GET_SIZE(repunicode);
743    requiredsize = *outpos + repsize + insize-newpos;
744    if (requiredsize > outsize) {
745	if (requiredsize<2*outsize)
746	    requiredsize = 2*outsize;
747	if (PyUnicode_Resize(output, requiredsize))
748	    goto onError;
749	*outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
750    }
751    *endinpos = newpos;
752    *inptr = input + newpos;
753    Py_UNICODE_COPY(*outptr, repptr, repsize);
754    *outptr += repsize;
755    *outpos += repsize;
756    /* we made it! */
757    res = 0;
758
759    onError:
760    Py_XDECREF(restuple);
761    return res;
762}
763
764/* --- UTF-7 Codec -------------------------------------------------------- */
765
766/* see RFC2152 for details */
767
static
char utf7_special[128] = {
    /* Classification of the 128 ASCII characters for UTF-7: tells
       whether a character can be written directly or not.
	   0 - not special
	   1 - special
	   2 - whitespace (optional)
	   3 - RFC2152 Set O (optional) */

    /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1,
    1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */
    2, 3, 3, 3, 3, 3, 3, 0,
    0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */
    3, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */
    3, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 3, 3, 3, 1, 1,
};
786
/* True if character c cannot be written directly in UTF-7: it is
   outside ASCII or classified as special, or it is whitespace and
   encodeWS is set, or it is in Set O and encodeO is set. */
#define SPECIAL(c, encodeO, encodeWS) \
    (((c)>127 || utf7_special[(c)] == 1) || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* Base64 digit for the low six bits of n. */
#define B64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is a character of the base64 alphabet. */
#define B64CHAR(c) \
    (isalnum(c) || (c) == '+' || (c) == '/')

/* Value (0..63) of the base64 digit c; c must satisfy B64CHAR(). */
#define UB64(c) \
    ((c) == '+' ? 62 : \
     (c) == '/' ? 63 : \
     (c) >= 'a' ? (c) - 71 : \
     (c) >= 'A' ? (c) - 65 : \
     (c) + 4)

/* Emit base64 digits to out for as long as the bit buffer ch holds
   at least six pending bits; bits is the number of pending bits. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* Emit 16-bit characters to out for as long as the bit buffer ch
   holds at least 16 pending bits.  Expects an `errmsg` variable and a
   `utf7Error` label in the expanding function. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high */ \
            /* surrogate so let's not bother seeing if the low */ \
            /* surrogate is correct or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't */ \
            /* represent it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
820    } \
821
822PyObject *PyUnicode_DecodeUTF7(const char *s,
823			       int size,
824			       const char *errors)
825{
826    const char *starts = s;
827    int startinpos;
828    int endinpos;
829    int outpos;
830    const char *e;
831    PyUnicodeObject *unicode;
832    Py_UNICODE *p;
833    const char *errmsg = "";
834    int inShift = 0;
835    unsigned int bitsleft = 0;
836    unsigned long charsleft = 0;
837    int surrogate = 0;
838    PyObject *errorHandler = NULL;
839    PyObject *exc = NULL;
840
841    unicode = _PyUnicode_New(size);
842    if (!unicode)
843        return NULL;
844    if (size == 0)
845        return (PyObject *)unicode;
846
847    p = unicode->str;
848    e = s + size;
849
850    while (s < e) {
851        Py_UNICODE ch;
852        restart:
853        ch = *s;
854
855        if (inShift) {
856            if ((ch == '-') || !B64CHAR(ch)) {
857                inShift = 0;
858                s++;
859
860                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
861                if (bitsleft >= 6) {
862                    /* The shift sequence has a partial character in it. If
863                       bitsleft < 6 then we could just classify it as padding
864                       but that is not the case here */
865
866                    errmsg = "partial character in shift sequence";
867                    goto utf7Error;
868                }
869                /* According to RFC2152 the remaining bits should be zero. We
870                   choose to signal an error/insert a replacement character
871                   here so indicate the potential of a misencoded character. */
872
873                /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
874                if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
875                    errmsg = "non-zero padding bits in shift sequence";
876                    goto utf7Error;
877                }
878
879                if (ch == '-') {
880                    if ((s < e) && (*(s) == '-')) {
881                        *p++ = '-';
882                        inShift = 1;
883                    }
884                } else if (SPECIAL(ch,0,0)) {
885                    errmsg = "unexpected special character";
886	                goto utf7Error;
887                } else  {
888                    *p++ = ch;
889                }
890            } else {
891                charsleft = (charsleft << 6) | UB64(ch);
892                bitsleft += 6;
893                s++;
894                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
895            }
896        }
897        else if ( ch == '+' ) {
898            startinpos = s-starts;
899            s++;
900            if (s < e && *s == '-') {
901                s++;
902                *p++ = '+';
903            } else
904            {
905                inShift = 1;
906                bitsleft = 0;
907            }
908        }
909        else if (SPECIAL(ch,0,0)) {
910            errmsg = "unexpected special character";
911            s++;
912	        goto utf7Error;
913        }
914        else {
915            *p++ = ch;
916            s++;
917        }
918        continue;
919    utf7Error:
920        outpos = p-PyUnicode_AS_UNICODE(unicode);
921        endinpos = s-starts;
922        if (unicode_decode_call_errorhandler(
923             errors, &errorHandler,
924             "utf7", errmsg,
925             starts, size, &startinpos, &endinpos, &exc, &s,
926             (PyObject **)&unicode, &outpos, &p))
927        goto onError;
928    }
929
930    if (inShift) {
931        outpos = p-PyUnicode_AS_UNICODE(unicode);
932        endinpos = size;
933        if (unicode_decode_call_errorhandler(
934             errors, &errorHandler,
935             "utf7", "unterminated shift sequence",
936             starts, size, &startinpos, &endinpos, &exc, &s,
937             (PyObject **)&unicode, &outpos, &p))
938            goto onError;
939        if (s < e)
940           goto restart;
941    }
942
943    if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)))
944        goto onError;
945
946    Py_XDECREF(errorHandler);
947    Py_XDECREF(exc);
948    return (PyObject *)unicode;
949
950onError:
951    Py_XDECREF(errorHandler);
952    Py_XDECREF(exc);
953    Py_DECREF(unicode);
954    return NULL;
955}
956
957
958PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
959                   int size,
960                   int encodeSetO,
961                   int encodeWhiteSpace,
962                   const char *errors)
963{
964    PyObject *v;
965    /* It might be possible to tighten this worst case */
966    unsigned int cbAllocated = 5 * size;
967    int inShift = 0;
968    int i = 0;
969    unsigned int bitsleft = 0;
970    unsigned long charsleft = 0;
971    char * out;
972    char * start;
973
974    if (size == 0)
975		return PyString_FromStringAndSize(NULL, 0);
976
977    v = PyString_FromStringAndSize(NULL, cbAllocated);
978    if (v == NULL)
979        return NULL;
980
981    start = out = PyString_AS_STRING(v);
982    for (;i < size; ++i) {
983        Py_UNICODE ch = s[i];
984
985        if (!inShift) {
986			if (ch == '+') {
987				*out++ = '+';
988                *out++ = '-';
989            } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
990                charsleft = ch;
991                bitsleft = 16;
992                *out++ = '+';
993				/* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
994                inShift = bitsleft > 0;
995			} else {
996				*out++ = (char) ch;
997			}
998		} else {
999            if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1000                *out++ = B64(charsleft << (6-bitsleft));
1001                charsleft = 0;
1002                bitsleft = 0;
1003                /* Characters not in the BASE64 set implicitly unshift the sequence
1004                   so no '-' is required, except if the character is itself a '-' */
1005                if (B64CHAR(ch) || ch == '-') {
1006                    *out++ = '-';
1007                }
1008                inShift = 0;
1009                *out++ = (char) ch;
1010            } else {
1011                bitsleft += 16;
1012                charsleft = (charsleft << 16) | ch;
1013                /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1014
1015                /* If the next character is special then we dont' need to terminate
1016                   the shift sequence. If the next character is not a BASE64 character
1017                   or '-' then the shift sequence will be terminated implicitly and we
1018                   don't have to insert a '-'. */
1019
1020                if (bitsleft == 0) {
1021                    if (i + 1 < size) {
1022                        Py_UNICODE ch2 = s[i+1];
1023
1024                        if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
1025
1026                        } else if (B64CHAR(ch2) || ch2 == '-') {
1027                            *out++ = '-';
1028                            inShift = 0;
1029                        } else {
1030                            inShift = 0;
1031                        }
1032
1033                    }
1034                    else {
1035                        *out++ = '-';
1036                        inShift = 0;
1037                    }
1038                }
1039            }
1040        }
1041	}
1042    if (bitsleft) {
1043        *out++= B64(charsleft << (6-bitsleft) );
1044        *out++ = '-';
1045    }
1046
1047    _PyString_Resize(&v, out - start);
1048    return v;
1049}
1050
1051#undef SPECIAL
1052#undef B64
1053#undef B64CHAR
1054#undef UB64
1055#undef ENCODE
1056#undef DECODE
1057
1058/* --- UTF-8 Codec -------------------------------------------------------- */
1059
/* Length of a UTF-8 sequence, indexed by the value of its first byte.
   An entry of zero marks a byte that is illegal as a sequence prefix.
   See RFC 2279 for details. */
static char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 0x00 - 0x0F */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 0x10 - 0x1F */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 0x20 - 0x2F */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 0x30 - 0x3F */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 0x40 - 0x4F */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 0x50 - 0x5F */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 0x60 - 0x6F */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,   /* 0x80 - 0x8F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,   /* 0x90 - 0x9F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,   /* 0xA0 - 0xAF */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,   /* 0xB0 - 0xBF */
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,   /* 0xC0 - 0xCF */
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,   /* 0xD0 - 0xDF */
    3, 3, 3, 3, 3, 3, 3, 3,  3, 3, 3, 3, 3, 3, 3, 3,   /* 0xE0 - 0xEF */
    4, 4, 4, 4, 4, 4, 4, 4,  5, 5, 5, 5, 6, 6, 0, 0    /* 0xF0 - 0xFF */
};
1081
1082PyObject *PyUnicode_DecodeUTF8(const char *s,
1083			       int size,
1084			       const char *errors)
1085{
1086    const char *starts = s;
1087    int n;
1088    int startinpos;
1089    int endinpos;
1090    int outpos;
1091    const char *e;
1092    PyUnicodeObject *unicode;
1093    Py_UNICODE *p;
1094    const char *errmsg = "";
1095    PyObject *errorHandler = NULL;
1096    PyObject *exc = NULL;
1097
1098    /* Note: size will always be longer than the resulting Unicode
1099       character count */
1100    unicode = _PyUnicode_New(size);
1101    if (!unicode)
1102        return NULL;
1103    if (size == 0)
1104        return (PyObject *)unicode;
1105
1106    /* Unpack UTF-8 encoded data */
1107    p = unicode->str;
1108    e = s + size;
1109
1110    while (s < e) {
1111        Py_UCS4 ch = (unsigned char)*s;
1112
1113        if (ch < 0x80) {
1114            *p++ = (Py_UNICODE)ch;
1115            s++;
1116            continue;
1117        }
1118
1119        n = utf8_code_length[ch];
1120
1121        if (s + n > e) {
1122	    errmsg = "unexpected end of data";
1123	    startinpos = s-starts;
1124	    endinpos = size;
1125	    goto utf8Error;
1126	}
1127
1128        switch (n) {
1129
1130        case 0:
1131            errmsg = "unexpected code byte";
1132	    startinpos = s-starts;
1133	    endinpos = startinpos+1;
1134	    goto utf8Error;
1135
1136        case 1:
1137            errmsg = "internal error";
1138	    startinpos = s-starts;
1139	    endinpos = startinpos+1;
1140	    goto utf8Error;
1141
1142        case 2:
1143            if ((s[1] & 0xc0) != 0x80) {
1144                errmsg = "invalid data";
1145		startinpos = s-starts;
1146		endinpos = startinpos+2;
1147		goto utf8Error;
1148	    }
1149            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1150            if (ch < 0x80) {
1151		startinpos = s-starts;
1152		endinpos = startinpos+2;
1153                errmsg = "illegal encoding";
1154		goto utf8Error;
1155	    }
1156	    else
1157		*p++ = (Py_UNICODE)ch;
1158            break;
1159
1160        case 3:
1161            if ((s[1] & 0xc0) != 0x80 ||
1162                (s[2] & 0xc0) != 0x80) {
1163                errmsg = "invalid data";
1164		startinpos = s-starts;
1165		endinpos = startinpos+3;
1166		goto utf8Error;
1167	    }
1168            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
1169            if (ch < 0x0800) {
1170		/* Note: UTF-8 encodings of surrogates are considered
1171		   legal UTF-8 sequences;
1172
1173		   XXX For wide builds (UCS-4) we should probably try
1174		       to recombine the surrogates into a single code
1175		       unit.
1176		*/
1177                errmsg = "illegal encoding";
1178		startinpos = s-starts;
1179		endinpos = startinpos+3;
1180		goto utf8Error;
1181	    }
1182	    else
1183		*p++ = (Py_UNICODE)ch;
1184            break;
1185
1186        case 4:
1187            if ((s[1] & 0xc0) != 0x80 ||
1188                (s[2] & 0xc0) != 0x80 ||
1189                (s[3] & 0xc0) != 0x80) {
1190                errmsg = "invalid data";
1191		startinpos = s-starts;
1192		endinpos = startinpos+4;
1193		goto utf8Error;
1194	    }
1195            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1196                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1197            /* validate and convert to UTF-16 */
1198            if ((ch < 0x10000)        /* minimum value allowed for 4
1199					 byte encoding */
1200                || (ch > 0x10ffff))   /* maximum value allowed for
1201					 UTF-16 */
1202	    {
1203                errmsg = "illegal encoding";
1204		startinpos = s-starts;
1205		endinpos = startinpos+4;
1206		goto utf8Error;
1207	    }
1208#ifdef Py_UNICODE_WIDE
1209	    *p++ = (Py_UNICODE)ch;
1210#else
1211            /*  compute and append the two surrogates: */
1212
1213            /*  translate from 10000..10FFFF to 0..FFFF */
1214            ch -= 0x10000;
1215
1216            /*  high surrogate = top 10 bits added to D800 */
1217            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1218
1219            /*  low surrogate = bottom 10 bits added to DC00 */
1220            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
1221#endif
1222            break;
1223
1224        default:
1225            /* Other sizes are only needed for UCS-4 */
1226            errmsg = "unsupported Unicode code range";
1227	    startinpos = s-starts;
1228	    endinpos = startinpos+n;
1229	    goto utf8Error;
1230        }
1231        s += n;
1232	continue;
1233
1234    utf8Error:
1235    outpos = p-PyUnicode_AS_UNICODE(unicode);
1236    if (unicode_decode_call_errorhandler(
1237	     errors, &errorHandler,
1238	     "utf8", errmsg,
1239	     starts, size, &startinpos, &endinpos, &exc, &s,
1240	     (PyObject **)&unicode, &outpos, &p))
1241	goto onError;
1242    }
1243
1244    /* Adjust length */
1245    if (_PyUnicode_Resize(&unicode, p - unicode->str))
1246        goto onError;
1247
1248    Py_XDECREF(errorHandler);
1249    Py_XDECREF(exc);
1250    return (PyObject *)unicode;
1251
1252onError:
1253    Py_XDECREF(errorHandler);
1254    Py_XDECREF(exc);
1255    Py_DECREF(unicode);
1256    return NULL;
1257}
1258
1259/* Allocation strategy:  if the string is short, convert into a stack buffer
1260   and allocate exactly as much space needed at the end.  Else allocate the
1261   maximum possible needed (4 result bytes per Unicode character), and return
1262   the excess memory at the end.
1263*/
1264PyObject *
1265PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1266		     int size,
1267		     const char *errors)
1268{
1269#define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
1270
1271    int i;              /* index into s of next input byte */
1272    PyObject *v;        /* result string object */
1273    char *p;            /* next free byte in output buffer */
1274    int nallocated;     /* number of result bytes allocated */
1275    int nneeded;        /* number of result bytes needed */
1276    char stackbuf[MAX_SHORT_UNICHARS * 4];
1277
1278    assert(s != NULL);
1279    assert(size >= 0);
1280
1281    if (size <= MAX_SHORT_UNICHARS) {
1282        /* Write into the stack buffer; nallocated can't overflow.
1283         * At the end, we'll allocate exactly as much heap space as it
1284         * turns out we need.
1285         */
1286        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1287        v = NULL;   /* will allocate after we're done */
1288        p = stackbuf;
1289    }
1290    else {
1291        /* Overallocate on the heap, and give the excess back at the end. */
1292        nallocated = size * 4;
1293        if (nallocated / 4 != size)  /* overflow! */
1294            return PyErr_NoMemory();
1295        v = PyString_FromStringAndSize(NULL, nallocated);
1296        if (v == NULL)
1297            return NULL;
1298        p = PyString_AS_STRING(v);
1299    }
1300
1301    for (i = 0; i < size;) {
1302        Py_UCS4 ch = s[i++];
1303
1304        if (ch < 0x80)
1305            /* Encode ASCII */
1306            *p++ = (char) ch;
1307
1308        else if (ch < 0x0800) {
1309            /* Encode Latin-1 */
1310            *p++ = (char)(0xc0 | (ch >> 6));
1311            *p++ = (char)(0x80 | (ch & 0x3f));
1312        }
1313        else {
1314            /* Encode UCS2 Unicode ordinals */
1315            if (ch < 0x10000) {
1316                /* Special case: check for high surrogate */
1317                if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1318                    Py_UCS4 ch2 = s[i];
1319                    /* Check for low surrogate and combine the two to
1320                       form a UCS4 value */
1321                    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1322                        ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
1323                        i++;
1324                        goto encodeUCS4;
1325                    }
1326                    /* Fall through: handles isolated high surrogates */
1327                }
1328                *p++ = (char)(0xe0 | (ch >> 12));
1329                *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1330                *p++ = (char)(0x80 | (ch & 0x3f));
1331                continue;
1332    	    }
1333encodeUCS4:
1334            /* Encode UCS4 Unicode ordinals */
1335            *p++ = (char)(0xf0 | (ch >> 18));
1336            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1337            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1338            *p++ = (char)(0x80 | (ch & 0x3f));
1339        }
1340    }
1341
1342    if (v == NULL) {
1343        /* This was stack allocated. */
1344        nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1345        assert(nneeded <= nallocated);
1346        v = PyString_FromStringAndSize(stackbuf, nneeded);
1347    }
1348    else {
1349    	/* Cut back to size actually needed. */
1350        nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1351        assert(nneeded <= nallocated);
1352        _PyString_Resize(&v, nneeded);
1353    }
1354    return v;
1355
1356#undef MAX_SHORT_UNICHARS
1357}
1358
1359PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1360{
1361    if (!PyUnicode_Check(unicode)) {
1362        PyErr_BadArgument();
1363        return NULL;
1364    }
1365    return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1366				PyUnicode_GET_SIZE(unicode),
1367				NULL);
1368}
1369
1370/* --- UTF-16 Codec ------------------------------------------------------- */
1371
1372PyObject *
1373PyUnicode_DecodeUTF16(const char *s,
1374		      int size,
1375		      const char *errors,
1376		      int *byteorder)
1377{
1378    const char *starts = s;
1379    int startinpos;
1380    int endinpos;
1381    int outpos;
1382    PyUnicodeObject *unicode;
1383    Py_UNICODE *p;
1384    const unsigned char *q, *e;
1385    int bo = 0;       /* assume native ordering by default */
1386    const char *errmsg = "";
1387    /* Offsets from q for retrieving byte pairs in the right order. */
1388#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1389    int ihi = 1, ilo = 0;
1390#else
1391    int ihi = 0, ilo = 1;
1392#endif
1393    PyObject *errorHandler = NULL;
1394    PyObject *exc = NULL;
1395
1396    /* Note: size will always be longer than the resulting Unicode
1397       character count */
1398    unicode = _PyUnicode_New(size);
1399    if (!unicode)
1400        return NULL;
1401    if (size == 0)
1402        return (PyObject *)unicode;
1403
1404    /* Unpack UTF-16 encoded data */
1405    p = unicode->str;
1406    q = (unsigned char *)s;
1407    e = q + size;
1408
1409    if (byteorder)
1410        bo = *byteorder;
1411
1412    /* Check for BOM marks (U+FEFF) in the input and adjust current
1413       byte order setting accordingly. In native mode, the leading BOM
1414       mark is skipped, in all other modes, it is copied to the output
1415       stream as-is (giving a ZWNBSP character). */
1416    if (bo == 0) {
1417        const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
1418#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1419	if (bom == 0xFEFF) {
1420	    q += 2;
1421	    bo = -1;
1422	}
1423        else if (bom == 0xFFFE) {
1424	    q += 2;
1425	    bo = 1;
1426	}
1427#else
1428	if (bom == 0xFEFF) {
1429	    q += 2;
1430	    bo = 1;
1431	}
1432        else if (bom == 0xFFFE) {
1433	    q += 2;
1434	    bo = -1;
1435	}
1436#endif
1437    }
1438
1439    if (bo == -1) {
1440        /* force LE */
1441        ihi = 1;
1442        ilo = 0;
1443    }
1444    else if (bo == 1) {
1445        /* force BE */
1446        ihi = 0;
1447        ilo = 1;
1448    }
1449
1450    while (q < e) {
1451	Py_UNICODE ch;
1452	/* remaing bytes at the end? (size should be even) */
1453	if (e-q<2) {
1454	    errmsg = "truncated data";
1455	    startinpos = ((const char *)q)-starts;
1456	    endinpos = ((const char *)e)-starts;
1457	    goto utf16Error;
1458	    /* The remaining input chars are ignored if the callback
1459	       chooses to skip the input */
1460	}
1461	ch = (q[ihi] << 8) | q[ilo];
1462
1463	q += 2;
1464
1465	if (ch < 0xD800 || ch > 0xDFFF) {
1466	    *p++ = ch;
1467	    continue;
1468	}
1469
1470	/* UTF-16 code pair: */
1471	if (q >= e) {
1472	    errmsg = "unexpected end of data";
1473	    startinpos = (((const char *)q)-2)-starts;
1474	    endinpos = ((const char *)e)-starts;
1475	    goto utf16Error;
1476	}
1477	if (0xD800 <= ch && ch <= 0xDBFF) {
1478	    Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1479	    q += 2;
1480	    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1481#ifndef Py_UNICODE_WIDE
1482		*p++ = ch;
1483		*p++ = ch2;
1484#else
1485		*p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
1486#endif
1487		continue;
1488	    }
1489	    else {
1490                errmsg = "illegal UTF-16 surrogate";
1491		startinpos = (((const char *)q)-4)-starts;
1492		endinpos = startinpos+2;
1493		goto utf16Error;
1494	    }
1495
1496	}
1497	errmsg = "illegal encoding";
1498	startinpos = (((const char *)q)-2)-starts;
1499	endinpos = startinpos+2;
1500	/* Fall through to report the error */
1501
1502    utf16Error:
1503	outpos = p-PyUnicode_AS_UNICODE(unicode);
1504	if (unicode_decode_call_errorhandler(
1505	         errors, &errorHandler,
1506	         "utf16", errmsg,
1507	         starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1508	         (PyObject **)&unicode, &outpos, &p))
1509	    goto onError;
1510    }
1511
1512    if (byteorder)
1513        *byteorder = bo;
1514
1515    /* Adjust length */
1516    if (_PyUnicode_Resize(&unicode, p - unicode->str))
1517        goto onError;
1518
1519    Py_XDECREF(errorHandler);
1520    Py_XDECREF(exc);
1521    return (PyObject *)unicode;
1522
1523onError:
1524    Py_DECREF(unicode);
1525    Py_XDECREF(errorHandler);
1526    Py_XDECREF(exc);
1527    return NULL;
1528}
1529
1530PyObject *
1531PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1532		      int size,
1533		      const char *errors,
1534		      int byteorder)
1535{
1536    PyObject *v;
1537    unsigned char *p;
1538    int i, pairs;
1539    /* Offsets from p for storing byte pairs in the right order. */
1540#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1541    int ihi = 1, ilo = 0;
1542#else
1543    int ihi = 0, ilo = 1;
1544#endif
1545
1546#define STORECHAR(CH)                   \
1547    do {                                \
1548        p[ihi] = ((CH) >> 8) & 0xff;    \
1549        p[ilo] = (CH) & 0xff;           \
1550        p += 2;                         \
1551    } while(0)
1552
1553    for (i = pairs = 0; i < size; i++)
1554	if (s[i] >= 0x10000)
1555	    pairs++;
1556    v = PyString_FromStringAndSize(NULL,
1557		  2 * (size + pairs + (byteorder == 0)));
1558    if (v == NULL)
1559        return NULL;
1560
1561    p = (unsigned char *)PyString_AS_STRING(v);
1562    if (byteorder == 0)
1563	STORECHAR(0xFEFF);
1564    if (size == 0)
1565        return v;
1566
1567    if (byteorder == -1) {
1568        /* force LE */
1569        ihi = 1;
1570        ilo = 0;
1571    }
1572    else if (byteorder == 1) {
1573        /* force BE */
1574        ihi = 0;
1575        ilo = 1;
1576    }
1577
1578    while (size-- > 0) {
1579	Py_UNICODE ch = *s++;
1580	Py_UNICODE ch2 = 0;
1581	if (ch >= 0x10000) {
1582	    ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1583	    ch  = 0xD800 | ((ch-0x10000) >> 10);
1584	}
1585        STORECHAR(ch);
1586        if (ch2)
1587            STORECHAR(ch2);
1588    }
1589    return v;
1590#undef STORECHAR
1591}
1592
1593PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1594{
1595    if (!PyUnicode_Check(unicode)) {
1596        PyErr_BadArgument();
1597        return NULL;
1598    }
1599    return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1600				 PyUnicode_GET_SIZE(unicode),
1601				 NULL,
1602				 0);
1603}
1604
1605/* --- Unicode Escape Codec ----------------------------------------------- */
1606
1607static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
1608
1609PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1610					int size,
1611					const char *errors)
1612{
1613    const char *starts = s;
1614    int startinpos;
1615    int endinpos;
1616    int outpos;
1617    int i;
1618    PyUnicodeObject *v;
1619    Py_UNICODE *p;
1620    const char *end;
1621    char* message;
1622    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1623    PyObject *errorHandler = NULL;
1624    PyObject *exc = NULL;
1625
1626    /* Escaped strings will always be longer than the resulting
1627       Unicode string, so we start with size here and then reduce the
1628       length after conversion to the true value.
1629       (but if the error callback returns a long replacement string
1630       we'll have to allocate more space) */
1631    v = _PyUnicode_New(size);
1632    if (v == NULL)
1633        goto onError;
1634    if (size == 0)
1635        return (PyObject *)v;
1636
1637    p = PyUnicode_AS_UNICODE(v);
1638    end = s + size;
1639
1640    while (s < end) {
1641        unsigned char c;
1642        Py_UNICODE x;
1643        int digits;
1644
1645        /* Non-escape characters are interpreted as Unicode ordinals */
1646        if (*s != '\\') {
1647            *p++ = (unsigned char) *s++;
1648            continue;
1649        }
1650
1651        startinpos = s-starts;
1652        /* \ - Escapes */
1653        s++;
1654        switch (*s++) {
1655
1656        /* \x escapes */
1657        case '\n': break;
1658        case '\\': *p++ = '\\'; break;
1659        case '\'': *p++ = '\''; break;
1660        case '\"': *p++ = '\"'; break;
1661        case 'b': *p++ = '\b'; break;
1662        case 'f': *p++ = '\014'; break; /* FF */
1663        case 't': *p++ = '\t'; break;
1664        case 'n': *p++ = '\n'; break;
1665        case 'r': *p++ = '\r'; break;
1666        case 'v': *p++ = '\013'; break; /* VT */
1667        case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1668
1669        /* \OOO (octal) escapes */
1670        case '0': case '1': case '2': case '3':
1671        case '4': case '5': case '6': case '7':
1672            x = s[-1] - '0';
1673            if ('0' <= *s && *s <= '7') {
1674                x = (x<<3) + *s++ - '0';
1675                if ('0' <= *s && *s <= '7')
1676                    x = (x<<3) + *s++ - '0';
1677            }
1678            *p++ = x;
1679            break;
1680
1681        /* hex escapes */
1682        /* \xXX */
1683        case 'x':
1684            digits = 2;
1685            message = "truncated \\xXX escape";
1686            goto hexescape;
1687
1688        /* \uXXXX */
1689        case 'u':
1690            digits = 4;
1691            message = "truncated \\uXXXX escape";
1692            goto hexescape;
1693
1694        /* \UXXXXXXXX */
1695        case 'U':
1696            digits = 8;
1697            message = "truncated \\UXXXXXXXX escape";
1698        hexescape:
1699            chr = 0;
1700            outpos = p-PyUnicode_AS_UNICODE(v);
1701            if (s+digits>end) {
1702                endinpos = size;
1703                if (unicode_decode_call_errorhandler(
1704                    errors, &errorHandler,
1705                    "unicodeescape", "end of string in escape sequence",
1706                    starts, size, &startinpos, &endinpos, &exc, &s,
1707                    (PyObject **)&v, &outpos, &p))
1708                    goto onError;
1709                goto nextByte;
1710            }
1711            for (i = 0; i < digits; ++i) {
1712                c = (unsigned char) s[i];
1713                if (!isxdigit(c)) {
1714                    endinpos = (s+i+1)-starts;
1715                    if (unicode_decode_call_errorhandler(
1716                        errors, &errorHandler,
1717                        "unicodeescape", message,
1718                        starts, size, &startinpos, &endinpos, &exc, &s,
1719                        (PyObject **)&v, &outpos, &p))
1720                        goto onError;
1721                    goto nextByte;
1722                }
1723                chr = (chr<<4) & ~0xF;
1724                if (c >= '0' && c <= '9')
1725                    chr += c - '0';
1726                else if (c >= 'a' && c <= 'f')
1727                    chr += 10 + c - 'a';
1728                else
1729                    chr += 10 + c - 'A';
1730            }
1731            s += i;
1732            if (chr == 0xffffffff)
1733                /* _decoding_error will have already written into the
1734                   target buffer. */
1735                break;
1736        store:
1737            /* when we get here, chr is a 32-bit unicode character */
1738            if (chr <= 0xffff)
1739                /* UCS-2 character */
1740                *p++ = (Py_UNICODE) chr;
1741            else if (chr <= 0x10ffff) {
1742                /* UCS-4 character. Either store directly, or as
1743                   surrogate pair. */
1744#ifdef Py_UNICODE_WIDE
1745                *p++ = chr;
1746#else
1747                chr -= 0x10000L;
1748                *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1749                *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
1750#endif
1751            } else {
1752                endinpos = s-starts;
1753                outpos = p-PyUnicode_AS_UNICODE(v);
1754                if (unicode_decode_call_errorhandler(
1755                    errors, &errorHandler,
1756                    "unicodeescape", "illegal Unicode character",
1757                    starts, size, &startinpos, &endinpos, &exc, &s,
1758                    (PyObject **)&v, &outpos, &p))
1759                    goto onError;
1760            }
1761            break;
1762
1763        /* \N{name} */
1764        case 'N':
1765            message = "malformed \\N character escape";
1766            if (ucnhash_CAPI == NULL) {
1767                /* load the unicode data module */
1768                PyObject *m, *v;
1769                m = PyImport_ImportModule("unicodedata");
1770                if (m == NULL)
1771                    goto ucnhashError;
1772                v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1773                Py_DECREF(m);
1774                if (v == NULL)
1775                    goto ucnhashError;
1776                ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1777                Py_DECREF(v);
1778                if (ucnhash_CAPI == NULL)
1779                    goto ucnhashError;
1780            }
1781            if (*s == '{') {
1782                const char *start = s+1;
1783                /* look for the closing brace */
1784                while (*s != '}' && s < end)
1785                    s++;
1786                if (s > start && s < end && *s == '}') {
1787                    /* found a name.  look it up in the unicode database */
1788                    message = "unknown Unicode character name";
1789                    s++;
1790                    if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1791                        goto store;
1792                }
1793            }
1794            endinpos = s-starts;
1795            outpos = p-PyUnicode_AS_UNICODE(v);
1796            if (unicode_decode_call_errorhandler(
1797                errors, &errorHandler,
1798                "unicodeescape", message,
1799                starts, size, &startinpos, &endinpos, &exc, &s,
1800                (PyObject **)&v, &outpos, &p))
1801                goto onError;
1802            break;
1803
1804        default:
1805            if (s > end) {
1806                message = "\\ at end of string";
1807                s--;
1808                endinpos = s-starts;
1809                outpos = p-PyUnicode_AS_UNICODE(v);
1810                if (unicode_decode_call_errorhandler(
1811                    errors, &errorHandler,
1812                    "unicodeescape", message,
1813                    starts, size, &startinpos, &endinpos, &exc, &s,
1814                    (PyObject **)&v, &outpos, &p))
1815                    goto onError;
1816            }
1817            else {
1818                *p++ = '\\';
1819                *p++ = (unsigned char)s[-1];
1820            }
1821            break;
1822        }
1823        nextByte:
1824        ;
1825    }
1826    if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
1827        goto onError;
1828    return (PyObject *)v;
1829
1830ucnhashError:
1831    PyErr_SetString(
1832        PyExc_UnicodeError,
1833        "\\N escapes not supported (can't load unicodedata module)"
1834        );
1835    Py_XDECREF(errorHandler);
1836    Py_XDECREF(exc);
1837    return NULL;
1838
1839onError:
1840    Py_XDECREF(v);
1841    Py_XDECREF(errorHandler);
1842    Py_XDECREF(exc);
1843    return NULL;
1844}
1845
1846/* Return a Unicode-Escape string version of the Unicode object.
1847
1848   If quotes is true, the string is enclosed in u"" or u'' quotes as
1849   appropriate.
1850
1851*/
1852
1853static const Py_UNICODE *findchar(const Py_UNICODE *s,
1854				  int size,
1855				  Py_UNICODE ch);
1856
1857static
1858PyObject *unicodeescape_string(const Py_UNICODE *s,
1859                               int size,
1860                               int quotes)
1861{
1862    PyObject *repr;
1863    char *p;
1864
1865    static const char *hexdigit = "0123456789abcdef";
1866
1867    repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1868    if (repr == NULL)
1869        return NULL;
1870
1871    p = PyString_AS_STRING(repr);
1872
1873    if (quotes) {
1874        *p++ = 'u';
1875        *p++ = (findchar(s, size, '\'') &&
1876                !findchar(s, size, '"')) ? '"' : '\'';
1877    }
1878    while (size-- > 0) {
1879        Py_UNICODE ch = *s++;
1880
1881        /* Escape quotes */
1882        if (quotes &&
1883	    (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
1884            *p++ = '\\';
1885            *p++ = (char) ch;
1886	    continue;
1887        }
1888
1889#ifdef Py_UNICODE_WIDE
1890        /* Map 21-bit characters to '\U00xxxxxx' */
1891        else if (ch >= 0x10000) {
1892	    int offset = p - PyString_AS_STRING(repr);
1893
1894	    /* Resize the string if necessary */
1895	    if (offset + 12 > PyString_GET_SIZE(repr)) {
1896		if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
1897		    return NULL;
1898		p = PyString_AS_STRING(repr) + offset;
1899	    }
1900
1901            *p++ = '\\';
1902            *p++ = 'U';
1903            *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1904            *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1905            *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1906            *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1907            *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1908            *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1909            *p++ = hexdigit[(ch >> 4) & 0x0000000F];
1910            *p++ = hexdigit[ch & 0x0000000F];
1911	    continue;
1912        }
1913#endif
1914	/* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1915	else if (ch >= 0xD800 && ch < 0xDC00) {
1916	    Py_UNICODE ch2;
1917	    Py_UCS4 ucs;
1918
1919	    ch2 = *s++;
1920	    size--;
1921	    if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1922		ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1923		*p++ = '\\';
1924		*p++ = 'U';
1925		*p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1926		*p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1927		*p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1928		*p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1929		*p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1930		*p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1931		*p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1932		*p++ = hexdigit[ucs & 0x0000000F];
1933		continue;
1934	    }
1935	    /* Fall through: isolated surrogates are copied as-is */
1936	    s--;
1937	    size++;
1938	}
1939
1940        /* Map 16-bit characters to '\uxxxx' */
1941        if (ch >= 256) {
1942            *p++ = '\\';
1943            *p++ = 'u';
1944            *p++ = hexdigit[(ch >> 12) & 0x000F];
1945            *p++ = hexdigit[(ch >> 8) & 0x000F];
1946            *p++ = hexdigit[(ch >> 4) & 0x000F];
1947            *p++ = hexdigit[ch & 0x000F];
1948        }
1949
1950        /* Map special whitespace to '\t', \n', '\r' */
1951        else if (ch == '\t') {
1952            *p++ = '\\';
1953            *p++ = 't';
1954        }
1955        else if (ch == '\n') {
1956            *p++ = '\\';
1957            *p++ = 'n';
1958        }
1959        else if (ch == '\r') {
1960            *p++ = '\\';
1961            *p++ = 'r';
1962        }
1963
1964        /* Map non-printable US ASCII to '\xhh' */
1965        else if (ch < ' ' || ch >= 0x7F) {
1966            *p++ = '\\';
1967            *p++ = 'x';
1968            *p++ = hexdigit[(ch >> 4) & 0x000F];
1969            *p++ = hexdigit[ch & 0x000F];
1970        }
1971
1972        /* Copy everything else as-is */
1973        else
1974            *p++ = (char) ch;
1975    }
1976    if (quotes)
1977        *p++ = PyString_AS_STRING(repr)[1];
1978
1979    *p = '\0';
1980    _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
1981    return repr;
1982}
1983
1984PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1985					int size)
1986{
1987    return unicodeescape_string(s, size, 0);
1988}
1989
1990PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1991{
1992    if (!PyUnicode_Check(unicode)) {
1993        PyErr_BadArgument();
1994        return NULL;
1995    }
1996    return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1997					 PyUnicode_GET_SIZE(unicode));
1998}
1999
2000/* --- Raw Unicode Escape Codec ------------------------------------------- */
2001
2002PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2003					   int size,
2004					   const char *errors)
2005{
2006    const char *starts = s;
2007    int startinpos;
2008    int endinpos;
2009    int outpos;
2010    PyUnicodeObject *v;
2011    Py_UNICODE *p;
2012    const char *end;
2013    const char *bs;
2014    PyObject *errorHandler = NULL;
2015    PyObject *exc = NULL;
2016
2017    /* Escaped strings will always be longer than the resulting
2018       Unicode string, so we start with size here and then reduce the
2019       length after conversion to the true value. (But decoding error
2020       handler might have to resize the string) */
2021    v = _PyUnicode_New(size);
2022    if (v == NULL)
2023	goto onError;
2024    if (size == 0)
2025	return (PyObject *)v;
2026    p = PyUnicode_AS_UNICODE(v);
2027    end = s + size;
2028    while (s < end) {
2029	unsigned char c;
2030	Py_UCS4 x;
2031	int i;
2032
2033	/* Non-escape characters are interpreted as Unicode ordinals */
2034	if (*s != '\\') {
2035	    *p++ = (unsigned char)*s++;
2036	    continue;
2037	}
2038	startinpos = s-starts;
2039
2040	/* \u-escapes are only interpreted iff the number of leading
2041	   backslashes if odd */
2042	bs = s;
2043	for (;s < end;) {
2044	    if (*s != '\\')
2045		break;
2046	    *p++ = (unsigned char)*s++;
2047	}
2048	if (((s - bs) & 1) == 0 ||
2049	    s >= end ||
2050	    *s != 'u') {
2051	    continue;
2052	}
2053	p--;
2054	s++;
2055
2056	/* \uXXXX with 4 hex digits */
2057	outpos = p-PyUnicode_AS_UNICODE(v);
2058	for (x = 0, i = 0; i < 4; ++i, ++s) {
2059	    c = (unsigned char)*s;
2060	    if (!isxdigit(c)) {
2061		endinpos = s-starts;
2062		if (unicode_decode_call_errorhandler(
2063		    errors, &errorHandler,
2064		    "rawunicodeescape", "truncated \\uXXXX",
2065		    starts, size, &startinpos, &endinpos, &exc, &s,
2066		    (PyObject **)&v, &outpos, &p))
2067		    goto onError;
2068		goto nextByte;
2069	    }
2070	    x = (x<<4) & ~0xF;
2071	    if (c >= '0' && c <= '9')
2072		x += c - '0';
2073	    else if (c >= 'a' && c <= 'f')
2074		x += 10 + c - 'a';
2075	    else
2076		x += 10 + c - 'A';
2077	}
2078	*p++ = x;
2079	nextByte:
2080	;
2081    }
2082    if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2083	goto onError;
2084    Py_XDECREF(errorHandler);
2085    Py_XDECREF(exc);
2086    return (PyObject *)v;
2087
2088 onError:
2089    Py_XDECREF(v);
2090    Py_XDECREF(errorHandler);
2091    Py_XDECREF(exc);
2092    return NULL;
2093}
2094
2095PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2096					   int size)
2097{
2098    PyObject *repr;
2099    char *p;
2100    char *q;
2101
2102    static const char *hexdigit = "0123456789abcdef";
2103
2104    repr = PyString_FromStringAndSize(NULL, 6 * size);
2105    if (repr == NULL)
2106        return NULL;
2107    if (size == 0)
2108	return repr;
2109
2110    p = q = PyString_AS_STRING(repr);
2111    while (size-- > 0) {
2112        Py_UNICODE ch = *s++;
2113	/* Map 16-bit characters to '\uxxxx' */
2114	if (ch >= 256) {
2115            *p++ = '\\';
2116            *p++ = 'u';
2117            *p++ = hexdigit[(ch >> 12) & 0xf];
2118            *p++ = hexdigit[(ch >> 8) & 0xf];
2119            *p++ = hexdigit[(ch >> 4) & 0xf];
2120            *p++ = hexdigit[ch & 15];
2121        }
2122	/* Copy everything else as-is */
2123	else
2124            *p++ = (char) ch;
2125    }
2126    *p = '\0';
2127    _PyString_Resize(&repr, p - q);
2128    return repr;
2129}
2130
2131PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2132{
2133    if (!PyUnicode_Check(unicode)) {
2134	PyErr_BadArgument();
2135	return NULL;
2136    }
2137    return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2138					    PyUnicode_GET_SIZE(unicode));
2139}
2140
2141/* --- Latin-1 Codec ------------------------------------------------------ */
2142
2143PyObject *PyUnicode_DecodeLatin1(const char *s,
2144				 int size,
2145				 const char *errors)
2146{
2147    PyUnicodeObject *v;
2148    Py_UNICODE *p;
2149
2150    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
2151    if (size == 1 && *(unsigned char*)s < 256) {
2152	Py_UNICODE r = *(unsigned char*)s;
2153	return PyUnicode_FromUnicode(&r, 1);
2154    }
2155
2156    v = _PyUnicode_New(size);
2157    if (v == NULL)
2158	goto onError;
2159    if (size == 0)
2160	return (PyObject *)v;
2161    p = PyUnicode_AS_UNICODE(v);
2162    while (size-- > 0)
2163	*p++ = (unsigned char)*s++;
2164    return (PyObject *)v;
2165
2166 onError:
2167    Py_XDECREF(v);
2168    return NULL;
2169}
2170
2171/* create or adjust a UnicodeEncodeError */
2172static void make_encode_exception(PyObject **exceptionObject,
2173    const char *encoding,
2174    const Py_UNICODE *unicode, int size,
2175    int startpos, int endpos,
2176    const char *reason)
2177{
2178    if (*exceptionObject == NULL) {
2179	*exceptionObject = PyUnicodeEncodeError_Create(
2180	    encoding, unicode, size, startpos, endpos, reason);
2181    }
2182    else {
2183	if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2184	    goto onError;
2185	if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2186	    goto onError;
2187	if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2188	    goto onError;
2189	return;
2190	onError:
2191	Py_DECREF(*exceptionObject);
2192	*exceptionObject = NULL;
2193    }
2194}
2195
2196/* raises a UnicodeEncodeError */
2197static void raise_encode_exception(PyObject **exceptionObject,
2198    const char *encoding,
2199    const Py_UNICODE *unicode, int size,
2200    int startpos, int endpos,
2201    const char *reason)
2202{
2203    make_encode_exception(exceptionObject,
2204	encoding, unicode, size, startpos, endpos, reason);
2205    if (*exceptionObject != NULL)
2206	PyCodec_StrictErrors(*exceptionObject);
2207}
2208
2209/* error handling callback helper:
2210   build arguments, call the callback and check the arguments,
2211   put the result into newpos and return the replacement string, which
2212   has to be freed by the caller */
2213static PyObject *unicode_encode_call_errorhandler(const char *errors,
2214    PyObject **errorHandler,
2215    const char *encoding, const char *reason,
2216    const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
2217    int startpos, int endpos,
2218    int *newpos)
2219{
2220    static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple";
2221
2222    PyObject *restuple;
2223    PyObject *resunicode;
2224
2225    if (*errorHandler == NULL) {
2226	*errorHandler = PyCodec_LookupError(errors);
2227        if (*errorHandler == NULL)
2228	    return NULL;
2229    }
2230
2231    make_encode_exception(exceptionObject,
2232	encoding, unicode, size, startpos, endpos, reason);
2233    if (*exceptionObject == NULL)
2234	return NULL;
2235
2236    restuple = PyObject_CallFunctionObjArgs(
2237	*errorHandler, *exceptionObject, NULL);
2238    if (restuple == NULL)
2239	return NULL;
2240    if (!PyTuple_Check(restuple)) {
2241	PyErr_Format(PyExc_TypeError, &argparse[4]);
2242	Py_DECREF(restuple);
2243	return NULL;
2244    }
2245    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2246	&resunicode, newpos)) {
2247	Py_DECREF(restuple);
2248	return NULL;
2249    }
2250    if (*newpos<0)
2251	*newpos = size+*newpos;
2252    if (*newpos<0 || *newpos>size) {
2253	PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
2254	Py_DECREF(restuple);
2255	return NULL;
2256    }
2257    Py_INCREF(resunicode);
2258    Py_DECREF(restuple);
2259    return resunicode;
2260}
2261
2262static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2263				 int size,
2264				 const char *errors,
2265				 int limit)
2266{
2267    /* output object */
2268    PyObject *res;
2269    /* pointers to the beginning and end+1 of input */
2270    const Py_UNICODE *startp = p;
2271    const Py_UNICODE *endp = p + size;
2272    /* pointer to the beginning of the unencodable characters */
2273    /* const Py_UNICODE *badp = NULL; */
2274    /* pointer into the output */
2275    char *str;
2276    /* current output position */
2277    int respos = 0;
2278    int ressize;
2279    char *encoding = (limit == 256) ? "latin-1" : "ascii";
2280    char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2281    PyObject *errorHandler = NULL;
2282    PyObject *exc = NULL;
2283    /* the following variable is used for caching string comparisons
2284     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2285    int known_errorHandler = -1;
2286
2287    /* allocate enough for a simple encoding without
2288       replacements, if we need more, we'll resize */
2289    res = PyString_FromStringAndSize(NULL, size);
2290    if (res == NULL)
2291        goto onError;
2292    if (size == 0)
2293	return res;
2294    str = PyString_AS_STRING(res);
2295    ressize = size;
2296
2297    while (p<endp) {
2298	Py_UNICODE c = *p;
2299
2300	/* can we encode this? */
2301	if (c<limit) {
2302	    /* no overflow check, because we know that the space is enough */
2303	    *str++ = (char)c;
2304	    ++p;
2305	}
2306	else {
2307	    int unicodepos = p-startp;
2308	    int requiredsize;
2309	    PyObject *repunicode;
2310	    int repsize;
2311	    int newpos;
2312	    int respos;
2313	    Py_UNICODE *uni2;
2314	    /* startpos for collecting unencodable chars */
2315	    const Py_UNICODE *collstart = p;
2316	    const Py_UNICODE *collend = p;
2317	    /* find all unecodable characters */
2318	    while ((collend < endp) && ((*collend)>=limit))
2319		++collend;
2320	    /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2321	    if (known_errorHandler==-1) {
2322		if ((errors==NULL) || (!strcmp(errors, "strict")))
2323		    known_errorHandler = 1;
2324		else if (!strcmp(errors, "replace"))
2325		    known_errorHandler = 2;
2326		else if (!strcmp(errors, "ignore"))
2327		    known_errorHandler = 3;
2328		else if (!strcmp(errors, "xmlcharrefreplace"))
2329		    known_errorHandler = 4;
2330		else
2331		    known_errorHandler = 0;
2332	    }
2333	    switch (known_errorHandler) {
2334		case 1: /* strict */
2335		    raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2336		    goto onError;
2337		case 2: /* replace */
2338		    while (collstart++<collend)
2339			*str++ = '?'; /* fall through */
2340		case 3: /* ignore */
2341		    p = collend;
2342		    break;
2343		case 4: /* xmlcharrefreplace */
2344		    respos = str-PyString_AS_STRING(res);
2345		    /* determine replacement size (temporarily (mis)uses p) */
2346		    for (p = collstart, repsize = 0; p < collend; ++p) {
2347			if (*p<10)
2348			    repsize += 2+1+1;
2349			else if (*p<100)
2350			    repsize += 2+2+1;
2351			else if (*p<1000)
2352			    repsize += 2+3+1;
2353			else if (*p<10000)
2354			    repsize += 2+4+1;
2355			else if (*p<100000)
2356			    repsize += 2+5+1;
2357			else if (*p<1000000)
2358			    repsize += 2+6+1;
2359			else
2360			    repsize += 2+7+1;
2361		    }
2362		    requiredsize = respos+repsize+(endp-collend);
2363		    if (requiredsize > ressize) {
2364			if (requiredsize<2*ressize)
2365			    requiredsize = 2*ressize;
2366			if (_PyString_Resize(&res, requiredsize))
2367			    goto onError;
2368			str = PyString_AS_STRING(res) + respos;
2369			ressize = requiredsize;
2370		    }
2371		    /* generate replacement (temporarily (mis)uses p) */
2372		    for (p = collstart; p < collend; ++p) {
2373			str += sprintf(str, "&#%d;", (int)*p);
2374		    }
2375		    p = collend;
2376		    break;
2377		default:
2378		    repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2379			encoding, reason, startp, size, &exc,
2380			collstart-startp, collend-startp, &newpos);
2381		    if (repunicode == NULL)
2382			goto onError;
2383		    /* need more space? (at least enough for what we
2384		       have+the replacement+the rest of the string, so
2385		       we won't have to check space for encodable characters) */
2386		    respos = str-PyString_AS_STRING(res);
2387		    repsize = PyUnicode_GET_SIZE(repunicode);
2388		    requiredsize = respos+repsize+(endp-collend);
2389		    if (requiredsize > ressize) {
2390			if (requiredsize<2*ressize)
2391			    requiredsize = 2*ressize;
2392			if (_PyString_Resize(&res, requiredsize)) {
2393			    Py_DECREF(repunicode);
2394			    goto onError;
2395			}
2396			str = PyString_AS_STRING(res) + respos;
2397			ressize = requiredsize;
2398		    }
2399		    /* check if there is anything unencodable in the replacement
2400		       and copy it to the output */
2401		    for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2402			c = *uni2;
2403			if (c >= limit) {
2404			    raise_encode_exception(&exc, encoding, startp, size,
2405				unicodepos, unicodepos+1, reason);
2406			    Py_DECREF(repunicode);
2407			    goto onError;
2408			}
2409			*str = (char)c;
2410		    }
2411		    p = startp + newpos;
2412		    Py_DECREF(repunicode);
2413	    }
2414	}
2415    }
2416    /* Resize if we allocated to much */
2417    respos = str-PyString_AS_STRING(res);
2418    if (respos<ressize)
2419       /* If this falls res will be NULL */
2420	_PyString_Resize(&res, respos);
2421    Py_XDECREF(errorHandler);
2422    Py_XDECREF(exc);
2423    return res;
2424
2425    onError:
2426    Py_XDECREF(res);
2427    Py_XDECREF(errorHandler);
2428    Py_XDECREF(exc);
2429    return NULL;
2430}
2431
2432PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2433				 int size,
2434				 const char *errors)
2435{
2436    return unicode_encode_ucs1(p, size, errors, 256);
2437}
2438
2439PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2440{
2441    if (!PyUnicode_Check(unicode)) {
2442	PyErr_BadArgument();
2443	return NULL;
2444    }
2445    return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2446				  PyUnicode_GET_SIZE(unicode),
2447				  NULL);
2448}
2449
2450/* --- 7-bit ASCII Codec -------------------------------------------------- */
2451
2452PyObject *PyUnicode_DecodeASCII(const char *s,
2453				int size,
2454				const char *errors)
2455{
2456    const char *starts = s;
2457    PyUnicodeObject *v;
2458    Py_UNICODE *p;
2459    int startinpos;
2460    int endinpos;
2461    int outpos;
2462    const char *e;
2463    PyObject *errorHandler = NULL;
2464    PyObject *exc = NULL;
2465
2466    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
2467    if (size == 1 && *(unsigned char*)s < 128) {
2468	Py_UNICODE r = *(unsigned char*)s;
2469	return PyUnicode_FromUnicode(&r, 1);
2470    }
2471
2472    v = _PyUnicode_New(size);
2473    if (v == NULL)
2474	goto onError;
2475    if (size == 0)
2476	return (PyObject *)v;
2477    p = PyUnicode_AS_UNICODE(v);
2478    e = s + size;
2479    while (s < e) {
2480	register unsigned char c = (unsigned char)*s;
2481	if (c < 128) {
2482	    *p++ = c;
2483	    ++s;
2484	}
2485	else {
2486	    startinpos = s-starts;
2487	    endinpos = startinpos + 1;
2488	    outpos = p-PyUnicode_AS_UNICODE(v);
2489	    if (unicode_decode_call_errorhandler(
2490		 errors, &errorHandler,
2491		 "ascii", "ordinal not in range(128)",
2492		 starts, size, &startinpos, &endinpos, &exc, &s,
2493		 (PyObject **)&v, &outpos, &p))
2494		goto onError;
2495	}
2496    }
2497    if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
2498	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2499	    goto onError;
2500    Py_XDECREF(errorHandler);
2501    Py_XDECREF(exc);
2502    return (PyObject *)v;
2503
2504 onError:
2505    Py_XDECREF(v);
2506    Py_XDECREF(errorHandler);
2507    Py_XDECREF(exc);
2508    return NULL;
2509}
2510
2511PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2512				int size,
2513				const char *errors)
2514{
2515    return unicode_encode_ucs1(p, size, errors, 128);
2516}
2517
2518PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2519{
2520    if (!PyUnicode_Check(unicode)) {
2521	PyErr_BadArgument();
2522	return NULL;
2523    }
2524    return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2525				 PyUnicode_GET_SIZE(unicode),
2526				 NULL);
2527}
2528
2529#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
2530
2531/* --- MBCS codecs for Windows -------------------------------------------- */
2532
2533PyObject *PyUnicode_DecodeMBCS(const char *s,
2534				int size,
2535				const char *errors)
2536{
2537    PyUnicodeObject *v;
2538    Py_UNICODE *p;
2539
2540    /* First get the size of the result */
2541    DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2542    if (size > 0 && usize==0)
2543        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2544
2545    v = _PyUnicode_New(usize);
2546    if (v == NULL)
2547        return NULL;
2548    if (usize == 0)
2549	return (PyObject *)v;
2550    p = PyUnicode_AS_UNICODE(v);
2551    if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2552        Py_DECREF(v);
2553        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2554    }
2555
2556    return (PyObject *)v;
2557}
2558
2559PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2560				int size,
2561				const char *errors)
2562{
2563    PyObject *repr;
2564    char *s;
2565    DWORD mbcssize;
2566
2567    /* If there are no characters, bail now! */
2568    if (size==0)
2569	    return PyString_FromString("");
2570
2571    /* First get the size of the result */
2572    mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
2573    if (mbcssize==0)
2574        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2575
2576    repr = PyString_FromStringAndSize(NULL, mbcssize);
2577    if (repr == NULL)
2578        return NULL;
2579    if (mbcssize == 0)
2580        return repr;
2581
2582    /* Do the conversion */
2583    s = PyString_AS_STRING(repr);
2584    if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2585        Py_DECREF(repr);
2586        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2587    }
2588    return repr;
2589}
2590
2591#endif /* MS_WINDOWS */
2592
2593/* --- Character Mapping Codec -------------------------------------------- */
2594
2595PyObject *PyUnicode_DecodeCharmap(const char *s,
2596				  int size,
2597				  PyObject *mapping,
2598				  const char *errors)
2599{
2600    const char *starts = s;
2601    int startinpos;
2602    int endinpos;
2603    int outpos;
2604    const char *e;
2605    PyUnicodeObject *v;
2606    Py_UNICODE *p;
2607    int extrachars = 0;
2608    PyObject *errorHandler = NULL;
2609    PyObject *exc = NULL;
2610
2611    /* Default to Latin-1 */
2612    if (mapping == NULL)
2613	return PyUnicode_DecodeLatin1(s, size, errors);
2614
2615    v = _PyUnicode_New(size);
2616    if (v == NULL)
2617	goto onError;
2618    if (size == 0)
2619	return (PyObject *)v;
2620    p = PyUnicode_AS_UNICODE(v);
2621    e = s + size;
2622    while (s < e) {
2623	unsigned char ch = *s;
2624	PyObject *w, *x;
2625
2626	/* Get mapping (char ordinal -> integer, Unicode char or None) */
2627	w = PyInt_FromLong((long)ch);
2628	if (w == NULL)
2629	    goto onError;
2630	x = PyObject_GetItem(mapping, w);
2631	Py_DECREF(w);
2632	if (x == NULL) {
2633	    if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2634		/* No mapping found means: mapping is undefined. */
2635		PyErr_Clear();
2636		x = Py_None;
2637		Py_INCREF(x);
2638	    } else
2639		goto onError;
2640	}
2641
2642	/* Apply mapping */
2643	if (PyInt_Check(x)) {
2644	    long value = PyInt_AS_LONG(x);
2645	    if (value < 0 || value > 65535) {
2646		PyErr_SetString(PyExc_TypeError,
2647				"character mapping must be in range(65536)");
2648		Py_DECREF(x);
2649		goto onError;
2650	    }
2651	    *p++ = (Py_UNICODE)value;
2652	}
2653	else if (x == Py_None) {
2654	    /* undefined mapping */
2655	    outpos = p-PyUnicode_AS_UNICODE(v);
2656	    startinpos = s-starts;
2657	    endinpos = startinpos+1;
2658	    if (unicode_decode_call_errorhandler(
2659		 errors, &errorHandler,
2660		 "charmap", "character maps to <undefined>",
2661		 starts, size, &startinpos, &endinpos, &exc, &s,
2662		 (PyObject **)&v, &outpos, &p)) {
2663		Py_DECREF(x);
2664		goto onError;
2665	    }
2666	    continue;
2667	}
2668	else if (PyUnicode_Check(x)) {
2669	    int targetsize = PyUnicode_GET_SIZE(x);
2670
2671	    if (targetsize == 1)
2672		/* 1-1 mapping */
2673		*p++ = *PyUnicode_AS_UNICODE(x);
2674
2675	    else if (targetsize > 1) {
2676		/* 1-n mapping */
2677		if (targetsize > extrachars) {
2678		    /* resize first */
2679		    int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2680		    int needed = (targetsize - extrachars) + \
2681			         (targetsize << 2);
2682		    extrachars += needed;
2683		    if (_PyUnicode_Resize(&v,
2684					 PyUnicode_GET_SIZE(v) + needed)) {
2685			Py_DECREF(x);
2686			goto onError;
2687		    }
2688		    p = PyUnicode_AS_UNICODE(v) + oldpos;
2689		}
2690		Py_UNICODE_COPY(p,
2691				PyUnicode_AS_UNICODE(x),
2692				targetsize);
2693		p += targetsize;
2694		extrachars -= targetsize;
2695	    }
2696	    /* 1-0 mapping: skip the character */
2697	}
2698	else {
2699	    /* wrong return value */
2700	    PyErr_SetString(PyExc_TypeError,
2701		  "character mapping must return integer, None or unicode");
2702	    Py_DECREF(x);
2703	    goto onError;
2704	}
2705	Py_DECREF(x);
2706	++s;
2707    }
2708    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2709	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2710	    goto onError;
2711    Py_XDECREF(errorHandler);
2712    Py_XDECREF(exc);
2713    return (PyObject *)v;
2714
2715 onError:
2716    Py_XDECREF(errorHandler);
2717    Py_XDECREF(exc);
2718    Py_XDECREF(v);
2719    return NULL;
2720}
2721
2722/* Lookup the character ch in the mapping. If the character
2723   can't be found, Py_None is returned (or NULL, if another
2724   error occured). */
2725static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
2726{
2727    PyObject *w = PyInt_FromLong((long)c);
2728    PyObject *x;
2729
2730    if (w == NULL)
2731	 return NULL;
2732    x = PyObject_GetItem(mapping, w);
2733    Py_DECREF(w);
2734    if (x == NULL) {
2735	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2736	    /* No mapping found means: mapping is undefined. */
2737	    PyErr_Clear();
2738	    x = Py_None;
2739	    Py_INCREF(x);
2740	    return x;
2741	} else
2742	    return NULL;
2743    }
2744    else if (x == Py_None)
2745	return x;
2746    else if (PyInt_Check(x)) {
2747	long value = PyInt_AS_LONG(x);
2748	if (value < 0 || value > 255) {
2749	    PyErr_SetString(PyExc_TypeError,
2750			     "character mapping must be in range(256)");
2751	    Py_DECREF(x);
2752	    return NULL;
2753	}
2754	return x;
2755    }
2756    else if (PyString_Check(x))
2757	return x;
2758    else {
2759	/* wrong return value */
2760	PyErr_SetString(PyExc_TypeError,
2761	      "character mapping must return integer, None or str");
2762	Py_DECREF(x);
2763	return NULL;
2764    }
2765}
2766
2767/* lookup the character, put the result in the output string and adjust
2768   various state variables. Reallocate the output string if not enough
2769   space is available. Return a new reference to the object that
2770   was put in the output buffer, or Py_None, if the mapping was undefined
2771   (in which case no character was written) or NULL, if a
2772   reallocation error ocurred. The called must decref the result */
2773static
2774PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
2775    PyObject **outobj, int *outpos)
2776{
2777    PyObject *rep = charmapencode_lookup(c, mapping);
2778
2779    if (rep==NULL)
2780	return NULL;
2781    else if (rep==Py_None)
2782	return rep;
2783    else {
2784	char *outstart = PyString_AS_STRING(*outobj);
2785	int outsize = PyString_GET_SIZE(*outobj);
2786	if (PyInt_Check(rep)) {
2787	    int requiredsize = *outpos+1;
2788	    if (outsize<requiredsize) {
2789		/* exponentially overallocate to minimize reallocations */
2790		if (requiredsize < 2*outsize)
2791		    requiredsize = 2*outsize;
2792		if (_PyString_Resize(outobj, requiredsize)) {
2793		    Py_DECREF(rep);
2794		    return NULL;
2795		}
2796		outstart = PyString_AS_STRING(*outobj);
2797	    }
2798	    outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
2799	}
2800	else {
2801	    const char *repchars = PyString_AS_STRING(rep);
2802	    int repsize = PyString_GET_SIZE(rep);
2803	    int requiredsize = *outpos+repsize;
2804	    if (outsize<requiredsize) {
2805		/* exponentially overallocate to minimize reallocations */
2806		if (requiredsize < 2*outsize)
2807		    requiredsize = 2*outsize;
2808		if (_PyString_Resize(outobj, requiredsize)) {
2809		    Py_DECREF(rep);
2810		    return NULL;
2811		}
2812		outstart = PyString_AS_STRING(*outobj);
2813	    }
2814	    memcpy(outstart + *outpos, repchars, repsize);
2815	    *outpos += repsize;
2816	}
2817    }
2818    return rep;
2819}
2820
2821/* handle an error in PyUnicode_EncodeCharmap
2822   Return 0 on success, -1 on error */
2823static
2824int charmap_encoding_error(
2825    const Py_UNICODE *p, int size, int *inpos, PyObject *mapping,
2826    PyObject **exceptionObject,
2827    int *known_errorHandler, PyObject *errorHandler, const char *errors,
2828    PyObject **res, int *respos)
2829{
2830    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
2831    int repsize;
2832    int newpos;
2833    Py_UNICODE *uni2;
2834    /* startpos for collecting unencodable chars */
2835    int collstartpos = *inpos;
2836    int collendpos = *inpos+1;
2837    int collpos;
2838    char *encoding = "charmap";
2839    char *reason = "character maps to <undefined>";
2840
2841    PyObject *x;
2842    /* find all unencodable characters */
2843    while (collendpos < size) {
2844	x = charmapencode_lookup(p[collendpos], mapping);
2845	if (x==NULL)
2846	    return -1;
2847	else if (x!=Py_None) {
2848	    Py_DECREF(x);
2849	    break;
2850	}
2851	Py_DECREF(x);
2852	++collendpos;
2853    }
2854    /* cache callback name lookup
2855     * (if not done yet, i.e. it's the first error) */
2856    if (*known_errorHandler==-1) {
2857	if ((errors==NULL) || (!strcmp(errors, "strict")))
2858	    *known_errorHandler = 1;
2859	else if (!strcmp(errors, "replace"))
2860	    *known_errorHandler = 2;
2861	else if (!strcmp(errors, "ignore"))
2862	    *known_errorHandler = 3;
2863	else if (!strcmp(errors, "xmlcharrefreplace"))
2864	    *known_errorHandler = 4;
2865	else
2866	    *known_errorHandler = 0;
2867    }
2868    switch (*known_errorHandler) {
2869	case 1: /* strict */
2870	    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2871	    return -1;
2872	case 2: /* replace */
2873	    for (collpos = collstartpos; collpos<collendpos; ++collpos) {
2874		x = charmapencode_output('?', mapping, res, respos);
2875		if (x==NULL) {
2876		    return -1;
2877		}
2878		else if (x==Py_None) {
2879		    Py_DECREF(x);
2880		    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2881		    return -1;
2882		}
2883		Py_DECREF(x);
2884	    }
2885	    /* fall through */
2886	case 3: /* ignore */
2887	    *inpos = collendpos;
2888	    break;
2889	case 4: /* xmlcharrefreplace */
2890	    /* generate replacement (temporarily (mis)uses p) */
2891	    for (collpos = collstartpos; collpos < collendpos; ++collpos) {
2892		char buffer[2+29+1+1];
2893		char *cp;
2894		sprintf(buffer, "&#%d;", (int)p[collpos]);
2895		for (cp = buffer; *cp; ++cp) {
2896		    x = charmapencode_output(*cp, mapping, res, respos);
2897		    if (x==NULL)
2898			return -1;
2899		    else if (x==Py_None) {
2900			Py_DECREF(x);
2901			raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2902			return -1;
2903		    }
2904		    Py_DECREF(x);
2905		}
2906	    }
2907	    *inpos = collendpos;
2908	    break;
2909	default:
2910	    repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2911		encoding, reason, p, size, exceptionObject,
2912		collstartpos, collendpos, &newpos);
2913	    if (repunicode == NULL)
2914		return -1;
2915	    /* generate replacement  */
2916	    repsize = PyUnicode_GET_SIZE(repunicode);
2917	    for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
2918		x = charmapencode_output(*uni2, mapping, res, respos);
2919		if (x==NULL) {
2920		    Py_DECREF(repunicode);
2921		    return -1;
2922		}
2923		else if (x==Py_None) {
2924		    Py_DECREF(repunicode);
2925		    Py_DECREF(x);
2926		    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2927		    return -1;
2928		}
2929		Py_DECREF(x);
2930	    }
2931	    *inpos = newpos;
2932	    Py_DECREF(repunicode);
2933    }
2934    return 0;
2935}
2936
2937PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2938				  int size,
2939				  PyObject *mapping,
2940				  const char *errors)
2941{
2942    /* output object */
2943    PyObject *res = NULL;
2944    /* current input position */
2945    int inpos = 0;
2946    /* current output position */
2947    int respos = 0;
2948    PyObject *errorHandler = NULL;
2949    PyObject *exc = NULL;
2950    /* the following variable is used for caching string comparisons
2951     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
2952     * 3=ignore, 4=xmlcharrefreplace */
2953    int known_errorHandler = -1;
2954
2955    /* Default to Latin-1 */
2956    if (mapping == NULL)
2957	return PyUnicode_EncodeLatin1(p, size, errors);
2958
2959    /* allocate enough for a simple encoding without
2960       replacements, if we need more, we'll resize */
2961    res = PyString_FromStringAndSize(NULL, size);
2962    if (res == NULL)
2963        goto onError;
2964    if (size == 0)
2965	return res;
2966
2967    while (inpos<size) {
2968	/* try to encode it */
2969	PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
2970	if (x==NULL) /* error */
2971	    goto onError;
2972	if (x==Py_None) { /* unencodable character */
2973	    if (charmap_encoding_error(p, size, &inpos, mapping,
2974		&exc,
2975		&known_errorHandler, errorHandler, errors,
2976		&res, &respos))
2977		goto onError;
2978	}
2979	else
2980	    /* done with this character => adjust input position */
2981	    ++inpos;
2982	Py_DECREF(x);
2983    }
2984
2985    /* Resize if we allocated to much */
2986    if (respos<PyString_GET_SIZE(res)) {
2987	if (_PyString_Resize(&res, respos))
2988	    goto onError;
2989    }
2990    Py_XDECREF(exc);
2991    Py_XDECREF(errorHandler);
2992    return res;
2993
2994    onError:
2995    Py_XDECREF(res);
2996    Py_XDECREF(exc);
2997    Py_XDECREF(errorHandler);
2998    return NULL;
2999}
3000
3001PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3002				    PyObject *mapping)
3003{
3004    if (!PyUnicode_Check(unicode) || mapping == NULL) {
3005	PyErr_BadArgument();
3006	return NULL;
3007    }
3008    return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3009				   PyUnicode_GET_SIZE(unicode),
3010				   mapping,
3011				   NULL);
3012}
3013
3014/* create or adjust a UnicodeTranslateError */
3015static void make_translate_exception(PyObject **exceptionObject,
3016    const Py_UNICODE *unicode, int size,
3017    int startpos, int endpos,
3018    const char *reason)
3019{
3020    if (*exceptionObject == NULL) {
3021    	*exceptionObject = PyUnicodeTranslateError_Create(
3022	    unicode, size, startpos, endpos, reason);
3023    }
3024    else {
3025	if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3026	    goto onError;
3027	if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3028	    goto onError;
3029	if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3030	    goto onError;
3031	return;
3032	onError:
3033	Py_DECREF(*exceptionObject);
3034	*exceptionObject = NULL;
3035    }
3036}
3037
3038/* raises a UnicodeTranslateError */
3039static void raise_translate_exception(PyObject **exceptionObject,
3040    const Py_UNICODE *unicode, int size,
3041    int startpos, int endpos,
3042    const char *reason)
3043{
3044    make_translate_exception(exceptionObject,
3045	unicode, size, startpos, endpos, reason);
3046    if (*exceptionObject != NULL)
3047	PyCodec_StrictErrors(*exceptionObject);
3048}
3049
3050/* error handling callback helper:
3051   build arguments, call the callback and check the arguments,
3052   put the result into newpos and return the replacement string, which
3053   has to be freed by the caller */
3054static PyObject *unicode_translate_call_errorhandler(const char *errors,
3055    PyObject **errorHandler,
3056    const char *reason,
3057    const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
3058    int startpos, int endpos,
3059    int *newpos)
3060{
3061    static char *argparse = "O!i;translating error handler must return (unicode, int) tuple";
3062
3063    PyObject *restuple;
3064    PyObject *resunicode;
3065
3066    if (*errorHandler == NULL) {
3067	*errorHandler = PyCodec_LookupError(errors);
3068        if (*errorHandler == NULL)
3069	    return NULL;
3070    }
3071
3072    make_translate_exception(exceptionObject,
3073	unicode, size, startpos, endpos, reason);
3074    if (*exceptionObject == NULL)
3075	return NULL;
3076
3077    restuple = PyObject_CallFunctionObjArgs(
3078	*errorHandler, *exceptionObject, NULL);
3079    if (restuple == NULL)
3080	return NULL;
3081    if (!PyTuple_Check(restuple)) {
3082	PyErr_Format(PyExc_TypeError, &argparse[4]);
3083	Py_DECREF(restuple);
3084	return NULL;
3085    }
3086    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3087	&resunicode, newpos)) {
3088	Py_DECREF(restuple);
3089	return NULL;
3090    }
3091    if (*newpos<0)
3092	*newpos = size+*newpos;
3093    if (*newpos<0 || *newpos>size) {
3094	PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
3095	Py_DECREF(restuple);
3096	return NULL;
3097    }
3098    Py_INCREF(resunicode);
3099    Py_DECREF(restuple);
3100    return resunicode;
3101}
3102
3103/* Lookup the character ch in the mapping and put the result in result,
3104   which must be decrefed by the caller.
3105   Return 0 on success, -1 on error */
3106static
3107int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3108{
3109    PyObject *w = PyInt_FromLong((long)c);
3110    PyObject *x;
3111
3112    if (w == NULL)
3113	 return -1;
3114    x = PyObject_GetItem(mapping, w);
3115    Py_DECREF(w);
3116    if (x == NULL) {
3117	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3118	    /* No mapping found means: use 1:1 mapping. */
3119	    PyErr_Clear();
3120	    *result = NULL;
3121	    return 0;
3122	} else
3123	    return -1;
3124    }
3125    else if (x == Py_None) {
3126	*result = x;
3127	return 0;
3128    }
3129    else if (PyInt_Check(x)) {
3130	long value = PyInt_AS_LONG(x);
3131	long max = PyUnicode_GetMax();
3132	if (value < 0 || value > max) {
3133	    PyErr_Format(PyExc_TypeError,
3134			     "character mapping must be in range(0x%lx)", max+1);
3135	    Py_DECREF(x);
3136	    return -1;
3137	}
3138	*result = x;
3139	return 0;
3140    }
3141    else if (PyUnicode_Check(x)) {
3142	*result = x;
3143	return 0;
3144    }
3145    else {
3146	/* wrong return value */
3147	PyErr_SetString(PyExc_TypeError,
3148	      "character mapping must return integer, None or unicode");
3149	return -1;
3150    }
3151}
3152/* ensure that *outobj is at least requiredsize characters long,
3153if not reallocate and adjust various state variables.
3154Return 0 on success, -1 on error */
3155static
3156int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, int *outsize,
3157    int requiredsize)
3158{
3159    if (requiredsize > *outsize) {
3160	/* remember old output position */
3161	int outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
3162	/* exponentially overallocate to minimize reallocations */
3163	if (requiredsize < 2 * *outsize)
3164	    requiredsize = 2 * *outsize;
3165	if (_PyUnicode_Resize(outobj, requiredsize))
3166	    return -1;
3167	*outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
3168	*outsize = requiredsize;
3169    }
3170    return 0;
3171}
3172/* lookup the character, put the result in the output string and adjust
3173   various state variables. Return a new reference to the object that
3174   was put in the output buffer in *result, or Py_None, if the mapping was
3175   undefined (in which case no character was written).
3176   The called must decref result.
3177   Return 0 on success, -1 on error. */
3178static
3179int charmaptranslate_output(Py_UNICODE c, PyObject *mapping,
3180    PyObject **outobj, int *outsize, Py_UNICODE **outp, PyObject **res)
3181{
3182    if (charmaptranslate_lookup(c, mapping, res))
3183	return -1;
3184    if (*res==NULL) {
3185	/* not found => default to 1:1 mapping */
3186	*(*outp)++ = (Py_UNICODE)c;
3187    }
3188    else if (*res==Py_None)
3189	;
3190    else if (PyInt_Check(*res)) {
3191	/* no overflow check, because we know that the space is enough */
3192	*(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3193    }
3194    else if (PyUnicode_Check(*res)) {
3195	int repsize = PyUnicode_GET_SIZE(*res);
3196	if (repsize==1) {
3197	    /* no overflow check, because we know that the space is enough */
3198	    *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3199	}
3200	else if (repsize!=0) {
3201	    /* more than one character */
3202	    int requiredsize = *outsize + repsize - 1;
3203	    if (charmaptranslate_makespace(outobj, outp, outsize, requiredsize))
3204		return -1;
3205	    memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3206	    *outp += repsize;
3207	}
3208    }
3209    else
3210	return -1;
3211    return 0;
3212}
3213
3214PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
3215				     int size,
3216				     PyObject *mapping,
3217				     const char *errors)
3218{
3219    /* output object */
3220    PyObject *res = NULL;
3221    /* pointers to the beginning and end+1 of input */
3222    const Py_UNICODE *startp = p;
3223    const Py_UNICODE *endp = p + size;
3224    /* pointer into the output */
3225    Py_UNICODE *str;
3226    /* current output position */
3227    int respos = 0;
3228    int ressize;
3229    char *reason = "character maps to <undefined>";
3230    PyObject *errorHandler = NULL;
3231    PyObject *exc = NULL;
3232    /* the following variable is used for caching string comparisons
3233     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3234     * 3=ignore, 4=xmlcharrefreplace */
3235    int known_errorHandler = -1;
3236
3237    if (mapping == NULL) {
3238	PyErr_BadArgument();
3239	return NULL;
3240    }
3241
3242    /* allocate enough for a simple 1:1 translation without
3243       replacements, if we need more, we'll resize */
3244    res = PyUnicode_FromUnicode(NULL, size);
3245    if (res == NULL)
3246        goto onError;
3247    if (size == 0)
3248	return res;
3249    str = PyUnicode_AS_UNICODE(res);
3250    ressize = size;
3251
3252    while (p<endp) {
3253	/* try to encode it */
3254	PyObject *x = NULL;
3255	if (charmaptranslate_output(*p, mapping, &res, &ressize, &str, &x)) {
3256	    Py_XDECREF(x);
3257	    goto onError;
3258	}
3259	Py_XDECREF(x);
3260	if (x!=Py_None) /* it worked => adjust input pointer */
3261	    ++p;
3262	else { /* untranslatable character */
3263	    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3264	    int repsize;
3265	    int newpos;
3266	    Py_UNICODE *uni2;
3267	    /* startpos for collecting untranslatable chars */
3268	    const Py_UNICODE *collstart = p;
3269	    const Py_UNICODE *collend = p+1;
3270	    const Py_UNICODE *coll;
3271
3272	    /* find all untranslatable characters */
3273	    while (collend < endp) {
3274	    	if (charmaptranslate_lookup(*collend, mapping, &x))
3275		    goto onError;
3276		Py_XDECREF(x);
3277		if (x!=Py_None)
3278		    break;
3279		++collend;
3280	    }
3281	    /* cache callback name lookup
3282	     * (if not done yet, i.e. it's the first error) */
3283	    if (known_errorHandler==-1) {
3284		if ((errors==NULL) || (!strcmp(errors, "strict")))
3285		    known_errorHandler = 1;
3286		else if (!strcmp(errors, "replace"))
3287		    known_errorHandler = 2;
3288		else if (!strcmp(errors, "ignore"))
3289		    known_errorHandler = 3;
3290		else if (!strcmp(errors, "xmlcharrefreplace"))
3291		    known_errorHandler = 4;
3292		else
3293		    known_errorHandler = 0;
3294	    }
3295	    switch (known_errorHandler) {
3296		case 1: /* strict */
3297		    raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3298		    goto onError;
3299		case 2: /* replace */
3300		    /* No need to check for space, this is a 1:1 replacement */
3301		    for (coll = collstart; coll<collend; ++coll)
3302			*str++ = '?';
3303		    /* fall through */
3304		case 3: /* ignore */
3305		    p = collend;
3306		    break;
3307		case 4: /* xmlcharrefreplace */
3308		    /* generate replacement (temporarily (mis)uses p) */
3309		    for (p = collstart; p < collend; ++p) {
3310			char buffer[2+29+1+1];
3311			char *cp;
3312			sprintf(buffer, "&#%d;", (int)*p);
3313			if (charmaptranslate_makespace(&res, &str, &ressize,
3314			    (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3315			    goto onError;
3316			for (cp = buffer; *cp; ++cp)
3317			    *str++ = *cp;
3318		    }
3319		    p = collend;
3320		    break;
3321		default:
3322		    repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3323			reason, startp, size, &exc,
3324			collstart-startp, collend-startp, &newpos);
3325		    if (repunicode == NULL)
3326			goto onError;
3327		    /* generate replacement  */
3328		    repsize = PyUnicode_GET_SIZE(repunicode);
3329		    if (charmaptranslate_makespace(&res, &str, &ressize,
3330			(str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3331			Py_DECREF(repunicode);
3332			goto onError;
3333		    }
3334		    for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3335			*str++ = *uni2;
3336		    p = startp + newpos;
3337		    Py_DECREF(repunicode);
3338	    }
3339	}
3340    }
3341    /* Resize if we allocated to much */
3342    respos = str-PyUnicode_AS_UNICODE(res);
3343    if (respos<ressize) {
3344	if (_PyUnicode_Resize(&res, respos))
3345	    goto onError;
3346    }
3347    Py_XDECREF(exc);
3348    Py_XDECREF(errorHandler);
3349    return res;
3350
3351    onError:
3352    Py_XDECREF(res);
3353    Py_XDECREF(exc);
3354    Py_XDECREF(errorHandler);
3355    return NULL;
3356}
3357
3358PyObject *PyUnicode_Translate(PyObject *str,
3359			      PyObject *mapping,
3360			      const char *errors)
3361{
3362    PyObject *result;
3363
3364    str = PyUnicode_FromObject(str);
3365    if (str == NULL)
3366	goto onError;
3367    result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3368					PyUnicode_GET_SIZE(str),
3369					mapping,
3370					errors);
3371    Py_DECREF(str);
3372    return result;
3373
3374 onError:
3375    Py_XDECREF(str);
3376    return NULL;
3377}
3378
3379/* --- Decimal Encoder ---------------------------------------------------- */
3380
3381int PyUnicode_EncodeDecimal(Py_UNICODE *s,
3382			    int length,
3383			    char *output,
3384			    const char *errors)
3385{
3386    Py_UNICODE *p, *end;
3387    PyObject *errorHandler = NULL;
3388    PyObject *exc = NULL;
3389    const char *encoding = "decimal";
3390    const char *reason = "invalid decimal Unicode string";
3391    /* the following variable is used for caching string comparisons
3392     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3393    int known_errorHandler = -1;
3394
3395    if (output == NULL) {
3396	PyErr_BadArgument();
3397	return -1;
3398    }
3399
3400    p = s;
3401    end = s + length;
3402    while (p < end) {
3403	register Py_UNICODE ch = *p;
3404	int decimal;
3405	PyObject *repunicode;
3406	int repsize;
3407	int newpos;
3408	Py_UNICODE *uni2;
3409	Py_UNICODE *collstart;
3410	Py_UNICODE *collend;
3411
3412	if (Py_UNICODE_ISSPACE(ch)) {
3413	    *output++ = ' ';
3414	    ++p;
3415	    continue;
3416	}
3417	decimal = Py_UNICODE_TODECIMAL(ch);
3418	if (decimal >= 0) {
3419	    *output++ = '0' + decimal;
3420	    ++p;
3421	    continue;
3422	}
3423	if (0 < ch && ch < 256) {
3424	    *output++ = (char)ch;
3425	    ++p;
3426	    continue;
3427	}
3428	/* All other characters are considered unencodable */
3429	collstart = p;
3430	collend = p+1;
3431	while (collend < end) {
3432	    if ((0 < *collend && *collend < 256) ||
3433	        !Py_UNICODE_ISSPACE(*collend) ||
3434	        Py_UNICODE_TODECIMAL(*collend))
3435		break;
3436	}
3437	/* cache callback name lookup
3438	 * (if not done yet, i.e. it's the first error) */
3439	if (known_errorHandler==-1) {
3440	    if ((errors==NULL) || (!strcmp(errors, "strict")))
3441		known_errorHandler = 1;
3442	    else if (!strcmp(errors, "replace"))
3443		known_errorHandler = 2;
3444	    else if (!strcmp(errors, "ignore"))
3445		known_errorHandler = 3;
3446	    else if (!strcmp(errors, "xmlcharrefreplace"))
3447		known_errorHandler = 4;
3448	    else
3449		known_errorHandler = 0;
3450	}
3451	switch (known_errorHandler) {
3452	    case 1: /* strict */
3453		raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3454		goto onError;
3455	    case 2: /* replace */
3456		for (p = collstart; p < collend; ++p)
3457		    *output++ = '?';
3458		/* fall through */
3459	    case 3: /* ignore */
3460		p = collend;
3461		break;
3462	    case 4: /* xmlcharrefreplace */
3463		/* generate replacement (temporarily (mis)uses p) */
3464		for (p = collstart; p < collend; ++p)
3465		    output += sprintf(output, "&#%d;", (int)*p);
3466		p = collend;
3467		break;
3468	    default:
3469		repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3470		    encoding, reason, s, length, &exc,
3471		    collstart-s, collend-s, &newpos);
3472		if (repunicode == NULL)
3473		    goto onError;
3474		/* generate replacement  */
3475		repsize = PyUnicode_GET_SIZE(repunicode);
3476		for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3477		    Py_UNICODE ch = *uni2;
3478		    if (Py_UNICODE_ISSPACE(ch))
3479			*output++ = ' ';
3480		    else {
3481			decimal = Py_UNICODE_TODECIMAL(ch);
3482			if (decimal >= 0)
3483			    *output++ = '0' + decimal;
3484			else if (0 < ch && ch < 256)
3485			    *output++ = (char)ch;
3486			else {
3487			    Py_DECREF(repunicode);
3488			    raise_encode_exception(&exc, encoding,
3489				s, length, collstart-s, collend-s, reason);
3490			    goto onError;
3491			}
3492		    }
3493		}
3494		p = s + newpos;
3495		Py_DECREF(repunicode);
3496	}
3497    }
3498    /* 0-terminate the output string */
3499    *output++ = '\0';
3500    Py_XDECREF(exc);
3501    Py_XDECREF(errorHandler);
3502    return 0;
3503
3504 onError:
3505    Py_XDECREF(exc);
3506    Py_XDECREF(errorHandler);
3507    return -1;
3508}
3509
3510/* --- Helpers ------------------------------------------------------------ */
3511
3512static
3513int count(PyUnicodeObject *self,
3514	  int start,
3515	  int end,
3516	  PyUnicodeObject *substring)
3517{
3518    int count = 0;
3519
3520    if (start < 0)
3521        start += self->length;
3522    if (start < 0)
3523        start = 0;
3524    if (end > self->length)
3525        end = self->length;
3526    if (end < 0)
3527        end += self->length;
3528    if (end < 0)
3529        end = 0;
3530
3531    if (substring->length == 0)
3532	return (end - start + 1);
3533
3534    end -= substring->length;
3535
3536    while (start <= end)
3537        if (Py_UNICODE_MATCH(self, start, substring)) {
3538            count++;
3539            start += substring->length;
3540        } else
3541            start++;
3542
3543    return count;
3544}
3545
3546int PyUnicode_Count(PyObject *str,
3547		    PyObject *substr,
3548		    int start,
3549		    int end)
3550{
3551    int result;
3552
3553    str = PyUnicode_FromObject(str);
3554    if (str == NULL)
3555	return -1;
3556    substr = PyUnicode_FromObject(substr);
3557    if (substr == NULL) {
3558	Py_DECREF(str);
3559	return -1;
3560    }
3561
3562    result = count((PyUnicodeObject *)str,
3563		   start, end,
3564		   (PyUnicodeObject *)substr);
3565
3566    Py_DECREF(str);
3567    Py_DECREF(substr);
3568    return result;
3569}
3570
3571static
3572int findstring(PyUnicodeObject *self,
3573	       PyUnicodeObject *substring,
3574	       int start,
3575	       int end,
3576	       int direction)
3577{
3578    if (start < 0)
3579        start += self->length;
3580    if (start < 0)
3581        start = 0;
3582
3583    if (end > self->length)
3584        end = self->length;
3585    if (end < 0)
3586        end += self->length;
3587    if (end < 0)
3588        end = 0;
3589
3590    if (substring->length == 0)
3591	return (direction > 0) ? start : end;
3592
3593    end -= substring->length;
3594
3595    if (direction < 0) {
3596        for (; end >= start; end--)
3597            if (Py_UNICODE_MATCH(self, end, substring))
3598                return end;
3599    } else {
3600        for (; start <= end; start++)
3601            if (Py_UNICODE_MATCH(self, start, substring))
3602                return start;
3603    }
3604
3605    return -1;
3606}
3607
3608int PyUnicode_Find(PyObject *str,
3609		   PyObject *substr,
3610		   int start,
3611		   int end,
3612		   int direction)
3613{
3614    int result;
3615
3616    str = PyUnicode_FromObject(str);
3617    if (str == NULL)
3618	return -2;
3619    substr = PyUnicode_FromObject(substr);
3620    if (substr == NULL) {
3621	Py_DECREF(str);
3622	return -2;
3623    }
3624
3625    result = findstring((PyUnicodeObject *)str,
3626			(PyUnicodeObject *)substr,
3627			start, end, direction);
3628    Py_DECREF(str);
3629    Py_DECREF(substr);
3630    return result;
3631}
3632
3633static
3634int tailmatch(PyUnicodeObject *self,
3635	      PyUnicodeObject *substring,
3636	      int start,
3637	      int end,
3638	      int direction)
3639{
3640    if (start < 0)
3641        start += self->length;
3642    if (start < 0)
3643        start = 0;
3644
3645    if (substring->length == 0)
3646        return 1;
3647
3648    if (end > self->length)
3649        end = self->length;
3650    if (end < 0)
3651        end += self->length;
3652    if (end < 0)
3653        end = 0;
3654
3655    end -= substring->length;
3656    if (end < start)
3657	return 0;
3658
3659    if (direction > 0) {
3660	if (Py_UNICODE_MATCH(self, end, substring))
3661	    return 1;
3662    } else {
3663        if (Py_UNICODE_MATCH(self, start, substring))
3664	    return 1;
3665    }
3666
3667    return 0;
3668}
3669
3670int PyUnicode_Tailmatch(PyObject *str,
3671			PyObject *substr,
3672			int start,
3673			int end,
3674			int direction)
3675{
3676    int result;
3677
3678    str = PyUnicode_FromObject(str);
3679    if (str == NULL)
3680	return -1;
3681    substr = PyUnicode_FromObject(substr);
3682    if (substr == NULL) {
3683	Py_DECREF(substr);
3684	return -1;
3685    }
3686
3687    result = tailmatch((PyUnicodeObject *)str,
3688		       (PyUnicodeObject *)substr,
3689		       start, end, direction);
3690    Py_DECREF(str);
3691    Py_DECREF(substr);
3692    return result;
3693}
3694
3695static
3696const Py_UNICODE *findchar(const Py_UNICODE *s,
3697		     int size,
3698		     Py_UNICODE ch)
3699{
3700    /* like wcschr, but doesn't stop at NULL characters */
3701
3702    while (size-- > 0) {
3703        if (*s == ch)
3704            return s;
3705        s++;
3706    }
3707
3708    return NULL;
3709}
3710
3711/* Apply fixfct filter to the Unicode object self and return a
3712   reference to the modified object */
3713
3714static
3715PyObject *fixup(PyUnicodeObject *self,
3716		int (*fixfct)(PyUnicodeObject *s))
3717{
3718
3719    PyUnicodeObject *u;
3720
3721    u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
3722    if (u == NULL)
3723	return NULL;
3724
3725    Py_UNICODE_COPY(u->str, self->str, self->length);
3726
3727    if (!fixfct(u) && PyUnicode_CheckExact(self)) {
3728	/* fixfct should return TRUE if it modified the buffer. If
3729	   FALSE, return a reference to the original buffer instead
3730	   (to save space, not time) */
3731	Py_INCREF(self);
3732	Py_DECREF(u);
3733	return (PyObject*) self;
3734    }
3735    return (PyObject*) u;
3736}
3737
3738static
3739int fixupper(PyUnicodeObject *self)
3740{
3741    int len = self->length;
3742    Py_UNICODE *s = self->str;
3743    int status = 0;
3744
3745    while (len-- > 0) {
3746	register Py_UNICODE ch;
3747
3748	ch = Py_UNICODE_TOUPPER(*s);
3749	if (ch != *s) {
3750            status = 1;
3751	    *s = ch;
3752	}
3753        s++;
3754    }
3755
3756    return status;
3757}
3758
3759static
3760int fixlower(PyUnicodeObject *self)
3761{
3762    int len = self->length;
3763    Py_UNICODE *s = self->str;
3764    int status = 0;
3765
3766    while (len-- > 0) {
3767	register Py_UNICODE ch;
3768
3769	ch = Py_UNICODE_TOLOWER(*s);
3770	if (ch != *s) {
3771            status = 1;
3772	    *s = ch;
3773	}
3774        s++;
3775    }
3776
3777    return status;
3778}
3779
3780static
3781int fixswapcase(PyUnicodeObject *self)
3782{
3783    int len = self->length;
3784    Py_UNICODE *s = self->str;
3785    int status = 0;
3786
3787    while (len-- > 0) {
3788        if (Py_UNICODE_ISUPPER(*s)) {
3789            *s = Py_UNICODE_TOLOWER(*s);
3790            status = 1;
3791        } else if (Py_UNICODE_ISLOWER(*s)) {
3792            *s = Py_UNICODE_TOUPPER(*s);
3793            status = 1;
3794        }
3795        s++;
3796    }
3797
3798    return status;
3799}
3800
3801static
3802int fixcapitalize(PyUnicodeObject *self)
3803{
3804    int len = self->length;
3805    Py_UNICODE *s = self->str;
3806    int status = 0;
3807
3808    if (len == 0)
3809	return 0;
3810    if (Py_UNICODE_ISLOWER(*s)) {
3811	*s = Py_UNICODE_TOUPPER(*s);
3812	status = 1;
3813    }
3814    s++;
3815    while (--len > 0) {
3816        if (Py_UNICODE_ISUPPER(*s)) {
3817            *s = Py_UNICODE_TOLOWER(*s);
3818            status = 1;
3819        }
3820        s++;
3821    }
3822    return status;
3823}
3824
3825static
3826int fixtitle(PyUnicodeObject *self)
3827{
3828    register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3829    register Py_UNICODE *e;
3830    int previous_is_cased;
3831
3832    /* Shortcut for single character strings */
3833    if (PyUnicode_GET_SIZE(self) == 1) {
3834	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3835	if (*p != ch) {
3836	    *p = ch;
3837	    return 1;
3838	}
3839	else
3840	    return 0;
3841    }
3842
3843    e = p + PyUnicode_GET_SIZE(self);
3844    previous_is_cased = 0;
3845    for (; p < e; p++) {
3846	register const Py_UNICODE ch = *p;
3847
3848	if (previous_is_cased)
3849	    *p = Py_UNICODE_TOLOWER(ch);
3850	else
3851	    *p = Py_UNICODE_TOTITLE(ch);
3852
3853	if (Py_UNICODE_ISLOWER(ch) ||
3854	    Py_UNICODE_ISUPPER(ch) ||
3855	    Py_UNICODE_ISTITLE(ch))
3856	    previous_is_cased = 1;
3857	else
3858	    previous_is_cased = 0;
3859    }
3860    return 1;
3861}
3862
3863PyObject *PyUnicode_Join(PyObject *separator,
3864			 PyObject *seq)
3865{
3866    Py_UNICODE *sep;
3867    int seplen;
3868    PyUnicodeObject *res = NULL;
3869    int reslen = 0;
3870    Py_UNICODE *p;
3871    int sz = 100;
3872    int i;
3873    PyObject *it;
3874
3875    it = PyObject_GetIter(seq);
3876    if (it == NULL)
3877        return NULL;
3878
3879    if (separator == NULL) {
3880	Py_UNICODE blank = ' ';
3881	sep = &blank;
3882	seplen = 1;
3883    }
3884    else {
3885	separator = PyUnicode_FromObject(separator);
3886	if (separator == NULL)
3887	    goto onError;
3888	sep = PyUnicode_AS_UNICODE(separator);
3889	seplen = PyUnicode_GET_SIZE(separator);
3890    }
3891
3892    res = _PyUnicode_New(sz);
3893    if (res == NULL)
3894	goto onError;
3895    p = PyUnicode_AS_UNICODE(res);
3896    reslen = 0;
3897
3898    for (i = 0; ; ++i) {
3899	int itemlen;
3900	PyObject *item = PyIter_Next(it);
3901	if (item == NULL) {
3902	    if (PyErr_Occurred())
3903		goto onError;
3904	    break;
3905	}
3906	if (!PyUnicode_Check(item)) {
3907	    PyObject *v;
3908	    if (!PyString_Check(item)) {
3909		PyErr_Format(PyExc_TypeError,
3910			     "sequence item %i: expected string or Unicode,"
3911			     " %.80s found",
3912			     i, item->ob_type->tp_name);
3913		Py_DECREF(item);
3914		goto onError;
3915	    }
3916	    v = PyUnicode_FromObject(item);
3917	    Py_DECREF(item);
3918	    item = v;
3919	    if (item == NULL)
3920		goto onError;
3921	}
3922	itemlen = PyUnicode_GET_SIZE(item);
3923	while (reslen + itemlen + seplen >= sz) {
3924	    if (_PyUnicode_Resize(&res, sz*2)) {
3925		Py_DECREF(item);
3926		goto onError;
3927	    }
3928	    sz *= 2;
3929	    p = PyUnicode_AS_UNICODE(res) + reslen;
3930	}
3931	if (i > 0) {
3932	    Py_UNICODE_COPY(p, sep, seplen);
3933	    p += seplen;
3934	    reslen += seplen;
3935	}
3936	Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
3937	p += itemlen;
3938	reslen += itemlen;
3939	Py_DECREF(item);
3940    }
3941    if (_PyUnicode_Resize(&res, reslen))
3942	goto onError;
3943
3944    Py_XDECREF(separator);
3945    Py_DECREF(it);
3946    return (PyObject *)res;
3947
3948 onError:
3949    Py_XDECREF(separator);
3950    Py_XDECREF(res);
3951    Py_DECREF(it);
3952    return NULL;
3953}
3954
3955static
3956PyUnicodeObject *pad(PyUnicodeObject *self,
3957		     int left,
3958		     int right,
3959		     Py_UNICODE fill)
3960{
3961    PyUnicodeObject *u;
3962
3963    if (left < 0)
3964        left = 0;
3965    if (right < 0)
3966        right = 0;
3967
3968    if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
3969        Py_INCREF(self);
3970        return self;
3971    }
3972
3973    u = _PyUnicode_New(left + self->length + right);
3974    if (u) {
3975        if (left)
3976            Py_UNICODE_FILL(u->str, fill, left);
3977        Py_UNICODE_COPY(u->str + left, self->str, self->length);
3978        if (right)
3979            Py_UNICODE_FILL(u->str + left + self->length, fill, right);
3980    }
3981
3982    return u;
3983}
3984
/* Append the slice data[left:right] to list as a new Unicode object.

   The macro relies on names provided by the function using it: a
   PyObject *str scratch variable, the PyObject *list to append to and
   an onError label, which is jumped to if creating or appending the
   slice fails.  The macro does not keep a reference to the new object.
   left is evaluated twice.

   The body is wrapped in do { } while (0) so that an invocation
   followed by a semicolon forms exactly one statement. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
3995
3996static
3997PyObject *split_whitespace(PyUnicodeObject *self,
3998			   PyObject *list,
3999			   int maxcount)
4000{
4001    register int i;
4002    register int j;
4003    int len = self->length;
4004    PyObject *str;
4005
4006    for (i = j = 0; i < len; ) {
4007	/* find a token */
4008	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4009	    i++;
4010	j = i;
4011	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4012	    i++;
4013	if (j < i) {
4014	    if (maxcount-- <= 0)
4015		break;
4016	    SPLIT_APPEND(self->str, j, i);
4017	    while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4018		i++;
4019	    j = i;
4020	}
4021    }
4022    if (j < len) {
4023	SPLIT_APPEND(self->str, j, len);
4024    }
4025    return list;
4026
4027 onError:
4028    Py_DECREF(list);
4029    return NULL;
4030}
4031
4032PyObject *PyUnicode_Splitlines(PyObject *string,
4033			       int keepends)
4034{
4035    register int i;
4036    register int j;
4037    int len;
4038    PyObject *list;
4039    PyObject *str;
4040    Py_UNICODE *data;
4041
4042    string = PyUnicode_FromObject(string);
4043    if (string == NULL)
4044	return NULL;
4045    data = PyUnicode_AS_UNICODE(string);
4046    len = PyUnicode_GET_SIZE(string);
4047
4048    list = PyList_New(0);
4049    if (!list)
4050        goto onError;
4051
4052    for (i = j = 0; i < len; ) {
4053	int eol;
4054
4055	/* Find a line and append it */
4056	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
4057	    i++;
4058
4059	/* Skip the line break reading CRLF as one line break */
4060	eol = i;
4061	if (i < len) {
4062	    if (data[i] == '\r' && i + 1 < len &&
4063		data[i+1] == '\n')
4064		i += 2;
4065	    else
4066		i++;
4067	    if (keepends)
4068		eol = i;
4069	}
4070	SPLIT_APPEND(data, j, eol);
4071	j = i;
4072    }
4073    if (j < len) {
4074	SPLIT_APPEND(data, j, len);
4075    }
4076
4077    Py_DECREF(string);
4078    return list;
4079
4080 onError:
4081    Py_DECREF(list);
4082    Py_DECREF(string);
4083    return NULL;
4084}
4085
4086static
4087PyObject *split_char(PyUnicodeObject *self,
4088		     PyObject *list,
4089		     Py_UNICODE ch,
4090		     int maxcount)
4091{
4092    register int i;
4093    register int j;
4094    int len = self->length;
4095    PyObject *str;
4096
4097    for (i = j = 0; i < len; ) {
4098	if (self->str[i] == ch) {
4099	    if (maxcount-- <= 0)
4100		break;
4101	    SPLIT_APPEND(self->str, j, i);
4102	    i = j = i + 1;
4103	} else
4104	    i++;
4105    }
4106    if (j <= len) {
4107	SPLIT_APPEND(self->str, j, len);
4108    }
4109    return list;
4110
4111 onError:
4112    Py_DECREF(list);
4113    return NULL;
4114}
4115
4116static
4117PyObject *split_substring(PyUnicodeObject *self,
4118			  PyObject *list,
4119			  PyUnicodeObject *substring,
4120			  int maxcount)
4121{
4122    register int i;
4123    register int j;
4124    int len = self->length;
4125    int sublen = substring->length;
4126    PyObject *str;
4127
4128    for (i = j = 0; i <= len - sublen; ) {
4129	if (Py_UNICODE_MATCH(self, i, substring)) {
4130	    if (maxcount-- <= 0)
4131		break;
4132	    SPLIT_APPEND(self->str, j, i);
4133	    i = j = i + sublen;
4134	} else
4135	    i++;
4136    }
4137    if (j <= len) {
4138	SPLIT_APPEND(self->str, j, len);
4139    }
4140    return list;
4141
4142 onError:
4143    Py_DECREF(list);
4144    return NULL;
4145}
4146
4147#undef SPLIT_APPEND
4148
4149static
4150PyObject *split(PyUnicodeObject *self,
4151		PyUnicodeObject *substring,
4152		int maxcount)
4153{
4154    PyObject *list;
4155
4156    if (maxcount < 0)
4157        maxcount = INT_MAX;
4158
4159    list = PyList_New(0);
4160    if (!list)
4161        return NULL;
4162
4163    if (substring == NULL)
4164	return split_whitespace(self,list,maxcount);
4165
4166    else if (substring->length == 1)
4167	return split_char(self,list,substring->str[0],maxcount);
4168
4169    else if (substring->length == 0) {
4170	Py_DECREF(list);
4171	PyErr_SetString(PyExc_ValueError, "empty separator");
4172	return NULL;
4173    }
4174    else
4175	return split_substring(self,list,substring,maxcount);
4176}
4177
4178static
4179PyObject *replace(PyUnicodeObject *self,
4180		  PyUnicodeObject *str1,
4181		  PyUnicodeObject *str2,
4182		  int maxcount)
4183{
4184    PyUnicodeObject *u;
4185
4186    if (maxcount < 0)
4187	maxcount = INT_MAX;
4188
4189    if (str1->length == 1 && str2->length == 1) {
4190        int i;
4191
4192        /* replace characters */
4193        if (!findchar(self->str, self->length, str1->str[0]) &&
4194            PyUnicode_CheckExact(self)) {
4195            /* nothing to replace, return original string */
4196            Py_INCREF(self);
4197            u = self;
4198        } else {
4199	    Py_UNICODE u1 = str1->str[0];
4200	    Py_UNICODE u2 = str2->str[0];
4201
4202            u = (PyUnicodeObject*) PyUnicode_FromUnicode(
4203                NULL,
4204                self->length
4205                );
4206            if (u != NULL) {
4207		Py_UNICODE_COPY(u->str, self->str,
4208				self->length);
4209                for (i = 0; i < u->length; i++)
4210                    if (u->str[i] == u1) {
4211                        if (--maxcount < 0)
4212                            break;
4213                        u->str[i] = u2;
4214                    }
4215        }
4216        }
4217
4218    } else {
4219        int n, i;
4220        Py_UNICODE *p;
4221
4222        /* replace strings */
4223        n = count(self, 0, self->length, str1);
4224        if (n > maxcount)
4225            n = maxcount;
4226        if (n == 0) {
4227            /* nothing to replace, return original string */
4228            if (PyUnicode_CheckExact(self)) {
4229                Py_INCREF(self);
4230                u = self;
4231            }
4232            else {
4233                u = (PyUnicodeObject *)
4234                    PyUnicode_FromUnicode(self->str, self->length);
4235	    }
4236        } else {
4237            u = _PyUnicode_New(
4238                self->length + n * (str2->length - str1->length));
4239            if (u) {
4240                i = 0;
4241                p = u->str;
4242                if (str1->length > 0) {
4243                    while (i <= self->length - str1->length)
4244                        if (Py_UNICODE_MATCH(self, i, str1)) {
4245                            /* replace string segment */
4246                            Py_UNICODE_COPY(p, str2->str, str2->length);
4247                            p += str2->length;
4248                            i += str1->length;
4249                            if (--n <= 0) {
4250                                /* copy remaining part */
4251                                Py_UNICODE_COPY(p, self->str+i, self->length-i);
4252                                break;
4253                            }
4254                        } else
4255                            *p++ = self->str[i++];
4256                } else {
4257                    while (n > 0) {
4258                        Py_UNICODE_COPY(p, str2->str, str2->length);
4259                        p += str2->length;
4260                        if (--n <= 0)
4261                            break;
4262                        *p++ = self->str[i++];
4263                    }
4264                    Py_UNICODE_COPY(p, self->str+i, self->length-i);
4265                }
4266            }
4267        }
4268    }
4269
4270    return (PyObject *) u;
4271}
4272
4273/* --- Unicode Object Methods --------------------------------------------- */
4274
4275PyDoc_STRVAR(title__doc__,
4276"S.title() -> unicode\n\
4277\n\
4278Return a titlecased version of S, i.e. words start with title case\n\
4279characters, all remaining cased characters have lower case.");
4280
4281static PyObject*
4282unicode_title(PyUnicodeObject *self)
4283{
4284    return fixup(self, fixtitle);
4285}
4286
4287PyDoc_STRVAR(capitalize__doc__,
4288"S.capitalize() -> unicode\n\
4289\n\
4290Return a capitalized version of S, i.e. make the first character\n\
4291have upper case.");
4292
4293static PyObject*
4294unicode_capitalize(PyUnicodeObject *self)
4295{
4296    return fixup(self, fixcapitalize);
4297}
4298
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Compiled out.  Splits self into a list of words, replaces every list
   item by its fixup(..., fixcapitalize) result and joins the list again.
   Returns NULL as soon as any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    int idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                            fixcapitalize);
        if (capitalized == NULL)
            goto done;          /* result is still NULL here */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
4336
4337PyDoc_STRVAR(center__doc__,
4338"S.center(width) -> unicode\n\
4339\n\
4340Return S centered in a Unicode string of length width. Padding is done\n\
4341using spaces.");
4342
4343static PyObject *
4344unicode_center(PyUnicodeObject *self, PyObject *args)
4345{
4346    int marg, left;
4347    int width;
4348
4349    if (!PyArg_ParseTuple(args, "i:center", &width))
4350        return NULL;
4351
4352    if (self->length >= width && PyUnicode_CheckExact(self)) {
4353        Py_INCREF(self);
4354        return (PyObject*) self;
4355    }
4356
4357    marg = width - self->length;
4358    left = marg / 2 + (marg & width & 1);
4359
4360    return (PyObject*) pad(self, left, marg - left, ' ');
4361}
4362
4363#if 0
4364
4365/* This code should go into some future Unicode collation support
4366   module. The basic comparison should compare ordinals on a naive
4367   basis (this is what Java does and thus JPython too). */
4368
4369/* speedy UTF-16 code point order comparison */
4370/* gleaned from: */
4371/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4372
4373static short utf16Fixup[32] =
4374{
4375    0, 0, 0, 0, 0, 0, 0, 0,
4376    0, 0, 0, 0, 0, 0, 0, 0,
4377    0, 0, 0, 0, 0, 0, 0, 0,
4378    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
4379};
4380
4381static int
4382unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4383{
4384    int len1, len2;
4385
4386    Py_UNICODE *s1 = str1->str;
4387    Py_UNICODE *s2 = str2->str;
4388
4389    len1 = str1->length;
4390    len2 = str2->length;
4391
4392    while (len1 > 0 && len2 > 0) {
4393        Py_UNICODE c1, c2;
4394
4395        c1 = *s1++;
4396        c2 = *s2++;
4397
4398	if (c1 > (1<<11) * 26)
4399	    c1 += utf16Fixup[c1>>11];
4400	if (c2 > (1<<11) * 26)
4401            c2 += utf16Fixup[c2>>11];
4402        /* now c1 and c2 are in UTF-32-compatible order */
4403
4404        if (c1 != c2)
4405            return (c1 < c2) ? -1 : 1;
4406
4407        len1--; len2--;
4408    }
4409
4410    return (len1 < len2) ? -1 : (len1 != len2);
4411}
4412
4413#else
4414
4415static int
4416unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4417{
4418    register int len1, len2;
4419
4420    Py_UNICODE *s1 = str1->str;
4421    Py_UNICODE *s2 = str2->str;
4422
4423    len1 = str1->length;
4424    len2 = str2->length;
4425
4426    while (len1 > 0 && len2 > 0) {
4427        Py_UNICODE c1, c2;
4428
4429        c1 = *s1++;
4430        c2 = *s2++;
4431
4432        if (c1 != c2)
4433            return (c1 < c2) ? -1 : 1;
4434
4435        len1--; len2--;
4436    }
4437
4438    return (len1 < len2) ? -1 : (len1 != len2);
4439}
4440
4441#endif
4442
4443int PyUnicode_Compare(PyObject *left,
4444		      PyObject *right)
4445{
4446    PyUnicodeObject *u = NULL, *v = NULL;
4447    int result;
4448
4449    /* Coerce the two arguments */
4450    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4451    if (u == NULL)
4452	goto onError;
4453    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4454    if (v == NULL)
4455	goto onError;
4456
4457    /* Shortcut for empty or interned objects */
4458    if (v == u) {
4459	Py_DECREF(u);
4460	Py_DECREF(v);
4461	return 0;
4462    }
4463
4464    result = unicode_compare(u, v);
4465
4466    Py_DECREF(u);
4467    Py_DECREF(v);
4468    return result;
4469
4470onError:
4471    Py_XDECREF(u);
4472    Py_XDECREF(v);
4473    return -1;
4474}
4475
4476int PyUnicode_Contains(PyObject *container,
4477		       PyObject *element)
4478{
4479    PyUnicodeObject *u = NULL, *v = NULL;
4480    int result, size;
4481    register const Py_UNICODE *lhs, *end, *rhs;
4482
4483    /* Coerce the two arguments */
4484    v = (PyUnicodeObject *)PyUnicode_FromObject(element);
4485    if (v == NULL) {
4486	PyErr_SetString(PyExc_TypeError,
4487	    "'in <string>' requires string as left operand");
4488	goto onError;
4489    }
4490    u = (PyUnicodeObject *)PyUnicode_FromObject(container);
4491    if (u == NULL)
4492	goto onError;
4493
4494    size = PyUnicode_GET_SIZE(v);
4495    rhs = PyUnicode_AS_UNICODE(v);
4496    lhs = PyUnicode_AS_UNICODE(u);
4497
4498    result = 0;
4499    if (size == 1) {
4500	end = lhs + PyUnicode_GET_SIZE(u);
4501	while (lhs < end) {
4502	    if (*lhs++ == *rhs) {
4503		result = 1;
4504		break;
4505	    }
4506	}
4507    }
4508    else {
4509	end = lhs + (PyUnicode_GET_SIZE(u) - size);
4510	while (lhs <= end) {
4511	    if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
4512		result = 1;
4513		break;
4514	    }
4515	}
4516    }
4517
4518    Py_DECREF(u);
4519    Py_DECREF(v);
4520    return result;
4521
4522onError:
4523    Py_XDECREF(u);
4524    Py_XDECREF(v);
4525    return -1;
4526}
4527
4528/* Concat to string or Unicode object giving a new Unicode object. */
4529
4530PyObject *PyUnicode_Concat(PyObject *left,
4531			   PyObject *right)
4532{
4533    PyUnicodeObject *u = NULL, *v = NULL, *w;
4534
4535    /* Coerce the two arguments */
4536    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4537    if (u == NULL)
4538	goto onError;
4539    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4540    if (v == NULL)
4541	goto onError;
4542
4543    /* Shortcuts */
4544    if (v == unicode_empty) {
4545	Py_DECREF(v);
4546	return (PyObject *)u;
4547    }
4548    if (u == unicode_empty) {
4549	Py_DECREF(u);
4550	return (PyObject *)v;
4551    }
4552
4553    /* Concat the two Unicode strings */
4554    w = _PyUnicode_New(u->length + v->length);
4555    if (w == NULL)
4556	goto onError;
4557    Py_UNICODE_COPY(w->str, u->str, u->length);
4558    Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
4559
4560    Py_DECREF(u);
4561    Py_DECREF(v);
4562    return (PyObject *)w;
4563
4564onError:
4565    Py_XDECREF(u);
4566    Py_XDECREF(v);
4567    return NULL;
4568}
4569
4570PyDoc_STRVAR(count__doc__,
4571"S.count(sub[, start[, end]]) -> int\n\
4572\n\
4573Return the number of occurrences of substring sub in Unicode string\n\
4574S[start:end].  Optional arguments start and end are\n\
4575interpreted as in slice notation.");
4576
4577static PyObject *
4578unicode_count(PyUnicodeObject *self, PyObject *args)
4579{
4580    PyUnicodeObject *substring;
4581    int start = 0;
4582    int end = INT_MAX;
4583    PyObject *result;
4584
4585    if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
4586		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4587        return NULL;
4588
4589    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4590						(PyObject *)substring);
4591    if (substring == NULL)
4592	return NULL;
4593
4594    if (start < 0)
4595        start += self->length;
4596    if (start < 0)
4597        start = 0;
4598    if (end > self->length)
4599        end = self->length;
4600    if (end < 0)
4601        end += self->length;
4602    if (end < 0)
4603        end = 0;
4604
4605    result = PyInt_FromLong((long) count(self, start, end, substring));
4606
4607    Py_DECREF(substring);
4608    return result;
4609}
4610
4611PyDoc_STRVAR(encode__doc__,
4612"S.encode([encoding[,errors]]) -> string\n\
4613\n\
4614Return an encoded string version of S. Default encoding is the current\n\
4615default string encoding. errors may be given to set a different error\n\
4616handling scheme. Default is 'strict' meaning that encoding errors raise\n\
4617a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
4618'xmlcharrefreplace' as well as any other name registered with\n\
4619codecs.register_error that can handle UnicodeEncodeErrors.");
4620
4621static PyObject *
4622unicode_encode(PyUnicodeObject *self, PyObject *args)
4623{
4624    char *encoding = NULL;
4625    char *errors = NULL;
4626    if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
4627        return NULL;
4628    return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
4629}
4630
4631PyDoc_STRVAR(expandtabs__doc__,
4632"S.expandtabs([tabsize]) -> unicode\n\
4633\n\
4634Return a copy of S where all tab characters are expanded using spaces.\n\
4635If tabsize is not given, a tab size of 8 characters is assumed.");
4636
4637static PyObject*
4638unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
4639{
4640    Py_UNICODE *e;
4641    Py_UNICODE *p;
4642    Py_UNICODE *q;
4643    int i, j;
4644    PyUnicodeObject *u;
4645    int tabsize = 8;
4646
4647    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
4648	return NULL;
4649
4650    /* First pass: determine size of output string */
4651    i = j = 0;
4652    e = self->str + self->length;
4653    for (p = self->str; p < e; p++)
4654        if (*p == '\t') {
4655	    if (tabsize > 0)
4656		j += tabsize - (j % tabsize);
4657	}
4658        else {
4659            j++;
4660            if (*p == '\n' || *p == '\r') {
4661                i += j;
4662                j = 0;
4663            }
4664        }
4665
4666    /* Second pass: create output string and fill it */
4667    u = _PyUnicode_New(i + j);
4668    if (!u)
4669        return NULL;
4670
4671    j = 0;
4672    q = u->str;
4673
4674    for (p = self->str; p < e; p++)
4675        if (*p == '\t') {
4676	    if (tabsize > 0) {
4677		i = tabsize - (j % tabsize);
4678		j += i;
4679		while (i--)
4680		    *q++ = ' ';
4681	    }
4682	}
4683	else {
4684            j++;
4685	    *q++ = *p;
4686            if (*p == '\n' || *p == '\r')
4687                j = 0;
4688        }
4689
4690    return (PyObject*) u;
4691}
4692
4693PyDoc_STRVAR(find__doc__,
4694"S.find(sub [,start [,end]]) -> int\n\
4695\n\
4696Return the lowest index in S where substring sub is found,\n\
4697such that sub is contained within s[start,end].  Optional\n\
4698arguments start and end are interpreted as in slice notation.\n\
4699\n\
4700Return -1 on failure.");
4701
4702static PyObject *
4703unicode_find(PyUnicodeObject *self, PyObject *args)
4704{
4705    PyUnicodeObject *substring;
4706    int start = 0;
4707    int end = INT_MAX;
4708    PyObject *result;
4709
4710    if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4711		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4712        return NULL;
4713    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4714						(PyObject *)substring);
4715    if (substring == NULL)
4716	return NULL;
4717
4718    result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4719
4720    Py_DECREF(substring);
4721    return result;
4722}
4723
4724static PyObject *
4725unicode_getitem(PyUnicodeObject *self, int index)
4726{
4727    if (index < 0 || index >= self->length) {
4728        PyErr_SetString(PyExc_IndexError, "string index out of range");
4729        return NULL;
4730    }
4731
4732    return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4733}
4734
4735static long
4736unicode_hash(PyUnicodeObject *self)
4737{
4738    /* Since Unicode objects compare equal to their ASCII string
4739       counterparts, they should use the individual character values
4740       as basis for their hash value.  This is needed to assure that
4741       strings and Unicode objects behave in the same way as
4742       dictionary keys. */
4743
4744    register int len;
4745    register Py_UNICODE *p;
4746    register long x;
4747
4748    if (self->hash != -1)
4749	return self->hash;
4750    len = PyUnicode_GET_SIZE(self);
4751    p = PyUnicode_AS_UNICODE(self);
4752    x = *p << 7;
4753    while (--len >= 0)
4754	x = (1000003*x) ^ *p++;
4755    x ^= PyUnicode_GET_SIZE(self);
4756    if (x == -1)
4757	x = -2;
4758    self->hash = x;
4759    return x;
4760}
4761
4762PyDoc_STRVAR(index__doc__,
4763"S.index(sub [,start [,end]]) -> int\n\
4764\n\
4765Like S.find() but raise ValueError when the substring is not found.");
4766
4767static PyObject *
4768unicode_index(PyUnicodeObject *self, PyObject *args)
4769{
4770    int result;
4771    PyUnicodeObject *substring;
4772    int start = 0;
4773    int end = INT_MAX;
4774
4775    if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4776		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4777        return NULL;
4778
4779    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4780						(PyObject *)substring);
4781    if (substring == NULL)
4782	return NULL;
4783
4784    result = findstring(self, substring, start, end, 1);
4785
4786    Py_DECREF(substring);
4787    if (result < 0) {
4788        PyErr_SetString(PyExc_ValueError, "substring not found");
4789        return NULL;
4790    }
4791    return PyInt_FromLong(result);
4792}
4793
4794PyDoc_STRVAR(islower__doc__,
4795"S.islower() -> bool\n\
4796\n\
4797Return True if all cased characters in S are lowercase and there is\n\
4798at least one cased character in S, False otherwise.");
4799
4800static PyObject*
4801unicode_islower(PyUnicodeObject *self)
4802{
4803    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4804    register const Py_UNICODE *e;
4805    int cased;
4806
4807    /* Shortcut for single character strings */
4808    if (PyUnicode_GET_SIZE(self) == 1)
4809	return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
4810
4811    /* Special case for empty strings */
4812    if (PyString_GET_SIZE(self) == 0)
4813	return PyBool_FromLong(0);
4814
4815    e = p + PyUnicode_GET_SIZE(self);
4816    cased = 0;
4817    for (; p < e; p++) {
4818	register const Py_UNICODE ch = *p;
4819
4820	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
4821	    return PyBool_FromLong(0);
4822	else if (!cased && Py_UNICODE_ISLOWER(ch))
4823	    cased = 1;
4824    }
4825    return PyBool_FromLong(cased);
4826}
4827
4828PyDoc_STRVAR(isupper__doc__,
4829"S.isupper() -> bool\n\
4830\n\
4831Return True if  all cased characters in S are uppercase and there is\n\
4832at least one cased character in S, False otherwise.");
4833
4834static PyObject*
4835unicode_isupper(PyUnicodeObject *self)
4836{
4837    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4838    register const Py_UNICODE *e;
4839    int cased;
4840
4841    /* Shortcut for single character strings */
4842    if (PyUnicode_GET_SIZE(self) == 1)
4843	return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
4844
4845    /* Special case for empty strings */
4846    if (PyString_GET_SIZE(self) == 0)
4847	return PyBool_FromLong(0);
4848
4849    e = p + PyUnicode_GET_SIZE(self);
4850    cased = 0;
4851    for (; p < e; p++) {
4852	register const Py_UNICODE ch = *p;
4853
4854	if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
4855	    return PyBool_FromLong(0);
4856	else if (!cased && Py_UNICODE_ISUPPER(ch))
4857	    cased = 1;
4858    }
4859    return PyBool_FromLong(cased);
4860}
4861
4862PyDoc_STRVAR(istitle__doc__,
4863"S.istitle() -> bool\n\
4864\n\
4865Return True if S is a titlecased string, i.e. upper- and titlecase\n\
4866characters may only follow uncased characters and lowercase characters\n\
4867only cased ones. Return False otherwise.");
4868
4869static PyObject*
4870unicode_istitle(PyUnicodeObject *self)
4871{
4872    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4873    register const Py_UNICODE *e;
4874    int cased, previous_is_cased;
4875
4876    /* Shortcut for single character strings */
4877    if (PyUnicode_GET_SIZE(self) == 1)
4878	return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4879			       (Py_UNICODE_ISUPPER(*p) != 0));
4880
4881    /* Special case for empty strings */
4882    if (PyString_GET_SIZE(self) == 0)
4883	return PyBool_FromLong(0);
4884
4885    e = p + PyUnicode_GET_SIZE(self);
4886    cased = 0;
4887    previous_is_cased = 0;
4888    for (; p < e; p++) {
4889	register const Py_UNICODE ch = *p;
4890
4891	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4892	    if (previous_is_cased)
4893		return PyBool_FromLong(0);
4894	    previous_is_cased = 1;
4895	    cased = 1;
4896	}
4897	else if (Py_UNICODE_ISLOWER(ch)) {
4898	    if (!previous_is_cased)
4899		return PyBool_FromLong(0);
4900	    previous_is_cased = 1;
4901	    cased = 1;
4902	}
4903	else
4904	    previous_is_cased = 0;
4905    }
4906    return PyBool_FromLong(cased);
4907}
4908
4909PyDoc_STRVAR(isspace__doc__,
4910"S.isspace() -> bool\n\
4911\n\
4912Return True if there are only whitespace characters in S,\n\
4913False otherwise.");
4914
4915static PyObject*
4916unicode_isspace(PyUnicodeObject *self)
4917{
4918    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4919    register const Py_UNICODE *e;
4920
4921    /* Shortcut for single character strings */
4922    if (PyUnicode_GET_SIZE(self) == 1 &&
4923	Py_UNICODE_ISSPACE(*p))
4924	return PyBool_FromLong(1);
4925
4926    /* Special case for empty strings */
4927    if (PyString_GET_SIZE(self) == 0)
4928	return PyBool_FromLong(0);
4929
4930    e = p + PyUnicode_GET_SIZE(self);
4931    for (; p < e; p++) {
4932	if (!Py_UNICODE_ISSPACE(*p))
4933	    return PyBool_FromLong(0);
4934    }
4935    return PyBool_FromLong(1);
4936}
4937
4938PyDoc_STRVAR(isalpha__doc__,
4939"S.isalpha() -> bool\n\
4940\n\
4941Return True if  all characters in S are alphabetic\n\
4942and there is at least one character in S, False otherwise.");
4943
4944static PyObject*
4945unicode_isalpha(PyUnicodeObject *self)
4946{
4947    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4948    register const Py_UNICODE *e;
4949
4950    /* Shortcut for single character strings */
4951    if (PyUnicode_GET_SIZE(self) == 1 &&
4952	Py_UNICODE_ISALPHA(*p))
4953	return PyBool_FromLong(1);
4954
4955    /* Special case for empty strings */
4956    if (PyString_GET_SIZE(self) == 0)
4957	return PyBool_FromLong(0);
4958
4959    e = p + PyUnicode_GET_SIZE(self);
4960    for (; p < e; p++) {
4961	if (!Py_UNICODE_ISALPHA(*p))
4962	    return PyBool_FromLong(0);
4963    }
4964    return PyBool_FromLong(1);
4965}
4966
4967PyDoc_STRVAR(isalnum__doc__,
4968"S.isalnum() -> bool\n\
4969\n\
4970Return True if  all characters in S are alphanumeric\n\
4971and there is at least one character in S, False otherwise.");
4972
4973static PyObject*
4974unicode_isalnum(PyUnicodeObject *self)
4975{
4976    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4977    register const Py_UNICODE *e;
4978
4979    /* Shortcut for single character strings */
4980    if (PyUnicode_GET_SIZE(self) == 1 &&
4981	Py_UNICODE_ISALNUM(*p))
4982	return PyBool_FromLong(1);
4983
4984    /* Special case for empty strings */
4985    if (PyString_GET_SIZE(self) == 0)
4986	return PyBool_FromLong(0);
4987
4988    e = p + PyUnicode_GET_SIZE(self);
4989    for (; p < e; p++) {
4990	if (!Py_UNICODE_ISALNUM(*p))
4991	    return PyBool_FromLong(0);
4992    }
4993    return PyBool_FromLong(1);
4994}
4995
4996PyDoc_STRVAR(isdecimal__doc__,
4997"S.isdecimal() -> bool\n\
4998\n\
4999Return True if there are only decimal characters in S,\n\
5000False otherwise.");
5001
5002static PyObject*
5003unicode_isdecimal(PyUnicodeObject *self)
5004{
5005    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5006    register const Py_UNICODE *e;
5007
5008    /* Shortcut for single character strings */
5009    if (PyUnicode_GET_SIZE(self) == 1 &&
5010	Py_UNICODE_ISDECIMAL(*p))
5011	return PyBool_FromLong(1);
5012
5013    /* Special case for empty strings */
5014    if (PyString_GET_SIZE(self) == 0)
5015	return PyBool_FromLong(0);
5016
5017    e = p + PyUnicode_GET_SIZE(self);
5018    for (; p < e; p++) {
5019	if (!Py_UNICODE_ISDECIMAL(*p))
5020	    return PyBool_FromLong(0);
5021    }
5022    return PyBool_FromLong(1);
5023}
5024
5025PyDoc_STRVAR(isdigit__doc__,
5026"S.isdigit() -> bool\n\
5027\n\
5028Return True if there are only digit characters in S,\n\
5029False otherwise.");
5030
5031static PyObject*
5032unicode_isdigit(PyUnicodeObject *self)
5033{
5034    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5035    register const Py_UNICODE *e;
5036
5037    /* Shortcut for single character strings */
5038    if (PyUnicode_GET_SIZE(self) == 1 &&
5039	Py_UNICODE_ISDIGIT(*p))
5040	return PyBool_FromLong(1);
5041
5042    /* Special case for empty strings */
5043    if (PyString_GET_SIZE(self) == 0)
5044	return PyBool_FromLong(0);
5045
5046    e = p + PyUnicode_GET_SIZE(self);
5047    for (; p < e; p++) {
5048	if (!Py_UNICODE_ISDIGIT(*p))
5049	    return PyBool_FromLong(0);
5050    }
5051    return PyBool_FromLong(1);
5052}
5053
5054PyDoc_STRVAR(isnumeric__doc__,
5055"S.isnumeric() -> bool\n\
5056\n\
5057Return True if there are only numeric characters in S,\n\
5058False otherwise.");
5059
5060static PyObject*
5061unicode_isnumeric(PyUnicodeObject *self)
5062{
5063    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5064    register const Py_UNICODE *e;
5065
5066    /* Shortcut for single character strings */
5067    if (PyUnicode_GET_SIZE(self) == 1 &&
5068	Py_UNICODE_ISNUMERIC(*p))
5069	return PyBool_FromLong(1);
5070
5071    /* Special case for empty strings */
5072    if (PyString_GET_SIZE(self) == 0)
5073	return PyBool_FromLong(0);
5074
5075    e = p + PyUnicode_GET_SIZE(self);
5076    for (; p < e; p++) {
5077	if (!Py_UNICODE_ISNUMERIC(*p))
5078	    return PyBool_FromLong(0);
5079    }
5080    return PyBool_FromLong(1);
5081}
5082
5083PyDoc_STRVAR(join__doc__,
5084"S.join(sequence) -> unicode\n\
5085\n\
5086Return a string which is the concatenation of the strings in the\n\
5087sequence.  The separator between elements is S.");
5088
5089static PyObject*
5090unicode_join(PyObject *self, PyObject *data)
5091{
5092    return PyUnicode_Join(self, data);
5093}
5094
5095static int
5096unicode_length(PyUnicodeObject *self)
5097{
5098    return self->length;
5099}
5100
5101PyDoc_STRVAR(ljust__doc__,
5102"S.ljust(width) -> unicode\n\
5103\n\
5104Return S left justified in a Unicode string of length width. Padding is\n\
5105done using spaces.");
5106
5107static PyObject *
5108unicode_ljust(PyUnicodeObject *self, PyObject *args)
5109{
5110    int width;
5111    if (!PyArg_ParseTuple(args, "i:ljust", &width))
5112        return NULL;
5113
5114    if (self->length >= width && PyUnicode_CheckExact(self)) {
5115        Py_INCREF(self);
5116        return (PyObject*) self;
5117    }
5118
5119    return (PyObject*) pad(self, 0, width - self->length, ' ');
5120}
5121
5122PyDoc_STRVAR(lower__doc__,
5123"S.lower() -> unicode\n\
5124\n\
5125Return a copy of the string S converted to lowercase.");
5126
5127static PyObject*
5128unicode_lower(PyUnicodeObject *self)
5129{
5130    return fixup(self, fixlower);
5131}
5132
/* Strip modes; the values double as indexes into stripformat[] */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above: argument format string for each mode */
static const char *stripformat[] = {
    "|O:lstrip",        /* LEFTSTRIP */
    "|O:rstrip",        /* RIGHTSTRIP */
    "|O:strip"          /* BOTHSTRIP */
};

/* Method name for a mode: the format string minus its 3-char "|O:" prefix */
#define STRIPNAME(i) (stripformat[i]+3)
5141
5142static const Py_UNICODE *
5143unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
5144{
5145	size_t i;
5146	for (i = 0; i < n; ++i)
5147		if (s[i] == c)
5148			return s+i;
5149	return NULL;
5150}
5151
5152/* externally visible for str.strip(unicode) */
5153PyObject *
5154_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5155{
5156	Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5157	int len = PyUnicode_GET_SIZE(self);
5158	Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
5159	int seplen = PyUnicode_GET_SIZE(sepobj);
5160	int i, j;
5161
5162	i = 0;
5163	if (striptype != RIGHTSTRIP) {
5164		while (i < len && unicode_memchr(sep, s[i], seplen)) {
5165			i++;
5166		}
5167	}
5168
5169	j = len;
5170	if (striptype != LEFTSTRIP) {
5171		do {
5172			j--;
5173		} while (j >= i && unicode_memchr(sep, s[j], seplen));
5174		j++;
5175	}
5176
5177	if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5178		Py_INCREF(self);
5179		return (PyObject*)self;
5180	}
5181	else
5182		return PyUnicode_FromUnicode(s+i, j-i);
5183}
5184
5185
5186static PyObject *
5187do_strip(PyUnicodeObject *self, int striptype)
5188{
5189	Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5190	int len = PyUnicode_GET_SIZE(self), i, j;
5191
5192	i = 0;
5193	if (striptype != RIGHTSTRIP) {
5194		while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5195			i++;
5196		}
5197	}
5198
5199	j = len;
5200	if (striptype != LEFTSTRIP) {
5201		do {
5202			j--;
5203		} while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5204		j++;
5205	}
5206
5207	if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5208		Py_INCREF(self);
5209		return (PyObject*)self;
5210	}
5211	else
5212		return PyUnicode_FromUnicode(s+i, j-i);
5213}
5214
5215
5216static PyObject *
5217do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5218{
5219	PyObject *sep = NULL;
5220
5221	if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5222		return NULL;
5223
5224	if (sep != NULL && sep != Py_None) {
5225		if (PyUnicode_Check(sep))
5226			return _PyUnicode_XStrip(self, striptype, sep);
5227		else if (PyString_Check(sep)) {
5228			PyObject *res;
5229			sep = PyUnicode_FromObject(sep);
5230			if (sep==NULL)
5231				return NULL;
5232			res = _PyUnicode_XStrip(self, striptype, sep);
5233			Py_DECREF(sep);
5234			return res;
5235		}
5236		else {
5237			PyErr_Format(PyExc_TypeError,
5238				     "%s arg must be None, unicode or str",
5239				     STRIPNAME(striptype));
5240			return NULL;
5241		}
5242	}
5243
5244	return do_strip(self, striptype);
5245}
5246
5247
5248PyDoc_STRVAR(strip__doc__,
5249"S.strip([sep]) -> unicode\n\
5250\n\
5251Return a copy of the string S with leading and trailing\n\
5252whitespace removed.\n\
5253If sep is given and not None, remove characters in sep instead.\n\
5254If sep is a str, it will be converted to unicode before stripping");
5255
5256static PyObject *
5257unicode_strip(PyUnicodeObject *self, PyObject *args)
5258{
5259	if (PyTuple_GET_SIZE(args) == 0)
5260		return do_strip(self, BOTHSTRIP); /* Common case */
5261	else
5262		return do_argstrip(self, BOTHSTRIP, args);
5263}
5264
5265
5266PyDoc_STRVAR(lstrip__doc__,
5267"S.lstrip([sep]) -> unicode\n\
5268\n\
5269Return a copy of the string S with leading whitespace removed.\n\
5270If sep is given and not None, remove characters in sep instead.\n\
5271If sep is a str, it will be converted to unicode before stripping");
5272
5273static PyObject *
5274unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5275{
5276	if (PyTuple_GET_SIZE(args) == 0)
5277		return do_strip(self, LEFTSTRIP); /* Common case */
5278	else
5279		return do_argstrip(self, LEFTSTRIP, args);
5280}
5281
5282
5283PyDoc_STRVAR(rstrip__doc__,
5284"S.rstrip([sep]) -> unicode\n\
5285\n\
5286Return a copy of the string S with trailing whitespace removed.\n\
5287If sep is given and not None, remove characters in sep instead.\n\
5288If sep is a str, it will be converted to unicode before stripping");
5289
5290static PyObject *
5291unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5292{
5293	if (PyTuple_GET_SIZE(args) == 0)
5294		return do_strip(self, RIGHTSTRIP); /* Common case */
5295	else
5296		return do_argstrip(self, RIGHTSTRIP, args);
5297}
5298
5299
5300static PyObject*
5301unicode_repeat(PyUnicodeObject *str, int len)
5302{
5303    PyUnicodeObject *u;
5304    Py_UNICODE *p;
5305    int nchars;
5306    size_t nbytes;
5307
5308    if (len < 0)
5309        len = 0;
5310
5311    if (len == 1 && PyUnicode_CheckExact(str)) {
5312        /* no repeat, return original string */
5313        Py_INCREF(str);
5314        return (PyObject*) str;
5315    }
5316
5317    /* ensure # of chars needed doesn't overflow int and # of bytes
5318     * needed doesn't overflow size_t
5319     */
5320    nchars = len * str->length;
5321    if (len && nchars / len != str->length) {
5322        PyErr_SetString(PyExc_OverflowError,
5323                        "repeated string is too long");
5324        return NULL;
5325    }
5326    nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5327    if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5328        PyErr_SetString(PyExc_OverflowError,
5329                        "repeated string is too long");
5330        return NULL;
5331    }
5332    u = _PyUnicode_New(nchars);
5333    if (!u)
5334        return NULL;
5335
5336    p = u->str;
5337
5338    while (len-- > 0) {
5339        Py_UNICODE_COPY(p, str->str, str->length);
5340        p += str->length;
5341    }
5342
5343    return (PyObject*) u;
5344}
5345
5346PyObject *PyUnicode_Replace(PyObject *obj,
5347			    PyObject *subobj,
5348			    PyObject *replobj,
5349			    int maxcount)
5350{
5351    PyObject *self;
5352    PyObject *str1;
5353    PyObject *str2;
5354    PyObject *result;
5355
5356    self = PyUnicode_FromObject(obj);
5357    if (self == NULL)
5358	return NULL;
5359    str1 = PyUnicode_FromObject(subobj);
5360    if (str1 == NULL) {
5361	Py_DECREF(self);
5362	return NULL;
5363    }
5364    str2 = PyUnicode_FromObject(replobj);
5365    if (str2 == NULL) {
5366	Py_DECREF(self);
5367	Py_DECREF(str1);
5368	return NULL;
5369    }
5370    result = replace((PyUnicodeObject *)self,
5371		     (PyUnicodeObject *)str1,
5372		     (PyUnicodeObject *)str2,
5373		     maxcount);
5374    Py_DECREF(self);
5375    Py_DECREF(str1);
5376    Py_DECREF(str2);
5377    return result;
5378}
5379
5380PyDoc_STRVAR(replace__doc__,
5381"S.replace (old, new[, maxsplit]) -> unicode\n\
5382\n\
5383Return a copy of S with all occurrences of substring\n\
5384old replaced by new.  If the optional argument maxsplit is\n\
5385given, only the first maxsplit occurrences are replaced.");
5386
5387static PyObject*
5388unicode_replace(PyUnicodeObject *self, PyObject *args)
5389{
5390    PyUnicodeObject *str1;
5391    PyUnicodeObject *str2;
5392    int maxcount = -1;
5393    PyObject *result;
5394
5395    if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
5396        return NULL;
5397    str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5398    if (str1 == NULL)
5399	return NULL;
5400    str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
5401    if (str2 == NULL) {
5402	Py_DECREF(str1);
5403	return NULL;
5404    }
5405
5406    result = replace(self, str1, str2, maxcount);
5407
5408    Py_DECREF(str1);
5409    Py_DECREF(str2);
5410    return result;
5411}
5412
5413static
5414PyObject *unicode_repr(PyObject *unicode)
5415{
5416    return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5417				PyUnicode_GET_SIZE(unicode),
5418				1);
5419}
5420
5421PyDoc_STRVAR(rfind__doc__,
5422"S.rfind(sub [,start [,end]]) -> int\n\
5423\n\
5424Return the highest index in S where substring sub is found,\n\
5425such that sub is contained within s[start,end].  Optional\n\
5426arguments start and end are interpreted as in slice notation.\n\
5427\n\
5428Return -1 on failure.");
5429
5430static PyObject *
5431unicode_rfind(PyUnicodeObject *self, PyObject *args)
5432{
5433    PyUnicodeObject *substring;
5434    int start = 0;
5435    int end = INT_MAX;
5436    PyObject *result;
5437
5438    if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
5439		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5440        return NULL;
5441    substring = (PyUnicodeObject *)PyUnicode_FromObject(
5442						(PyObject *)substring);
5443    if (substring == NULL)
5444	return NULL;
5445
5446    result = PyInt_FromLong(findstring(self, substring, start, end, -1));
5447
5448    Py_DECREF(substring);
5449    return result;
5450}
5451
5452PyDoc_STRVAR(rindex__doc__,
5453"S.rindex(sub [,start [,end]]) -> int\n\
5454\n\
5455Like S.rfind() but raise ValueError when the substring is not found.");
5456
5457static PyObject *
5458unicode_rindex(PyUnicodeObject *self, PyObject *args)
5459{
5460    int result;
5461    PyUnicodeObject *substring;
5462    int start = 0;
5463    int end = INT_MAX;
5464
5465    if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
5466		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5467        return NULL;
5468    substring = (PyUnicodeObject *)PyUnicode_FromObject(
5469						(PyObject *)substring);
5470    if (substring == NULL)
5471	return NULL;
5472
5473    result = findstring(self, substring, start, end, -1);
5474
5475    Py_DECREF(substring);
5476    if (result < 0) {
5477        PyErr_SetString(PyExc_ValueError, "substring not found");
5478        return NULL;
5479    }
5480    return PyInt_FromLong(result);
5481}
5482
5483PyDoc_STRVAR(rjust__doc__,
5484"S.rjust(width) -> unicode\n\
5485\n\
5486Return S right justified in a Unicode string of length width. Padding is\n\
5487done using spaces.");
5488
5489static PyObject *
5490unicode_rjust(PyUnicodeObject *self, PyObject *args)
5491{
5492    int width;
5493    if (!PyArg_ParseTuple(args, "i:rjust", &width))
5494        return NULL;
5495
5496    if (self->length >= width && PyUnicode_CheckExact(self)) {
5497        Py_INCREF(self);
5498        return (PyObject*) self;
5499    }
5500
5501    return (PyObject*) pad(self, width - self->length, 0, ' ');
5502}
5503
5504static PyObject*
5505unicode_slice(PyUnicodeObject *self, int start, int end)
5506{
5507    /* standard clamping */
5508    if (start < 0)
5509        start = 0;
5510    if (end < 0)
5511        end = 0;
5512    if (end > self->length)
5513        end = self->length;
5514    if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
5515        /* full slice, return original string */
5516        Py_INCREF(self);
5517        return (PyObject*) self;
5518    }
5519    if (start > end)
5520        start = end;
5521    /* copy slice */
5522    return (PyObject*) PyUnicode_FromUnicode(self->str + start,
5523					     end - start);
5524}
5525
5526PyObject *PyUnicode_Split(PyObject *s,
5527			  PyObject *sep,
5528			  int maxsplit)
5529{
5530    PyObject *result;
5531
5532    s = PyUnicode_FromObject(s);
5533    if (s == NULL)
5534	return NULL;
5535    if (sep != NULL) {
5536	sep = PyUnicode_FromObject(sep);
5537	if (sep == NULL) {
5538	    Py_DECREF(s);
5539	    return NULL;
5540	}
5541    }
5542
5543    result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
5544
5545    Py_DECREF(s);
5546    Py_XDECREF(sep);
5547    return result;
5548}
5549
5550PyDoc_STRVAR(split__doc__,
5551"S.split([sep [,maxsplit]]) -> list of strings\n\
5552\n\
5553Return a list of the words in S, using sep as the\n\
5554delimiter string.  If maxsplit is given, at most maxsplit\n\
5555splits are done. If sep is not specified, any whitespace string\n\
5556is a separator.");
5557
5558static PyObject*
5559unicode_split(PyUnicodeObject *self, PyObject *args)
5560{
5561    PyObject *substring = Py_None;
5562    int maxcount = -1;
5563
5564    if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
5565        return NULL;
5566
5567    if (substring == Py_None)
5568	return split(self, NULL, maxcount);
5569    else if (PyUnicode_Check(substring))
5570	return split(self, (PyUnicodeObject *)substring, maxcount);
5571    else
5572	return PyUnicode_Split((PyObject *)self, substring, maxcount);
5573}
5574
5575PyDoc_STRVAR(splitlines__doc__,
5576"S.splitlines([keepends]]) -> list of strings\n\
5577\n\
5578Return a list of the lines in S, breaking at line boundaries.\n\
5579Line breaks are not included in the resulting list unless keepends\n\
5580is given and true.");
5581
5582static PyObject*
5583unicode_splitlines(PyUnicodeObject *self, PyObject *args)
5584{
5585    int keepends = 0;
5586
5587    if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
5588        return NULL;
5589
5590    return PyUnicode_Splitlines((PyObject *)self, keepends);
5591}
5592
5593static
5594PyObject *unicode_str(PyUnicodeObject *self)
5595{
5596    return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
5597}
5598
5599PyDoc_STRVAR(swapcase__doc__,
5600"S.swapcase() -> unicode\n\
5601\n\
5602Return a copy of S with uppercase characters converted to lowercase\n\
5603and vice versa.");
5604
5605static PyObject*
5606unicode_swapcase(PyUnicodeObject *self)
5607{
5608    return fixup(self, fixswapcase);
5609}
5610
5611PyDoc_STRVAR(translate__doc__,
5612"S.translate(table) -> unicode\n\
5613\n\
5614Return a copy of the string S, where all characters have been mapped\n\
5615through the given translation table, which must be a mapping of\n\
5616Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
5617Unmapped characters are left untouched. Characters mapped to None\n\
5618are deleted.");
5619
5620static PyObject*
5621unicode_translate(PyUnicodeObject *self, PyObject *table)
5622{
5623    return PyUnicode_TranslateCharmap(self->str,
5624				      self->length,
5625				      table,
5626				      "ignore");
5627}
5628
5629PyDoc_STRVAR(upper__doc__,
5630"S.upper() -> unicode\n\
5631\n\
5632Return a copy of S converted to uppercase.");
5633
5634static PyObject*
5635unicode_upper(PyUnicodeObject *self)
5636{
5637    return fixup(self, fixupper);
5638}
5639
5640PyDoc_STRVAR(zfill__doc__,
5641"S.zfill(width) -> unicode\n\
5642\n\
5643Pad a numeric string x with zeros on the left, to fill a field\n\
5644of the specified width. The string x is never truncated.");
5645
5646static PyObject *
5647unicode_zfill(PyUnicodeObject *self, PyObject *args)
5648{
5649    int fill;
5650    PyUnicodeObject *u;
5651
5652    int width;
5653    if (!PyArg_ParseTuple(args, "i:zfill", &width))
5654        return NULL;
5655
5656    if (self->length >= width) {
5657        if (PyUnicode_CheckExact(self)) {
5658            Py_INCREF(self);
5659            return (PyObject*) self;
5660        }
5661        else
5662            return PyUnicode_FromUnicode(
5663                PyUnicode_AS_UNICODE(self),
5664                PyUnicode_GET_SIZE(self)
5665            );
5666    }
5667
5668    fill = width - self->length;
5669
5670    u = pad(self, fill, 0, '0');
5671
5672    if (u == NULL)
5673        return NULL;
5674
5675    if (u->str[fill] == '+' || u->str[fill] == '-') {
5676        /* move sign to beginning of string */
5677        u->str[0] = u->str[fill];
5678        u->str[fill] = '0';
5679    }
5680
5681    return (PyObject*) u;
5682}
5683
#if 0
/* Compiled out.  Returns the current value of unicode_freelist_size as a
   Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *size_obj = PyInt_FromLong(unicode_freelist_size);

    return size_obj;
}
#endif
5691
5692PyDoc_STRVAR(startswith__doc__,
5693"S.startswith(prefix[, start[, end]]) -> bool\n\
5694\n\
5695Return True if S starts with the specified prefix, False otherwise.  With\n\
5696optional start, test S beginning at that position.  With optional end, stop\n\
5697comparing S at that position.");
5698
5699static PyObject *
5700unicode_startswith(PyUnicodeObject *self,
5701		   PyObject *args)
5702{
5703    PyUnicodeObject *substring;
5704    int start = 0;
5705    int end = INT_MAX;
5706    PyObject *result;
5707
5708    if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
5709		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5710	return NULL;
5711    substring = (PyUnicodeObject *)PyUnicode_FromObject(
5712						(PyObject *)substring);
5713    if (substring == NULL)
5714	return NULL;
5715
5716    result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
5717
5718    Py_DECREF(substring);
5719    return result;
5720}
5721
5722
5723PyDoc_STRVAR(endswith__doc__,
5724"S.endswith(suffix[, start[, end]]) -> bool\n\
5725\n\
5726Return True if S ends with the specified suffix, False otherwise.  With\n\
5727optional start, test S beginning at that position.  With optional end, stop\n\
5728comparing S at that position.");
5729
5730static PyObject *
5731unicode_endswith(PyUnicodeObject *self,
5732		 PyObject *args)
5733{
5734    PyUnicodeObject *substring;
5735    int start = 0;
5736    int end = INT_MAX;
5737    PyObject *result;
5738
5739    if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
5740		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5741	return NULL;
5742    substring = (PyUnicodeObject *)PyUnicode_FromObject(
5743						(PyObject *)substring);
5744    if (substring == NULL)
5745	return NULL;
5746
5747    result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
5748
5749    Py_DECREF(substring);
5750    return result;
5751}
5752
5753
5754
5755static PyObject *
5756unicode_getnewargs(PyUnicodeObject *v)
5757{
5758	return Py_BuildValue("(u#)", v->str, v->length);
5759}
5760
5761
/* Method table for the unicode type.  Each entry gives the Python-level
   name, the C implementation, its calling convention (METH_VARARGS,
   METH_O or METH_NOARGS) and, where one exists, its docstring.  The table
   ends with a {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
#endif

    /* Registered without a docstring. */
    {"__getnewargs__",	(PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}  /* sentinel */
};
5815
5816static PyObject *
5817unicode_mod(PyObject *v, PyObject *w)
5818{
5819       if (!PyUnicode_Check(v)) {
5820               Py_INCREF(Py_NotImplemented);
5821               return Py_NotImplemented;
5822       }
5823       return PyUnicode_Format(v, w);
5824}
5825
/* Number protocol slots for unicode.  Only nb_remainder is filled in
   (the % operator, handled by unicode_mod); the slots after it are not
   listed in this initializer. */
static PyNumberMethods unicode_as_number = {
	0,				/*nb_add*/
	0,				/*nb_subtract*/
	0,				/*nb_multiply*/
	0,				/*nb_divide*/
	unicode_mod,			/*nb_remainder*/
};
5833
/* Sequence protocol slots for unicode.  The item and slice assignment
   slots are 0, so no assignment handlers are installed through this
   table. */
static PySequenceMethods unicode_as_sequence = {
    (inquiry) unicode_length, 		/* sq_length */
    (binaryfunc) PyUnicode_Concat, 	/* sq_concat */
    (intargfunc) unicode_repeat, 	/* sq_repeat */
    (intargfunc) unicode_getitem, 	/* sq_item */
    (intintargfunc) unicode_slice, 	/* sq_slice */
    0, 					/* sq_ass_item */
    0, 					/* sq_ass_slice */
    (objobjproc)PyUnicode_Contains, 	/*sq_contains*/
};
5844
5845static PyObject*
5846unicode_subscript(PyUnicodeObject* self, PyObject* item)
5847{
5848    if (PyInt_Check(item)) {
5849        long i = PyInt_AS_LONG(item);
5850        if (i < 0)
5851            i += PyString_GET_SIZE(self);
5852        return unicode_getitem(self, i);
5853    } else if (PyLong_Check(item)) {
5854        long i = PyLong_AsLong(item);
5855        if (i == -1 && PyErr_Occurred())
5856            return NULL;
5857        if (i < 0)
5858            i += PyString_GET_SIZE(self);
5859        return unicode_getitem(self, i);
5860    } else if (PySlice_Check(item)) {
5861        int start, stop, step, slicelength, cur, i;
5862        Py_UNICODE* source_buf;
5863        Py_UNICODE* result_buf;
5864        PyObject* result;
5865
5866        if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self),
5867				 &start, &stop, &step, &slicelength) < 0) {
5868            return NULL;
5869        }
5870
5871        if (slicelength <= 0) {
5872            return PyUnicode_FromUnicode(NULL, 0);
5873        } else {
5874            source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
5875            result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
5876
5877            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
5878                result_buf[i] = source_buf[cur];
5879            }
5880
5881            result = PyUnicode_FromUnicode(result_buf, slicelength);
5882            PyMem_FREE(result_buf);
5883            return result;
5884        }
5885    } else {
5886        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
5887        return NULL;
5888    }
5889}
5890
/* Mapping protocol slots for unicode: length and subscript (the latter
   handled by unicode_subscript).  The assignment slot is 0, so no
   subscript-assignment handler is installed. */
static PyMappingMethods unicode_as_mapping = {
    (inquiry)unicode_length,		/* mp_length */
    (binaryfunc)unicode_subscript,	/* mp_subscript */
    (objobjargproc)0,			/* mp_ass_subscript */
};
5896
5897static int
5898unicode_buffer_getreadbuf(PyUnicodeObject *self,
5899			  int index,
5900			  const void **ptr)
5901{
5902    if (index != 0) {
5903        PyErr_SetString(PyExc_SystemError,
5904			"accessing non-existent unicode segment");
5905        return -1;
5906    }
5907    *ptr = (void *) self->str;
5908    return PyUnicode_GET_DATA_SIZE(self);
5909}
5910
5911static int
5912unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
5913			   const void **ptr)
5914{
5915    PyErr_SetString(PyExc_TypeError,
5916		    "cannot use unicode as modifiable buffer");
5917    return -1;
5918}
5919
5920static int
5921unicode_buffer_getsegcount(PyUnicodeObject *self,
5922			   int *lenp)
5923{
5924    if (lenp)
5925        *lenp = PyUnicode_GET_DATA_SIZE(self);
5926    return 1;
5927}
5928
5929static int
5930unicode_buffer_getcharbuf(PyUnicodeObject *self,
5931			  int index,
5932			  const void **ptr)
5933{
5934    PyObject *str;
5935
5936    if (index != 0) {
5937        PyErr_SetString(PyExc_SystemError,
5938			"accessing non-existent unicode segment");
5939        return -1;
5940    }
5941    str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
5942    if (str == NULL)
5943	return -1;
5944    *ptr = (void *) PyString_AS_STRING(str);
5945    return PyString_GET_SIZE(str);
5946}
5947
5948/* Helpers for PyUnicode_Format() */
5949
5950static PyObject *
5951getnextarg(PyObject *args, int arglen, int *p_argidx)
5952{
5953    int argidx = *p_argidx;
5954    if (argidx < arglen) {
5955	(*p_argidx)++;
5956	if (arglen < 0)
5957	    return args;
5958	else
5959	    return PyTuple_GetItem(args, argidx);
5960    }
5961    PyErr_SetString(PyExc_TypeError,
5962		    "not enough arguments for format string");
5963    return NULL;
5964}
5965
/* Bit flags collected while parsing a conversion specification in
   PyUnicode_Format(); each is set by one flag character. */
#define F_LJUST (1<<0)	/* '-' (also set for a negative '*' width) */
#define F_SIGN	(1<<1)	/* '+' */
#define F_BLANK (1<<2)	/* ' ' */
#define F_ALT	(1<<3)	/* '#' */
#define F_ZERO	(1<<4)	/* '0' */
5971
5972static
5973int usprintf(register Py_UNICODE *buffer, char *format, ...)
5974{
5975    register int i;
5976    int len;
5977    va_list va;
5978    char *charbuffer;
5979    va_start(va, format);
5980
5981    /* First, format the string as char array, then expand to Py_UNICODE
5982       array. */
5983    charbuffer = (char *)buffer;
5984    len = vsprintf(charbuffer, format, va);
5985    for (i = len - 1; i >= 0; i--)
5986	buffer[i] = (Py_UNICODE) charbuffer[i];
5987
5988    va_end(va);
5989    return len;
5990}
5991
5992/* XXX To save some code duplication, formatfloat/long/int could have been
5993   shared with stringobject.c, converting from 8-bit to Unicode after the
5994   formatting is done. */
5995
5996static int
5997formatfloat(Py_UNICODE *buf,
5998	    size_t buflen,
5999	    int flags,
6000	    int prec,
6001	    int type,
6002	    PyObject *v)
6003{
6004    /* fmt = '%#.' + `prec` + `type`
6005       worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
6006    char fmt[20];
6007    double x;
6008
6009    x = PyFloat_AsDouble(v);
6010    if (x == -1.0 && PyErr_Occurred())
6011	return -1;
6012    if (prec < 0)
6013	prec = 6;
6014    if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6015	type = 'g';
6016    /* Worst case length calc to ensure no buffer overrun:
6017
6018       'g' formats:
6019	 fmt = %#.<prec>g
6020	 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6021	    for any double rep.)
6022	 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6023
6024       'f' formats:
6025	 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6026	 len = 1 + 50 + 1 + prec = 52 + prec
6027
6028       If prec=0 the effective precision is 1 (the leading digit is
6029       always given), therefore increase the length by one.
6030
6031    */
6032    if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6033	(type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
6034	PyErr_SetString(PyExc_OverflowError,
6035			"formatted float is too long (precision too large?)");
6036	return -1;
6037    }
6038    PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6039		  (flags&F_ALT) ? "#" : "",
6040		  prec, type);
6041    return usprintf(buf, fmt, x);
6042}
6043
6044static PyObject*
6045formatlong(PyObject *val, int flags, int prec, int type)
6046{
6047	char *buf;
6048	int i, len;
6049	PyObject *str; /* temporary string object. */
6050	PyUnicodeObject *result;
6051
6052	str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6053	if (!str)
6054		return NULL;
6055	result = _PyUnicode_New(len);
6056	for (i = 0; i < len; i++)
6057		result->str[i] = buf[i];
6058	result->str[len] = 0;
6059	Py_DECREF(str);
6060	return (PyObject*)result;
6061}
6062
6063static int
6064formatint(Py_UNICODE *buf,
6065	  size_t buflen,
6066	  int flags,
6067	  int prec,
6068	  int type,
6069	  PyObject *v)
6070{
6071    /* fmt = '%#.' + `prec` + 'l' + `type`
6072     * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6073     *                     + 1 + 1
6074     *                   = 24
6075     */
6076    char fmt[64]; /* plenty big enough! */
6077    long x;
6078
6079    x = PyInt_AsLong(v);
6080    if (x == -1 && PyErr_Occurred())
6081        return -1;
6082    if (x < 0 && type != 'd' && type != 'i') {
6083	if (PyErr_Warn(PyExc_FutureWarning,
6084		       "%u/%o/%x/%X of negative int will return "
6085		       "a signed string in Python 2.4 and up") < 0)
6086	    return -1;
6087    }
6088    if (prec < 0)
6089        prec = 1;
6090
6091    /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
6092     * worst case buf = '0x' + [0-9]*prec, where prec >= 11
6093     */
6094    if (buflen <= 13 || buflen <= (size_t)2 + (size_t)prec) {
6095        PyErr_SetString(PyExc_OverflowError,
6096    	        "formatted integer is too long (precision too large?)");
6097        return -1;
6098    }
6099
6100    if ((flags & F_ALT) &&
6101        (type == 'x' || type == 'X')) {
6102        /* When converting under %#x or %#X, there are a number
6103         * of issues that cause pain:
6104         * - when 0 is being converted, the C standard leaves off
6105         *   the '0x' or '0X', which is inconsistent with other
6106         *   %#x/%#X conversions and inconsistent with Python's
6107         *   hex() function
6108         * - there are platforms that violate the standard and
6109         *   convert 0 with the '0x' or '0X'
6110         *   (Metrowerks, Compaq Tru64)
6111         * - there are platforms that give '0x' when converting
6112         *   under %#X, but convert 0 in accordance with the
6113         *   standard (OS/2 EMX)
6114         *
6115         * We can achieve the desired consistency by inserting our
6116         * own '0x' or '0X' prefix, and substituting %x/%X in place
6117         * of %#x/%#X.
6118         *
6119         * Note that this is the same approach as used in
6120         * formatint() in stringobject.c
6121         */
6122        PyOS_snprintf(fmt, sizeof(fmt), "0%c%%.%dl%c",
6123                      type, prec, type);
6124    }
6125    else {
6126        PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c",
6127                      (flags&F_ALT) ? "#" : "",
6128                      prec, type);
6129    }
6130    return usprintf(buf, fmt, x);
6131}
6132
6133static int
6134formatchar(Py_UNICODE *buf,
6135           size_t buflen,
6136           PyObject *v)
6137{
6138    /* presume that the buffer is at least 2 characters long */
6139    if (PyUnicode_Check(v)) {
6140	if (PyUnicode_GET_SIZE(v) != 1)
6141	    goto onError;
6142	buf[0] = PyUnicode_AS_UNICODE(v)[0];
6143    }
6144
6145    else if (PyString_Check(v)) {
6146	if (PyString_GET_SIZE(v) != 1)
6147	    goto onError;
6148	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6149    }
6150
6151    else {
6152	/* Integer input truncated to a character */
6153        long x;
6154	x = PyInt_AsLong(v);
6155	if (x == -1 && PyErr_Occurred())
6156	    goto onError;
6157#ifdef Py_UNICODE_WIDE
6158	if (x < 0 || x > 0x10ffff) {
6159	    PyErr_SetString(PyExc_ValueError,
6160			    "%c arg not in range(0x110000) "
6161			    "(wide Python build)");
6162	    return -1;
6163	}
6164#else
6165	if (x < 0 || x > 0xffff) {
6166	    PyErr_SetString(PyExc_ValueError,
6167			    "%c arg not in range(0x10000) "
6168			    "(narrow Python build)");
6169	    return -1;
6170	}
6171#endif
6172	buf[0] = (Py_UNICODE) x;
6173    }
6174    buf[1] = '\0';
6175    return 1;
6176
6177 onError:
6178    PyErr_SetString(PyExc_TypeError,
6179		    "%c requires int or char");
6180    return -1;
6181}
6182
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length, in Py_UNICODE elements, of the buffer in
   which the floats, ints, & chars are formatted. XXX This is a magic
   number. Each formatting routine is expected to do its own bounds
   checking against this length to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.
*/
#define FORMATBUFLEN (size_t)120
6192
6193PyObject *PyUnicode_Format(PyObject *format,
6194			   PyObject *args)
6195{
6196    Py_UNICODE *fmt, *res;
6197    int fmtcnt, rescnt, reslen, arglen, argidx;
6198    int args_owned = 0;
6199    PyUnicodeObject *result = NULL;
6200    PyObject *dict = NULL;
6201    PyObject *uformat;
6202
6203    if (format == NULL || args == NULL) {
6204	PyErr_BadInternalCall();
6205	return NULL;
6206    }
6207    uformat = PyUnicode_FromObject(format);
6208    if (uformat == NULL)
6209	return NULL;
6210    fmt = PyUnicode_AS_UNICODE(uformat);
6211    fmtcnt = PyUnicode_GET_SIZE(uformat);
6212
6213    reslen = rescnt = fmtcnt + 100;
6214    result = _PyUnicode_New(reslen);
6215    if (result == NULL)
6216	goto onError;
6217    res = PyUnicode_AS_UNICODE(result);
6218
6219    if (PyTuple_Check(args)) {
6220	arglen = PyTuple_Size(args);
6221	argidx = 0;
6222    }
6223    else {
6224	arglen = -1;
6225	argidx = -2;
6226    }
6227    if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6228        !PyObject_TypeCheck(args, &PyBaseString_Type))
6229	dict = args;
6230
6231    while (--fmtcnt >= 0) {
6232	if (*fmt != '%') {
6233	    if (--rescnt < 0) {
6234		rescnt = fmtcnt + 100;
6235		reslen += rescnt;
6236		if (_PyUnicode_Resize(&result, reslen) < 0)
6237		    return NULL;
6238		res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6239		--rescnt;
6240	    }
6241	    *res++ = *fmt++;
6242	}
6243	else {
6244	    /* Got a format specifier */
6245	    int flags = 0;
6246	    int width = -1;
6247	    int prec = -1;
6248	    Py_UNICODE c = '\0';
6249	    Py_UNICODE fill;
6250	    PyObject *v = NULL;
6251	    PyObject *temp = NULL;
6252	    Py_UNICODE *pbuf;
6253	    Py_UNICODE sign;
6254	    int len;
6255	    Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
6256
6257	    fmt++;
6258	    if (*fmt == '(') {
6259		Py_UNICODE *keystart;
6260		int keylen;
6261		PyObject *key;
6262		int pcount = 1;
6263
6264		if (dict == NULL) {
6265		    PyErr_SetString(PyExc_TypeError,
6266				    "format requires a mapping");
6267		    goto onError;
6268		}
6269		++fmt;
6270		--fmtcnt;
6271		keystart = fmt;
6272		/* Skip over balanced parentheses */
6273		while (pcount > 0 && --fmtcnt >= 0) {
6274		    if (*fmt == ')')
6275			--pcount;
6276		    else if (*fmt == '(')
6277			++pcount;
6278		    fmt++;
6279		}
6280		keylen = fmt - keystart - 1;
6281		if (fmtcnt < 0 || pcount > 0) {
6282		    PyErr_SetString(PyExc_ValueError,
6283				    "incomplete format key");
6284		    goto onError;
6285		}
6286#if 0
6287		/* keys are converted to strings using UTF-8 and
6288		   then looked up since Python uses strings to hold
6289		   variables names etc. in its namespaces and we
6290		   wouldn't want to break common idioms. */
6291		key = PyUnicode_EncodeUTF8(keystart,
6292					   keylen,
6293					   NULL);
6294#else
6295		key = PyUnicode_FromUnicode(keystart, keylen);
6296#endif
6297		if (key == NULL)
6298		    goto onError;
6299		if (args_owned) {
6300		    Py_DECREF(args);
6301		    args_owned = 0;
6302		}
6303		args = PyObject_GetItem(dict, key);
6304		Py_DECREF(key);
6305		if (args == NULL) {
6306		    goto onError;
6307		}
6308		args_owned = 1;
6309		arglen = -1;
6310		argidx = -2;
6311	    }
6312	    while (--fmtcnt >= 0) {
6313		switch (c = *fmt++) {
6314		case '-': flags |= F_LJUST; continue;
6315		case '+': flags |= F_SIGN; continue;
6316		case ' ': flags |= F_BLANK; continue;
6317		case '#': flags |= F_ALT; continue;
6318		case '0': flags |= F_ZERO; continue;
6319		}
6320		break;
6321	    }
6322	    if (c == '*') {
6323		v = getnextarg(args, arglen, &argidx);
6324		if (v == NULL)
6325		    goto onError;
6326		if (!PyInt_Check(v)) {
6327		    PyErr_SetString(PyExc_TypeError,
6328				    "* wants int");
6329		    goto onError;
6330		}
6331		width = PyInt_AsLong(v);
6332		if (width < 0) {
6333		    flags |= F_LJUST;
6334		    width = -width;
6335		}
6336		if (--fmtcnt >= 0)
6337		    c = *fmt++;
6338	    }
6339	    else if (c >= '0' && c <= '9') {
6340		width = c - '0';
6341		while (--fmtcnt >= 0) {
6342		    c = *fmt++;
6343		    if (c < '0' || c > '9')
6344			break;
6345		    if ((width*10) / 10 != width) {
6346			PyErr_SetString(PyExc_ValueError,
6347					"width too big");
6348			goto onError;
6349		    }
6350		    width = width*10 + (c - '0');
6351		}
6352	    }
6353	    if (c == '.') {
6354		prec = 0;
6355		if (--fmtcnt >= 0)
6356		    c = *fmt++;
6357		if (c == '*') {
6358		    v = getnextarg(args, arglen, &argidx);
6359		    if (v == NULL)
6360			goto onError;
6361		    if (!PyInt_Check(v)) {
6362			PyErr_SetString(PyExc_TypeError,
6363					"* wants int");
6364			goto onError;
6365		    }
6366		    prec = PyInt_AsLong(v);
6367		    if (prec < 0)
6368			prec = 0;
6369		    if (--fmtcnt >= 0)
6370			c = *fmt++;
6371		}
6372		else if (c >= '0' && c <= '9') {
6373		    prec = c - '0';
6374		    while (--fmtcnt >= 0) {
6375			c = Py_CHARMASK(*fmt++);
6376			if (c < '0' || c > '9')
6377			    break;
6378			if ((prec*10) / 10 != prec) {
6379			    PyErr_SetString(PyExc_ValueError,
6380					    "prec too big");
6381			    goto onError;
6382			}
6383			prec = prec*10 + (c - '0');
6384		    }
6385		}
6386	    } /* prec */
6387	    if (fmtcnt >= 0) {
6388		if (c == 'h' || c == 'l' || c == 'L') {
6389		    if (--fmtcnt >= 0)
6390			c = *fmt++;
6391		}
6392	    }
6393	    if (fmtcnt < 0) {
6394		PyErr_SetString(PyExc_ValueError,
6395				"incomplete format");
6396		goto onError;
6397	    }
6398	    if (c != '%') {
6399		v = getnextarg(args, arglen, &argidx);
6400		if (v == NULL)
6401		    goto onError;
6402	    }
6403	    sign = 0;
6404	    fill = ' ';
6405	    switch (c) {
6406
6407	    case '%':
6408		pbuf = formatbuf;
6409		/* presume that buffer length is at least 1 */
6410		pbuf[0] = '%';
6411		len = 1;
6412		break;
6413
6414	    case 's':
6415	    case 'r':
6416		if (PyUnicode_Check(v) && c == 's') {
6417		    temp = v;
6418		    Py_INCREF(temp);
6419		}
6420		else {
6421		    PyObject *unicode;
6422		    if (c == 's')
6423			temp = PyObject_Str(v);
6424		    else
6425			temp = PyObject_Repr(v);
6426		    if (temp == NULL)
6427			goto onError;
6428		    if (!PyString_Check(temp)) {
6429			/* XXX Note: this should never happen, since
6430   			       PyObject_Repr() and PyObject_Str() assure
6431			       this */
6432			Py_DECREF(temp);
6433			PyErr_SetString(PyExc_TypeError,
6434					"%s argument has non-string str()");
6435			goto onError;
6436		    }
6437		    unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
6438						   PyString_GET_SIZE(temp),
6439					       NULL,
6440						   "strict");
6441		    Py_DECREF(temp);
6442		    temp = unicode;
6443		    if (temp == NULL)
6444			goto onError;
6445		}
6446		pbuf = PyUnicode_AS_UNICODE(temp);
6447		len = PyUnicode_GET_SIZE(temp);
6448		if (prec >= 0 && len > prec)
6449		    len = prec;
6450		break;
6451
6452	    case 'i':
6453	    case 'd':
6454	    case 'u':
6455	    case 'o':
6456	    case 'x':
6457	    case 'X':
6458		if (c == 'i')
6459		    c = 'd';
6460		if (PyLong_Check(v)) {
6461		    temp = formatlong(v, flags, prec, c);
6462		    if (!temp)
6463			goto onError;
6464		    pbuf = PyUnicode_AS_UNICODE(temp);
6465		    len = PyUnicode_GET_SIZE(temp);
6466		    /* unbounded ints can always produce
6467		       a sign character! */
6468		    sign = 1;
6469		}
6470		else {
6471		    pbuf = formatbuf;
6472		    len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6473				    flags, prec, c, v);
6474		    if (len < 0)
6475			goto onError;
6476		    /* only d conversion is signed */
6477		    sign = c == 'd';
6478		}
6479		if (flags & F_ZERO)
6480		    fill = '0';
6481		break;
6482
6483	    case 'e':
6484	    case 'E':
6485	    case 'f':
6486	    case 'g':
6487	    case 'G':
6488		pbuf = formatbuf;
6489		len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6490			flags, prec, c, v);
6491		if (len < 0)
6492		    goto onError;
6493		sign = 1;
6494		if (flags & F_ZERO)
6495		    fill = '0';
6496		break;
6497
6498	    case 'c':
6499		pbuf = formatbuf;
6500		len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
6501		if (len < 0)
6502		    goto onError;
6503		break;
6504
6505	    default:
6506		PyErr_Format(PyExc_ValueError,
6507			     "unsupported format character '%c' (0x%x) "
6508			     "at index %i",
6509			     (31<=c && c<=126) ? (char)c : '?',
6510                             (int)c,
6511			     (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
6512		goto onError;
6513	    }
6514	    if (sign) {
6515		if (*pbuf == '-' || *pbuf == '+') {
6516		    sign = *pbuf++;
6517		    len--;
6518		}
6519		else if (flags & F_SIGN)
6520		    sign = '+';
6521		else if (flags & F_BLANK)
6522		    sign = ' ';
6523		else
6524		    sign = 0;
6525	    }
6526	    if (width < len)
6527		width = len;
6528	    if (rescnt - (sign != 0) < width) {
6529		reslen -= rescnt;
6530		rescnt = width + fmtcnt + 100;
6531		reslen += rescnt;
6532		if (reslen < 0) {
6533		    Py_DECREF(result);
6534		    return PyErr_NoMemory();
6535		}
6536		if (_PyUnicode_Resize(&result, reslen) < 0)
6537		    return NULL;
6538		res = PyUnicode_AS_UNICODE(result)
6539		    + reslen - rescnt;
6540	    }
6541	    if (sign) {
6542		if (fill != ' ')
6543		    *res++ = sign;
6544		rescnt--;
6545		if (width > len)
6546		    width--;
6547	    }
6548	    if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
6549		assert(pbuf[0] == '0');
6550		assert(pbuf[1] == c);
6551		if (fill != ' ') {
6552		    *res++ = *pbuf++;
6553		    *res++ = *pbuf++;
6554		}
6555		rescnt -= 2;
6556		width -= 2;
6557		if (width < 0)
6558		    width = 0;
6559		len -= 2;
6560	    }
6561	    if (width > len && !(flags & F_LJUST)) {
6562		do {
6563		    --rescnt;
6564		    *res++ = fill;
6565		} while (--width > len);
6566	    }
6567	    if (fill == ' ') {
6568		if (sign)
6569		    *res++ = sign;
6570		if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
6571		    assert(pbuf[0] == '0');
6572		    assert(pbuf[1] == c);
6573		    *res++ = *pbuf++;
6574		    *res++ = *pbuf++;
6575		}
6576	    }
6577	    Py_UNICODE_COPY(res, pbuf, len);
6578	    res += len;
6579	    rescnt -= len;
6580	    while (--width >= len) {
6581		--rescnt;
6582		*res++ = ' ';
6583	    }
6584	    if (dict && (argidx < arglen) && c != '%') {
6585		PyErr_SetString(PyExc_TypeError,
6586				"not all arguments converted during string formatting");
6587		goto onError;
6588	    }
6589	    Py_XDECREF(temp);
6590	} /* '%' */
6591    } /* until end */
6592    if (argidx < arglen && !dict) {
6593	PyErr_SetString(PyExc_TypeError,
6594			"not all arguments converted during string formatting");
6595	goto onError;
6596    }
6597
6598    if (args_owned) {
6599	Py_DECREF(args);
6600    }
6601    Py_DECREF(uformat);
6602    if (_PyUnicode_Resize(&result, reslen - rescnt))
6603	goto onError;
6604    return (PyObject *)result;
6605
6606 onError:
6607    Py_XDECREF(result);
6608    Py_DECREF(uformat);
6609    if (args_owned) {
6610	Py_DECREF(args);
6611    }
6612    return NULL;
6613}
6614
6615static PyBufferProcs unicode_as_buffer = {
6616    (getreadbufferproc) unicode_buffer_getreadbuf,
6617    (getwritebufferproc) unicode_buffer_getwritebuf,
6618    (getsegcountproc) unicode_buffer_getsegcount,
6619    (getcharbufferproc) unicode_buffer_getcharbuf,
6620};
6621
6622static PyObject *
6623unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
6624
6625static PyObject *
6626unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
6627{
6628        PyObject *x = NULL;
6629	static char *kwlist[] = {"string", "encoding", "errors", 0};
6630	char *encoding = NULL;
6631	char *errors = NULL;
6632
6633	if (type != &PyUnicode_Type)
6634		return unicode_subtype_new(type, args, kwds);
6635	if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
6636					  kwlist, &x, &encoding, &errors))
6637	    return NULL;
6638	if (x == NULL)
6639		return (PyObject *)_PyUnicode_New(0);
6640	if (encoding == NULL && errors == NULL)
6641	    return PyObject_Unicode(x);
6642	else
6643	return PyUnicode_FromEncodedObject(x, encoding, errors);
6644}
6645
6646static PyObject *
6647unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
6648{
6649	PyUnicodeObject *tmp, *pnew;
6650	int n;
6651
6652	assert(PyType_IsSubtype(type, &PyUnicode_Type));
6653	tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
6654	if (tmp == NULL)
6655		return NULL;
6656	assert(PyUnicode_Check(tmp));
6657	pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
6658	if (pnew == NULL)
6659		return NULL;
6660	pnew->str = PyMem_NEW(Py_UNICODE, n+1);
6661	if (pnew->str == NULL) {
6662		_Py_ForgetReference((PyObject *)pnew);
6663		PyObject_Del(pnew);
6664		return NULL;
6665	}
6666	Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
6667	pnew->length = n;
6668	pnew->hash = tmp->hash;
6669	Py_DECREF(tmp);
6670	return (PyObject *)pnew;
6671}
6672
6673PyDoc_STRVAR(unicode_doc,
6674"unicode(string [, encoding[, errors]]) -> object\n\
6675\n\
6676Create a new Unicode object from the given encoded string.\n\
6677encoding defaults to the current default string encoding.\n\
6678errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
6679
6680PyTypeObject PyUnicode_Type = {
6681    PyObject_HEAD_INIT(&PyType_Type)
6682    0, 					/* ob_size */
6683    "unicode", 				/* tp_name */
6684    sizeof(PyUnicodeObject), 		/* tp_size */
6685    0, 					/* tp_itemsize */
6686    /* Slots */
6687    (destructor)unicode_dealloc, 	/* tp_dealloc */
6688    0, 					/* tp_print */
6689    0,				 	/* tp_getattr */
6690    0, 					/* tp_setattr */
6691    (cmpfunc) unicode_compare, 		/* tp_compare */
6692    (reprfunc) unicode_repr, 		/* tp_repr */
6693    &unicode_as_number, 		/* tp_as_number */
6694    &unicode_as_sequence, 		/* tp_as_sequence */
6695    &unicode_as_mapping, 		/* tp_as_mapping */
6696    (hashfunc) unicode_hash, 		/* tp_hash*/
6697    0, 					/* tp_call*/
6698    (reprfunc) unicode_str,	 	/* tp_str */
6699    PyObject_GenericGetAttr, 		/* tp_getattro */
6700    0,			 		/* tp_setattro */
6701    &unicode_as_buffer,			/* tp_as_buffer */
6702    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
6703	    Py_TPFLAGS_BASETYPE,	/* tp_flags */
6704    unicode_doc,			/* tp_doc */
6705    0,					/* tp_traverse */
6706    0,					/* tp_clear */
6707    0,					/* tp_richcompare */
6708    0,					/* tp_weaklistoffset */
6709    0,					/* tp_iter */
6710    0,					/* tp_iternext */
6711    unicode_methods,			/* tp_methods */
6712    0,					/* tp_members */
6713    0,					/* tp_getset */
6714    &PyBaseString_Type,			/* tp_base */
6715    0,					/* tp_dict */
6716    0,					/* tp_descr_get */
6717    0,					/* tp_descr_set */
6718    0,					/* tp_dictoffset */
6719    0,					/* tp_init */
6720    0,					/* tp_alloc */
6721    unicode_new,			/* tp_new */
6722    PyObject_Del,      		/* tp_free */
6723};
6724
6725/* Initialize the Unicode implementation */
6726
6727void _PyUnicode_Init(void)
6728{
6729    int i;
6730
6731    /* Init the implementation */
6732    unicode_freelist = NULL;
6733    unicode_freelist_size = 0;
6734    unicode_empty = _PyUnicode_New(0);
6735    strcpy(unicode_default_encoding, "ascii");
6736    for (i = 0; i < 256; i++)
6737	unicode_latin1[i] = NULL;
6738    if (PyType_Ready(&PyUnicode_Type) < 0)
6739	Py_FatalError("Can't initialize 'unicode'");
6740}
6741
6742/* Finalize the Unicode implementation */
6743
6744void
6745_PyUnicode_Fini(void)
6746{
6747    PyUnicodeObject *u;
6748    int i;
6749
6750    Py_XDECREF(unicode_empty);
6751    unicode_empty = NULL;
6752
6753    for (i = 0; i < 256; i++) {
6754	if (unicode_latin1[i]) {
6755	    Py_DECREF(unicode_latin1[i]);
6756	    unicode_latin1[i] = NULL;
6757	}
6758    }
6759
6760    for (u = unicode_freelist; u != NULL;) {
6761	PyUnicodeObject *v = u;
6762	u = *(PyUnicodeObject **)u;
6763	if (v->str)
6764	    PyMem_DEL(v->str);
6765	Py_XDECREF(v->defenc);
6766	PyObject_Del(v);
6767    }
6768    unicode_freelist = NULL;
6769    unicode_freelist_size = 0;
6770}
6771