unicodeobject.c revision 9a3a9f779142d92655f86eaf9584ce946c61dfea
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5Unicode Integration Proposal (see file Misc/unicode.txt).
6
7Copyright (c) Corporation for National Research Initiatives.
8
9--------------------------------------------------------------------
10The original string type implementation is:
11
12    Copyright (c) 1999 by Secret Labs AB
13    Copyright (c) 1999 by Fredrik Lundh
14
15By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
38
39#include "Python.h"
40
41#include "unicodeobject.h"
42#include "ucnhash.h"
43
44#ifdef MS_WINDOWS
45#include <windows.h>
46#endif
47
/* Upper bound on the number of dead Unicode objects parked on the
   free list. */

#define MAX_UNICODE_FREELIST_SIZE       1024

/* Keep-alive threshold for objects on the free list.

   unicode_dealloc() leaves the character buffer attached to a
   recycled object when the object's length is below this limit and
   releases the buffer otherwise.  Keeping small buffers around saves
   malloc() calls for short Unicode objects.

   Worst case, this retains MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused memory.

   A limit of 0 switches the optimization off, since no length is
   below 0.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Byte order selection: big endian when the build defines
   WORDS_BIGENDIAN, little endian in every other case. */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
78
79/* --- Globals ------------------------------------------------------------
80
81   The globals are initialized by the _PyUnicode_Init() API and should
82   not be used before calling that API.
83
84*/
85
86/* Free list for Unicode objects */
87static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
89
90/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94   shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
97/* Default encoding to use and assume when NULL is passed as encoding
98   parameter; it is initialized by _PyUnicode_Init().
99
100   Always use the PyUnicode_SetDefaultEncoding() and
101   PyUnicode_GetDefaultEncoding() APIs to access this global.
102
103*/
104static char unicode_default_encoding[100];
105
106Py_UNICODE
107PyUnicode_GetMax(void)
108{
109#ifdef Py_UNICODE_WIDE
110	return 0x10FFFF;
111#else
112	/* This is actually an illegal character, so it should
113	   not be passed to unichr. */
114	return 0xFFFF;
115#endif
116}
117
118/* --- Unicode Object ----------------------------------------------------- */
119
120static
121int unicode_resize(register PyUnicodeObject *unicode,
122                      int length)
123{
124    void *oldstr;
125
126    /* Shortcut if there's nothing much to do. */
127    if (unicode->length == length)
128	goto reset;
129
130    /* Resizing shared object (unicode_empty or single character
131       objects) in-place is not allowed. Use PyUnicode_Resize()
132       instead ! */
133    if (unicode == unicode_empty ||
134	(unicode->length == 1 &&
135	 unicode->str[0] < 256 &&
136	 unicode_latin1[unicode->str[0]] == unicode)) {
137        PyErr_SetString(PyExc_SystemError,
138                        "can't resize shared unicode objects");
139        return -1;
140    }
141
142    /* We allocate one more byte to make sure the string is
143       Ux0000 terminated -- XXX is this needed ? */
144    oldstr = unicode->str;
145    PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146    if (!unicode->str) {
147	unicode->str = oldstr;
148        PyErr_NoMemory();
149        return -1;
150    }
151    unicode->str[length] = 0;
152    unicode->length = length;
153
154 reset:
155    /* Reset the object caches */
156    if (unicode->defenc) {
157        Py_DECREF(unicode->defenc);
158        unicode->defenc = NULL;
159    }
160    unicode->hash = -1;
161
162    return 0;
163}
164
165/* We allocate one more byte to make sure the string is
166   Ux0000 terminated -- XXX is this needed ?
167
168   XXX This allocator could further be enhanced by assuring that the
169       free list never reduces its size below 1.
170
171*/
172
173static
174PyUnicodeObject *_PyUnicode_New(int length)
175{
176    register PyUnicodeObject *unicode;
177
178    /* Optimization for empty strings */
179    if (length == 0 && unicode_empty != NULL) {
180        Py_INCREF(unicode_empty);
181        return unicode_empty;
182    }
183
184    /* Unicode freelist & memory allocation */
185    if (unicode_freelist) {
186        unicode = unicode_freelist;
187        unicode_freelist = *(PyUnicodeObject **)unicode;
188        unicode_freelist_size--;
189	if (unicode->str) {
190	    /* Keep-Alive optimization: we only upsize the buffer,
191	       never downsize it. */
192	    if ((unicode->length < length) &&
193		unicode_resize(unicode, length)) {
194		PyMem_DEL(unicode->str);
195		goto onError;
196	    }
197	}
198        else {
199	    unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
200        }
201        PyObject_INIT(unicode, &PyUnicode_Type);
202    }
203    else {
204        unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
205        if (unicode == NULL)
206            return NULL;
207	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
208    }
209
210    if (!unicode->str) {
211	PyErr_NoMemory();
212	goto onError;
213    }
214    unicode->str[length] = 0;
215    unicode->length = length;
216    unicode->hash = -1;
217    unicode->defenc = NULL;
218    return unicode;
219
220 onError:
221    _Py_ForgetReference((PyObject *)unicode);
222    PyObject_Del(unicode);
223    return NULL;
224}
225
226static
227void unicode_dealloc(register PyUnicodeObject *unicode)
228{
229    if (PyUnicode_CheckExact(unicode) &&
230	unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
231        /* Keep-Alive optimization */
232	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
233	    PyMem_DEL(unicode->str);
234	    unicode->str = NULL;
235	    unicode->length = 0;
236	}
237	if (unicode->defenc) {
238	    Py_DECREF(unicode->defenc);
239	    unicode->defenc = NULL;
240	}
241	/* Add to free list */
242        *(PyUnicodeObject **)unicode = unicode_freelist;
243        unicode_freelist = unicode;
244        unicode_freelist_size++;
245    }
246    else {
247	PyMem_DEL(unicode->str);
248	Py_XDECREF(unicode->defenc);
249	unicode->ob_type->tp_free((PyObject *)unicode);
250    }
251}
252
253int PyUnicode_Resize(PyObject **unicode,
254		     int length)
255{
256    register PyUnicodeObject *v;
257
258    /* Argument checks */
259    if (unicode == NULL) {
260	PyErr_BadInternalCall();
261	return -1;
262    }
263    v = (PyUnicodeObject *)*unicode;
264    if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
265	PyErr_BadInternalCall();
266	return -1;
267    }
268
269    /* Resizing unicode_empty and single character objects is not
270       possible since these are being shared. We simply return a fresh
271       copy with the same Unicode content. */
272    if (v->length != length &&
273	(v == unicode_empty || v->length == 1)) {
274	PyUnicodeObject *w = _PyUnicode_New(length);
275	if (w == NULL)
276	    return -1;
277	Py_UNICODE_COPY(w->str, v->str,
278			length < v->length ? length : v->length);
279	Py_DECREF(*unicode);
280	*unicode = (PyObject *)w;
281	return 0;
282    }
283
284    /* Note that we don't have to modify *unicode for unshared Unicode
285       objects, since we can modify them in-place. */
286    return unicode_resize(v, length);
287}
288
289/* Internal API for use in unicodeobject.c only ! */
290#define _PyUnicode_Resize(unicodevar, length) \
291        PyUnicode_Resize(((PyObject **)(unicodevar)), length)
292
293PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
294				int size)
295{
296    PyUnicodeObject *unicode;
297
298    /* If the Unicode data is known at construction time, we can apply
299       some optimizations which share commonly used objects. */
300    if (u != NULL) {
301
302	/* Optimization for empty strings */
303	if (size == 0 && unicode_empty != NULL) {
304	    Py_INCREF(unicode_empty);
305	    return (PyObject *)unicode_empty;
306	}
307
308	/* Single character Unicode objects in the Latin-1 range are
309	   shared when using this constructor */
310	if (size == 1 && *u < 256) {
311	    unicode = unicode_latin1[*u];
312	    if (!unicode) {
313		unicode = _PyUnicode_New(1);
314		if (!unicode)
315		    return NULL;
316		unicode->str[0] = *u;
317		unicode_latin1[*u] = unicode;
318	    }
319	    Py_INCREF(unicode);
320	    return (PyObject *)unicode;
321	}
322    }
323
324    unicode = _PyUnicode_New(size);
325    if (!unicode)
326        return NULL;
327
328    /* Copy the Unicode data into the new object */
329    if (u != NULL)
330	Py_UNICODE_COPY(unicode->str, u, size);
331
332    return (PyObject *)unicode;
333}
334
335#ifdef HAVE_WCHAR_H
336
337PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
338				 int size)
339{
340    PyUnicodeObject *unicode;
341
342    if (w == NULL) {
343	PyErr_BadInternalCall();
344	return NULL;
345    }
346
347    unicode = _PyUnicode_New(size);
348    if (!unicode)
349        return NULL;
350
351    /* Copy the wchar_t data into the new object */
352#ifdef HAVE_USABLE_WCHAR_T
353    memcpy(unicode->str, w, size * sizeof(wchar_t));
354#else
355    {
356	register Py_UNICODE *u;
357	register int i;
358	u = PyUnicode_AS_UNICODE(unicode);
359	for (i = size; i >= 0; i--)
360	    *u++ = *w++;
361    }
362#endif
363
364    return (PyObject *)unicode;
365}
366
367int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
368			 register wchar_t *w,
369			 int size)
370{
371    if (unicode == NULL) {
372	PyErr_BadInternalCall();
373	return -1;
374    }
375    if (size > PyUnicode_GET_SIZE(unicode))
376	size = PyUnicode_GET_SIZE(unicode);
377#ifdef HAVE_USABLE_WCHAR_T
378    memcpy(w, unicode->str, size * sizeof(wchar_t));
379#else
380    {
381	register Py_UNICODE *u;
382	register int i;
383	u = PyUnicode_AS_UNICODE(unicode);
384	for (i = size; i >= 0; i--)
385	    *w++ = *u++;
386    }
387#endif
388
389    return size;
390}
391
392#endif
393
394PyObject *PyUnicode_FromOrdinal(int ordinal)
395{
396    Py_UNICODE s[2];
397
398#ifdef Py_UNICODE_WIDE
399    if (ordinal < 0 || ordinal > 0x10ffff) {
400	PyErr_SetString(PyExc_ValueError,
401			"unichr() arg not in range(0x110000) "
402			"(wide Python build)");
403	return NULL;
404    }
405#else
406    if (ordinal < 0 || ordinal > 0xffff) {
407	PyErr_SetString(PyExc_ValueError,
408			"unichr() arg not in range(0x10000) "
409			"(narrow Python build)");
410	return NULL;
411    }
412#endif
413
414    if (ordinal <= 0xffff) {
415	/* UCS-2 character */
416	s[0] = (Py_UNICODE) ordinal;
417	return PyUnicode_FromUnicode(s, 1);
418    }
419    else {
420#ifndef Py_UNICODE_WIDE
421	/* UCS-4 character.  store as two surrogate characters */
422	ordinal -= 0x10000L;
423	s[0] = 0xD800 + (Py_UNICODE) (ordinal >> 10);
424	s[1] = 0xDC00 + (Py_UNICODE) (ordinal & 0x03FF);
425	return PyUnicode_FromUnicode(s, 2);
426#else
427	s[0] = (Py_UNICODE)ordinal;
428	return PyUnicode_FromUnicode(s, 1);
429#endif
430    }
431}
432
433PyObject *PyUnicode_FromObject(register PyObject *obj)
434{
435    /* XXX Perhaps we should make this API an alias of
436           PyObject_Unicode() instead ?! */
437    if (PyUnicode_CheckExact(obj)) {
438	Py_INCREF(obj);
439	return obj;
440    }
441    if (PyUnicode_Check(obj)) {
442	/* For a Unicode subtype that's not a Unicode object,
443	   return a true Unicode object with the same data. */
444	return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
445				     PyUnicode_GET_SIZE(obj));
446    }
447    return PyUnicode_FromEncodedObject(obj, NULL, "strict");
448}
449
450PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
451				      const char *encoding,
452				      const char *errors)
453{
454    const char *s = NULL;
455    int len;
456    PyObject *v;
457
458    if (obj == NULL) {
459	PyErr_BadInternalCall();
460	return NULL;
461    }
462
463#if 0
464    /* For b/w compatibility we also accept Unicode objects provided
465       that no encodings is given and then redirect to
466       PyObject_Unicode() which then applies the additional logic for
467       Unicode subclasses.
468
469       NOTE: This API should really only be used for object which
470             represent *encoded* Unicode !
471
472    */
473	if (PyUnicode_Check(obj)) {
474	    if (encoding) {
475		PyErr_SetString(PyExc_TypeError,
476				"decoding Unicode is not supported");
477	    return NULL;
478	    }
479	return PyObject_Unicode(obj);
480	    }
481#else
482    if (PyUnicode_Check(obj)) {
483	PyErr_SetString(PyExc_TypeError,
484			"decoding Unicode is not supported");
485	return NULL;
486	}
487#endif
488
489    /* Coerce object */
490    if (PyString_Check(obj)) {
491	    s = PyString_AS_STRING(obj);
492	    len = PyString_GET_SIZE(obj);
493	    }
494    else if (PyObject_AsCharBuffer(obj, &s, &len)) {
495	/* Overwrite the error message with something more useful in
496	   case of a TypeError. */
497	if (PyErr_ExceptionMatches(PyExc_TypeError))
498	PyErr_Format(PyExc_TypeError,
499			 "coercing to Unicode: need string or buffer, "
500			 "%.80s found",
501		     obj->ob_type->tp_name);
502	goto onError;
503    }
504
505    /* Convert to Unicode */
506    if (len == 0) {
507	Py_INCREF(unicode_empty);
508	v = (PyObject *)unicode_empty;
509    }
510    else
511	v = PyUnicode_Decode(s, len, encoding, errors);
512
513    return v;
514
515 onError:
516    return NULL;
517}
518
519PyObject *PyUnicode_Decode(const char *s,
520			   int size,
521			   const char *encoding,
522			   const char *errors)
523{
524    PyObject *buffer = NULL, *unicode;
525
526    if (encoding == NULL)
527	encoding = PyUnicode_GetDefaultEncoding();
528
529    /* Shortcuts for common default encodings */
530    if (strcmp(encoding, "utf-8") == 0)
531        return PyUnicode_DecodeUTF8(s, size, errors);
532    else if (strcmp(encoding, "latin-1") == 0)
533        return PyUnicode_DecodeLatin1(s, size, errors);
534    else if (strcmp(encoding, "ascii") == 0)
535        return PyUnicode_DecodeASCII(s, size, errors);
536
537    /* Decode via the codec registry */
538    buffer = PyBuffer_FromMemory((void *)s, size);
539    if (buffer == NULL)
540        goto onError;
541    unicode = PyCodec_Decode(buffer, encoding, errors);
542    if (unicode == NULL)
543        goto onError;
544    if (!PyUnicode_Check(unicode)) {
545        PyErr_Format(PyExc_TypeError,
546                     "decoder did not return an unicode object (type=%.400s)",
547                     unicode->ob_type->tp_name);
548        Py_DECREF(unicode);
549        goto onError;
550    }
551    Py_DECREF(buffer);
552    return unicode;
553
554 onError:
555    Py_XDECREF(buffer);
556    return NULL;
557}
558
559PyObject *PyUnicode_Encode(const Py_UNICODE *s,
560			   int size,
561			   const char *encoding,
562			   const char *errors)
563{
564    PyObject *v, *unicode;
565
566    unicode = PyUnicode_FromUnicode(s, size);
567    if (unicode == NULL)
568	return NULL;
569    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
570    Py_DECREF(unicode);
571    return v;
572}
573
574PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
575                                    const char *encoding,
576                                    const char *errors)
577{
578    PyObject *v;
579
580    if (!PyUnicode_Check(unicode)) {
581        PyErr_BadArgument();
582        goto onError;
583    }
584
585    if (encoding == NULL)
586	encoding = PyUnicode_GetDefaultEncoding();
587
588    /* Shortcuts for common default encodings */
589    if (errors == NULL) {
590	if (strcmp(encoding, "utf-8") == 0)
591	    return PyUnicode_AsUTF8String(unicode);
592	else if (strcmp(encoding, "latin-1") == 0)
593	    return PyUnicode_AsLatin1String(unicode);
594	else if (strcmp(encoding, "ascii") == 0)
595	    return PyUnicode_AsASCIIString(unicode);
596    }
597
598    /* Encode via the codec registry */
599    v = PyCodec_Encode(unicode, encoding, errors);
600    if (v == NULL)
601        goto onError;
602    /* XXX Should we really enforce this ? */
603    if (!PyString_Check(v)) {
604        PyErr_Format(PyExc_TypeError,
605                     "encoder did not return a string object (type=%.400s)",
606                     v->ob_type->tp_name);
607        Py_DECREF(v);
608        goto onError;
609    }
610    return v;
611
612 onError:
613    return NULL;
614}
615
616PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
617					    const char *errors)
618{
619    PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
620
621    if (v)
622        return v;
623    v = PyUnicode_AsEncodedString(unicode, NULL, errors);
624    if (v && errors == NULL)
625        ((PyUnicodeObject *)unicode)->defenc = v;
626    return v;
627}
628
629Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
630{
631    if (!PyUnicode_Check(unicode)) {
632        PyErr_BadArgument();
633        goto onError;
634    }
635    return PyUnicode_AS_UNICODE(unicode);
636
637 onError:
638    return NULL;
639}
640
641int PyUnicode_GetSize(PyObject *unicode)
642{
643    if (!PyUnicode_Check(unicode)) {
644        PyErr_BadArgument();
645        goto onError;
646    }
647    return PyUnicode_GET_SIZE(unicode);
648
649 onError:
650    return -1;
651}
652
653const char *PyUnicode_GetDefaultEncoding(void)
654{
655    return unicode_default_encoding;
656}
657
658int PyUnicode_SetDefaultEncoding(const char *encoding)
659{
660    PyObject *v;
661
662    /* Make sure the encoding is valid. As side effect, this also
663       loads the encoding into the codec registry cache. */
664    v = _PyCodec_Lookup(encoding);
665    if (v == NULL)
666	goto onError;
667    Py_DECREF(v);
668    strncpy(unicode_default_encoding,
669	    encoding,
670	    sizeof(unicode_default_encoding));
671    return 0;
672
673 onError:
674    return -1;
675}
676
677/* error handling callback helper:
678   build arguments, call the callback and check the arguments,
679   if no exception occured, copy the replacement to the output
680   and adjust various state variables.
681   return 0 on success, -1 on error
682*/
683
684static
685int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
686                 const char *encoding, const char *reason,
687                 const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr,
688                 PyObject **output, int *outpos, Py_UNICODE **outptr)
689{
690    static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple";
691
692    PyObject *restuple = NULL;
693    PyObject *repunicode = NULL;
694    int outsize = PyUnicode_GET_SIZE(*output);
695    int requiredsize;
696    int newpos;
697    Py_UNICODE *repptr;
698    int repsize;
699    int res = -1;
700
701    if (*errorHandler == NULL) {
702	*errorHandler = PyCodec_LookupError(errors);
703	if (*errorHandler == NULL)
704	   goto onError;
705    }
706
707    if (*exceptionObject == NULL) {
708    	*exceptionObject = PyUnicodeDecodeError_Create(
709	    encoding, input, insize, *startinpos, *endinpos, reason);
710	if (*exceptionObject == NULL)
711	   goto onError;
712    }
713    else {
714	if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
715	    goto onError;
716	if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
717	    goto onError;
718	if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
719	    goto onError;
720    }
721
722    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
723    if (restuple == NULL)
724	goto onError;
725    if (!PyTuple_Check(restuple)) {
726	PyErr_Format(PyExc_TypeError, &argparse[4]);
727	goto onError;
728    }
729    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
730	goto onError;
731    if (newpos<0)
732	newpos = insize+newpos;
733    if (newpos<0 || newpos>insize) {
734	PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", newpos);
735	goto onError;
736    }
737
738    /* need more space? (at least enough for what we
739       have+the replacement+the rest of the string (starting
740       at the new input position), so we won't have to check space
741       when there are no errors in the rest of the string) */
742    repptr = PyUnicode_AS_UNICODE(repunicode);
743    repsize = PyUnicode_GET_SIZE(repunicode);
744    requiredsize = *outpos + repsize + insize-newpos;
745    if (requiredsize > outsize) {
746	if (requiredsize<2*outsize)
747	    requiredsize = 2*outsize;
748	if (PyUnicode_Resize(output, requiredsize))
749	    goto onError;
750	*outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
751    }
752    *endinpos = newpos;
753    *inptr = input + newpos;
754    Py_UNICODE_COPY(*outptr, repptr, repsize);
755    *outptr += repsize;
756    *outpos += repsize;
757    /* we made it! */
758    res = 0;
759
760    onError:
761    Py_XDECREF(restuple);
762    return res;
763}
764
765/* --- UTF-7 Codec -------------------------------------------------------- */
766
767/* see RFC2152 for details */
768
static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};

/* True when character c has to be base64 encoded: always for
   non-ASCII and class 1 characters, for whitespace only when encodeWS
   is set, and for Set O characters only when encodeO is set.  The
   table is only indexed when c <= 127. */
#define SPECIAL(c, encodeO, encodeWS) \
    (((c)>127 || utf7_special[(c)] == 1) || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* Base64 digit for the low 6 bits of n. */
#define B64(n)  ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True when c is a base64 digit.  isalnum() is only defined for
   values representable as unsigned char (and EOF), so anything
   outside the ASCII range is rejected before it is called. */
#define B64CHAR(c) (((c) < 128 && isalnum(c)) || (c) == '+' || (c) == '/')

/* Value (0..63) of the base64 digit c; c must satisfy B64CHAR(). */
#define UB64(c)        ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                        (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* Emit base64 digits to out while at least 6 of the `bits` pending
   bits in ch are available; `bits` is decreased accordingly. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* Extract complete 16-bit units from the pending bits in ch and
   append them to out.  Relies on the variable `errmsg` and the label
   `utf7Error` being available at the point of expansion. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
821    } \
822
823PyObject *PyUnicode_DecodeUTF7(const char *s,
824			       int size,
825			       const char *errors)
826{
827    const char *starts = s;
828    int startinpos;
829    int endinpos;
830    int outpos;
831    const char *e;
832    PyUnicodeObject *unicode;
833    Py_UNICODE *p;
834    const char *errmsg = "";
835    int inShift = 0;
836    unsigned int bitsleft = 0;
837    unsigned long charsleft = 0;
838    int surrogate = 0;
839    PyObject *errorHandler = NULL;
840    PyObject *exc = NULL;
841
842    unicode = _PyUnicode_New(size);
843    if (!unicode)
844        return NULL;
845    if (size == 0)
846        return (PyObject *)unicode;
847
848    p = unicode->str;
849    e = s + size;
850
851    while (s < e) {
852        Py_UNICODE ch;
853        restart:
854        ch = *s;
855
856        if (inShift) {
857            if ((ch == '-') || !B64CHAR(ch)) {
858                inShift = 0;
859                s++;
860
861                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
862                if (bitsleft >= 6) {
863                    /* The shift sequence has a partial character in it. If
864                       bitsleft < 6 then we could just classify it as padding
865                       but that is not the case here */
866
867                    errmsg = "partial character in shift sequence";
868                    goto utf7Error;
869                }
870                /* According to RFC2152 the remaining bits should be zero. We
871                   choose to signal an error/insert a replacement character
872                   here so indicate the potential of a misencoded character. */
873
874                /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
875                if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
876                    errmsg = "non-zero padding bits in shift sequence";
877                    goto utf7Error;
878                }
879
880                if (ch == '-') {
881                    if ((s < e) && (*(s) == '-')) {
882                        *p++ = '-';
883                        inShift = 1;
884                    }
885                } else if (SPECIAL(ch,0,0)) {
886                    errmsg = "unexpected special character";
887	                goto utf7Error;
888                } else  {
889                    *p++ = ch;
890                }
891            } else {
892                charsleft = (charsleft << 6) | UB64(ch);
893                bitsleft += 6;
894                s++;
895                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
896            }
897        }
898        else if ( ch == '+' ) {
899            startinpos = s-starts;
900            s++;
901            if (s < e && *s == '-') {
902                s++;
903                *p++ = '+';
904            } else
905            {
906                inShift = 1;
907                bitsleft = 0;
908            }
909        }
910        else if (SPECIAL(ch,0,0)) {
911            errmsg = "unexpected special character";
912            s++;
913	        goto utf7Error;
914        }
915        else {
916            *p++ = ch;
917            s++;
918        }
919        continue;
920    utf7Error:
921        outpos = p-PyUnicode_AS_UNICODE(unicode);
922        endinpos = s-starts;
923        if (unicode_decode_call_errorhandler(
924             errors, &errorHandler,
925             "utf7", errmsg,
926             starts, size, &startinpos, &endinpos, &exc, &s,
927             (PyObject **)&unicode, &outpos, &p))
928        goto onError;
929    }
930
931    if (inShift) {
932        outpos = p-PyUnicode_AS_UNICODE(unicode);
933        endinpos = size;
934        if (unicode_decode_call_errorhandler(
935             errors, &errorHandler,
936             "utf7", "unterminated shift sequence",
937             starts, size, &startinpos, &endinpos, &exc, &s,
938             (PyObject **)&unicode, &outpos, &p))
939            goto onError;
940        if (s < e)
941           goto restart;
942    }
943
944    if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)))
945        goto onError;
946
947    Py_XDECREF(errorHandler);
948    Py_XDECREF(exc);
949    return (PyObject *)unicode;
950
951onError:
952    Py_XDECREF(errorHandler);
953    Py_XDECREF(exc);
954    Py_DECREF(unicode);
955    return NULL;
956}
957
958
959PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
960                   int size,
961                   int encodeSetO,
962                   int encodeWhiteSpace,
963                   const char *errors)
964{
965    PyObject *v;
966    /* It might be possible to tighten this worst case */
967    unsigned int cbAllocated = 5 * size;
968    int inShift = 0;
969    int i = 0;
970    unsigned int bitsleft = 0;
971    unsigned long charsleft = 0;
972    char * out;
973    char * start;
974
975    if (size == 0)
976		return PyString_FromStringAndSize(NULL, 0);
977
978    v = PyString_FromStringAndSize(NULL, cbAllocated);
979    if (v == NULL)
980        return NULL;
981
982    start = out = PyString_AS_STRING(v);
983    for (;i < size; ++i) {
984        Py_UNICODE ch = s[i];
985
986        if (!inShift) {
987			if (ch == '+') {
988				*out++ = '+';
989                *out++ = '-';
990            } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
991                charsleft = ch;
992                bitsleft = 16;
993                *out++ = '+';
994				/* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
995                inShift = bitsleft > 0;
996			} else {
997				*out++ = (char) ch;
998			}
999		} else {
1000            if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1001                *out++ = B64(charsleft << (6-bitsleft));
1002                charsleft = 0;
1003                bitsleft = 0;
1004                /* Characters not in the BASE64 set implicitly unshift the sequence
1005                   so no '-' is required, except if the character is itself a '-' */
1006                if (B64CHAR(ch) || ch == '-') {
1007                    *out++ = '-';
1008                }
1009                inShift = 0;
1010                *out++ = (char) ch;
1011            } else {
1012                bitsleft += 16;
1013                charsleft = (charsleft << 16) | ch;
1014                /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1015
1016                /* If the next character is special then we dont' need to terminate
1017                   the shift sequence. If the next character is not a BASE64 character
1018                   or '-' then the shift sequence will be terminated implicitly and we
1019                   don't have to insert a '-'. */
1020
1021                if (bitsleft == 0) {
1022                    if (i + 1 < size) {
1023                        Py_UNICODE ch2 = s[i+1];
1024
1025                        if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
1026
1027                        } else if (B64CHAR(ch2) || ch2 == '-') {
1028                            *out++ = '-';
1029                            inShift = 0;
1030                        } else {
1031                            inShift = 0;
1032                        }
1033
1034                    }
1035                    else {
1036                        *out++ = '-';
1037                        inShift = 0;
1038                    }
1039                }
1040            }
1041        }
1042	}
1043    if (bitsleft) {
1044        *out++= B64(charsleft << (6-bitsleft) );
1045        *out++ = '-';
1046    }
1047
1048    _PyString_Resize(&v, out - start);
1049    return v;
1050}
1051
1052#undef SPECIAL
1053#undef B64
1054#undef B64CHAR
1055#undef UB64
1056#undef ENCODE
1057#undef DECODE
1058
1059/* --- UTF-8 Codec -------------------------------------------------------- */
1060
static
char utf8_code_length[256] = {
    /* Indexed by the first byte of a UTF-8 sequence; the entry is the
       total number of bytes in that sequence.  0 marks a byte that can
       not start a sequence.  See RFC 2279 for details. */

    /* 0x00 - 0x7F: one-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,

    /* 0x80 - 0xBF: not valid as a first byte */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0xC0 - 0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,

    /* 0xE0 - 0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3,  3, 3, 3, 3, 3, 3, 3, 3,

    /* 0xF0 - 0xF7: four bytes; 0xF8 - 0xFB: five; 0xFC - 0xFD: six;
       0xFE and 0xFF: not valid as a first byte */
    4, 4, 4, 4, 4, 4, 4, 4,  5, 5, 5, 5, 6, 6, 0, 0
};
1082
1083PyObject *PyUnicode_DecodeUTF8(const char *s,
1084			       int size,
1085			       const char *errors)
1086{
1087    const char *starts = s;
1088    int n;
1089    int startinpos;
1090    int endinpos;
1091    int outpos;
1092    const char *e;
1093    PyUnicodeObject *unicode;
1094    Py_UNICODE *p;
1095    const char *errmsg = "";
1096    PyObject *errorHandler = NULL;
1097    PyObject *exc = NULL;
1098
1099    /* Note: size will always be longer than the resulting Unicode
1100       character count */
1101    unicode = _PyUnicode_New(size);
1102    if (!unicode)
1103        return NULL;
1104    if (size == 0)
1105        return (PyObject *)unicode;
1106
1107    /* Unpack UTF-8 encoded data */
1108    p = unicode->str;
1109    e = s + size;
1110
1111    while (s < e) {
1112        Py_UCS4 ch = (unsigned char)*s;
1113
1114        if (ch < 0x80) {
1115            *p++ = (Py_UNICODE)ch;
1116            s++;
1117            continue;
1118        }
1119
1120        n = utf8_code_length[ch];
1121
1122        if (s + n > e) {
1123	    errmsg = "unexpected end of data";
1124	    startinpos = s-starts;
1125	    endinpos = size;
1126	    goto utf8Error;
1127	}
1128
1129        switch (n) {
1130
1131        case 0:
1132            errmsg = "unexpected code byte";
1133	    startinpos = s-starts;
1134	    endinpos = startinpos+1;
1135	    goto utf8Error;
1136
1137        case 1:
1138            errmsg = "internal error";
1139	    startinpos = s-starts;
1140	    endinpos = startinpos+1;
1141	    goto utf8Error;
1142
1143        case 2:
1144            if ((s[1] & 0xc0) != 0x80) {
1145                errmsg = "invalid data";
1146		startinpos = s-starts;
1147		endinpos = startinpos+2;
1148		goto utf8Error;
1149	    }
1150            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1151            if (ch < 0x80) {
1152		startinpos = s-starts;
1153		endinpos = startinpos+2;
1154                errmsg = "illegal encoding";
1155		goto utf8Error;
1156	    }
1157	    else
1158		*p++ = (Py_UNICODE)ch;
1159            break;
1160
1161        case 3:
1162            if ((s[1] & 0xc0) != 0x80 ||
1163                (s[2] & 0xc0) != 0x80) {
1164                errmsg = "invalid data";
1165		startinpos = s-starts;
1166		endinpos = startinpos+3;
1167		goto utf8Error;
1168	    }
1169            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
1170            if (ch < 0x0800) {
1171		/* Note: UTF-8 encodings of surrogates are considered
1172		   legal UTF-8 sequences;
1173
1174		   XXX For wide builds (UCS-4) we should probably try
1175		       to recombine the surrogates into a single code
1176		       unit.
1177		*/
1178                errmsg = "illegal encoding";
1179		startinpos = s-starts;
1180		endinpos = startinpos+3;
1181		goto utf8Error;
1182	    }
1183	    else
1184		*p++ = (Py_UNICODE)ch;
1185            break;
1186
1187        case 4:
1188            if ((s[1] & 0xc0) != 0x80 ||
1189                (s[2] & 0xc0) != 0x80 ||
1190                (s[3] & 0xc0) != 0x80) {
1191                errmsg = "invalid data";
1192		startinpos = s-starts;
1193		endinpos = startinpos+4;
1194		goto utf8Error;
1195	    }
1196            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1197                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1198            /* validate and convert to UTF-16 */
1199            if ((ch < 0x10000)        /* minimum value allowed for 4
1200					 byte encoding */
1201                || (ch > 0x10ffff))   /* maximum value allowed for
1202					 UTF-16 */
1203	    {
1204                errmsg = "illegal encoding";
1205		startinpos = s-starts;
1206		endinpos = startinpos+4;
1207		goto utf8Error;
1208	    }
1209#ifdef Py_UNICODE_WIDE
1210	    *p++ = (Py_UNICODE)ch;
1211#else
1212            /*  compute and append the two surrogates: */
1213
1214            /*  translate from 10000..10FFFF to 0..FFFF */
1215            ch -= 0x10000;
1216
1217            /*  high surrogate = top 10 bits added to D800 */
1218            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1219
1220            /*  low surrogate = bottom 10 bits added to DC00 */
1221            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
1222#endif
1223            break;
1224
1225        default:
1226            /* Other sizes are only needed for UCS-4 */
1227            errmsg = "unsupported Unicode code range";
1228	    startinpos = s-starts;
1229	    endinpos = startinpos+n;
1230	    goto utf8Error;
1231        }
1232        s += n;
1233	continue;
1234
1235    utf8Error:
1236    outpos = p-PyUnicode_AS_UNICODE(unicode);
1237    if (unicode_decode_call_errorhandler(
1238	     errors, &errorHandler,
1239	     "utf8", errmsg,
1240	     starts, size, &startinpos, &endinpos, &exc, &s,
1241	     (PyObject **)&unicode, &outpos, &p))
1242	goto onError;
1243    }
1244
1245    /* Adjust length */
1246    if (_PyUnicode_Resize(&unicode, p - unicode->str))
1247        goto onError;
1248
1249    Py_XDECREF(errorHandler);
1250    Py_XDECREF(exc);
1251    return (PyObject *)unicode;
1252
1253onError:
1254    Py_XDECREF(errorHandler);
1255    Py_XDECREF(exc);
1256    Py_DECREF(unicode);
1257    return NULL;
1258}
1259
1260/* Allocation strategy:  if the string is short, convert into a stack buffer
1261   and allocate exactly as much space needed at the end.  Else allocate the
1262   maximum possible needed (4 result bytes per Unicode character), and return
1263   the excess memory at the end.
1264*/
1265PyObject *
1266PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1267		     int size,
1268		     const char *errors)
1269{
1270#define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
1271
1272    int i;              /* index into s of next input byte */
1273    PyObject *v;        /* result string object */
1274    char *p;            /* next free byte in output buffer */
1275    int nallocated;     /* number of result bytes allocated */
1276    int nneeded;        /* number of result bytes needed */
1277    char stackbuf[MAX_SHORT_UNICHARS * 4];
1278
1279    assert(s != NULL);
1280    assert(size >= 0);
1281
1282    if (size <= MAX_SHORT_UNICHARS) {
1283        /* Write into the stack buffer; nallocated can't overflow.
1284         * At the end, we'll allocate exactly as much heap space as it
1285         * turns out we need.
1286         */
1287        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1288        v = NULL;   /* will allocate after we're done */
1289        p = stackbuf;
1290    }
1291    else {
1292        /* Overallocate on the heap, and give the excess back at the end. */
1293        nallocated = size * 4;
1294        if (nallocated / 4 != size)  /* overflow! */
1295            return PyErr_NoMemory();
1296        v = PyString_FromStringAndSize(NULL, nallocated);
1297        if (v == NULL)
1298            return NULL;
1299        p = PyString_AS_STRING(v);
1300    }
1301
1302    for (i = 0; i < size;) {
1303        Py_UCS4 ch = s[i++];
1304
1305        if (ch < 0x80)
1306            /* Encode ASCII */
1307            *p++ = (char) ch;
1308
1309        else if (ch < 0x0800) {
1310            /* Encode Latin-1 */
1311            *p++ = (char)(0xc0 | (ch >> 6));
1312            *p++ = (char)(0x80 | (ch & 0x3f));
1313        }
1314        else {
1315            /* Encode UCS2 Unicode ordinals */
1316            if (ch < 0x10000) {
1317                /* Special case: check for high surrogate */
1318                if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1319                    Py_UCS4 ch2 = s[i];
1320                    /* Check for low surrogate and combine the two to
1321                       form a UCS4 value */
1322                    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1323                        ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
1324                        i++;
1325                        goto encodeUCS4;
1326                    }
1327                    /* Fall through: handles isolated high surrogates */
1328                }
1329                *p++ = (char)(0xe0 | (ch >> 12));
1330                *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1331                *p++ = (char)(0x80 | (ch & 0x3f));
1332                continue;
1333    	    }
1334encodeUCS4:
1335            /* Encode UCS4 Unicode ordinals */
1336            *p++ = (char)(0xf0 | (ch >> 18));
1337            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1338            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1339            *p++ = (char)(0x80 | (ch & 0x3f));
1340        }
1341    }
1342
1343    if (v == NULL) {
1344        /* This was stack allocated. */
1345        nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1346        assert(nneeded <= nallocated);
1347        v = PyString_FromStringAndSize(stackbuf, nneeded);
1348    }
1349    else {
1350    	/* Cut back to size actually needed. */
1351        nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1352        assert(nneeded <= nallocated);
1353        _PyString_Resize(&v, nneeded);
1354    }
1355    return v;
1356
1357#undef MAX_SHORT_UNICHARS
1358}
1359
1360PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1361{
1362    if (!PyUnicode_Check(unicode)) {
1363        PyErr_BadArgument();
1364        return NULL;
1365    }
1366    return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1367				PyUnicode_GET_SIZE(unicode),
1368				NULL);
1369}
1370
1371/* --- UTF-16 Codec ------------------------------------------------------- */
1372
1373PyObject *
1374PyUnicode_DecodeUTF16(const char *s,
1375		      int size,
1376		      const char *errors,
1377		      int *byteorder)
1378{
1379    const char *starts = s;
1380    int startinpos;
1381    int endinpos;
1382    int outpos;
1383    PyUnicodeObject *unicode;
1384    Py_UNICODE *p;
1385    const unsigned char *q, *e;
1386    int bo = 0;       /* assume native ordering by default */
1387    const char *errmsg = "";
1388    /* Offsets from q for retrieving byte pairs in the right order. */
1389#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1390    int ihi = 1, ilo = 0;
1391#else
1392    int ihi = 0, ilo = 1;
1393#endif
1394    PyObject *errorHandler = NULL;
1395    PyObject *exc = NULL;
1396
1397    /* Note: size will always be longer than the resulting Unicode
1398       character count */
1399    unicode = _PyUnicode_New(size);
1400    if (!unicode)
1401        return NULL;
1402    if (size == 0)
1403        return (PyObject *)unicode;
1404
1405    /* Unpack UTF-16 encoded data */
1406    p = unicode->str;
1407    q = (unsigned char *)s;
1408    e = q + size;
1409
1410    if (byteorder)
1411        bo = *byteorder;
1412
1413    /* Check for BOM marks (U+FEFF) in the input and adjust current
1414       byte order setting accordingly. In native mode, the leading BOM
1415       mark is skipped, in all other modes, it is copied to the output
1416       stream as-is (giving a ZWNBSP character). */
1417    if (bo == 0) {
1418        const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
1419#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1420	if (bom == 0xFEFF) {
1421	    q += 2;
1422	    bo = -1;
1423	}
1424        else if (bom == 0xFFFE) {
1425	    q += 2;
1426	    bo = 1;
1427	}
1428#else
1429	if (bom == 0xFEFF) {
1430	    q += 2;
1431	    bo = 1;
1432	}
1433        else if (bom == 0xFFFE) {
1434	    q += 2;
1435	    bo = -1;
1436	}
1437#endif
1438    }
1439
1440    if (bo == -1) {
1441        /* force LE */
1442        ihi = 1;
1443        ilo = 0;
1444    }
1445    else if (bo == 1) {
1446        /* force BE */
1447        ihi = 0;
1448        ilo = 1;
1449    }
1450
1451    while (q < e) {
1452	Py_UNICODE ch;
1453	/* remaing bytes at the end? (size should be even) */
1454	if (e-q<2) {
1455	    errmsg = "truncated data";
1456	    startinpos = ((const char *)q)-starts;
1457	    endinpos = ((const char *)e)-starts;
1458	    goto utf16Error;
1459	    /* The remaining input chars are ignored if the callback
1460	       chooses to skip the input */
1461	}
1462	ch = (q[ihi] << 8) | q[ilo];
1463
1464	q += 2;
1465
1466	if (ch < 0xD800 || ch > 0xDFFF) {
1467	    *p++ = ch;
1468	    continue;
1469	}
1470
1471	/* UTF-16 code pair: */
1472	if (q >= e) {
1473	    errmsg = "unexpected end of data";
1474	    startinpos = (((const char *)q)-2)-starts;
1475	    endinpos = ((const char *)e)-starts;
1476	    goto utf16Error;
1477	}
1478	if (0xD800 <= ch && ch <= 0xDBFF) {
1479	    Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1480	    q += 2;
1481	    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1482#ifndef Py_UNICODE_WIDE
1483		*p++ = ch;
1484		*p++ = ch2;
1485#else
1486		*p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
1487#endif
1488		continue;
1489	    }
1490	    else {
1491                errmsg = "illegal UTF-16 surrogate";
1492		startinpos = (((const char *)q)-4)-starts;
1493		endinpos = startinpos+2;
1494		goto utf16Error;
1495	    }
1496
1497	}
1498	errmsg = "illegal encoding";
1499	startinpos = (((const char *)q)-2)-starts;
1500	endinpos = startinpos+2;
1501	/* Fall through to report the error */
1502
1503    utf16Error:
1504	outpos = p-PyUnicode_AS_UNICODE(unicode);
1505	if (unicode_decode_call_errorhandler(
1506	         errors, &errorHandler,
1507	         "utf16", errmsg,
1508	         starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1509	         (PyObject **)&unicode, &outpos, &p))
1510	    goto onError;
1511    }
1512
1513    if (byteorder)
1514        *byteorder = bo;
1515
1516    /* Adjust length */
1517    if (_PyUnicode_Resize(&unicode, p - unicode->str))
1518        goto onError;
1519
1520    Py_XDECREF(errorHandler);
1521    Py_XDECREF(exc);
1522    return (PyObject *)unicode;
1523
1524onError:
1525    Py_DECREF(unicode);
1526    Py_XDECREF(errorHandler);
1527    Py_XDECREF(exc);
1528    return NULL;
1529}
1530
1531PyObject *
1532PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1533		      int size,
1534		      const char *errors,
1535		      int byteorder)
1536{
1537    PyObject *v;
1538    unsigned char *p;
1539    int i, pairs;
1540    /* Offsets from p for storing byte pairs in the right order. */
1541#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1542    int ihi = 1, ilo = 0;
1543#else
1544    int ihi = 0, ilo = 1;
1545#endif
1546
1547#define STORECHAR(CH)                   \
1548    do {                                \
1549        p[ihi] = ((CH) >> 8) & 0xff;    \
1550        p[ilo] = (CH) & 0xff;           \
1551        p += 2;                         \
1552    } while(0)
1553
1554    for (i = pairs = 0; i < size; i++)
1555	if (s[i] >= 0x10000)
1556	    pairs++;
1557    v = PyString_FromStringAndSize(NULL,
1558		  2 * (size + pairs + (byteorder == 0)));
1559    if (v == NULL)
1560        return NULL;
1561
1562    p = (unsigned char *)PyString_AS_STRING(v);
1563    if (byteorder == 0)
1564	STORECHAR(0xFEFF);
1565    if (size == 0)
1566        return v;
1567
1568    if (byteorder == -1) {
1569        /* force LE */
1570        ihi = 1;
1571        ilo = 0;
1572    }
1573    else if (byteorder == 1) {
1574        /* force BE */
1575        ihi = 0;
1576        ilo = 1;
1577    }
1578
1579    while (size-- > 0) {
1580	Py_UNICODE ch = *s++;
1581	Py_UNICODE ch2 = 0;
1582	if (ch >= 0x10000) {
1583	    ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1584	    ch  = 0xD800 | ((ch-0x10000) >> 10);
1585	}
1586        STORECHAR(ch);
1587        if (ch2)
1588            STORECHAR(ch2);
1589    }
1590    return v;
1591#undef STORECHAR
1592}
1593
1594PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1595{
1596    if (!PyUnicode_Check(unicode)) {
1597        PyErr_BadArgument();
1598        return NULL;
1599    }
1600    return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1601				 PyUnicode_GET_SIZE(unicode),
1602				 NULL,
1603				 0);
1604}
1605
1606/* --- Unicode Escape Codec ----------------------------------------------- */
1607
1608static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
1609
1610PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1611					int size,
1612					const char *errors)
1613{
1614    const char *starts = s;
1615    int startinpos;
1616    int endinpos;
1617    int outpos;
1618    int i;
1619    PyUnicodeObject *v;
1620    Py_UNICODE *p;
1621    const char *end;
1622    char* message;
1623    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1624    PyObject *errorHandler = NULL;
1625    PyObject *exc = NULL;
1626
1627    /* Escaped strings will always be longer than the resulting
1628       Unicode string, so we start with size here and then reduce the
1629       length after conversion to the true value.
1630       (but if the error callback returns a long replacement string
1631       we'll have to allocate more space) */
1632    v = _PyUnicode_New(size);
1633    if (v == NULL)
1634        goto onError;
1635    if (size == 0)
1636        return (PyObject *)v;
1637
1638    p = PyUnicode_AS_UNICODE(v);
1639    end = s + size;
1640
1641    while (s < end) {
1642        unsigned char c;
1643        Py_UNICODE x;
1644        int digits;
1645
1646        /* Non-escape characters are interpreted as Unicode ordinals */
1647        if (*s != '\\') {
1648            *p++ = (unsigned char) *s++;
1649            continue;
1650        }
1651
1652        startinpos = s-starts;
1653        /* \ - Escapes */
1654        s++;
1655        switch (*s++) {
1656
1657        /* \x escapes */
1658        case '\n': break;
1659        case '\\': *p++ = '\\'; break;
1660        case '\'': *p++ = '\''; break;
1661        case '\"': *p++ = '\"'; break;
1662        case 'b': *p++ = '\b'; break;
1663        case 'f': *p++ = '\014'; break; /* FF */
1664        case 't': *p++ = '\t'; break;
1665        case 'n': *p++ = '\n'; break;
1666        case 'r': *p++ = '\r'; break;
1667        case 'v': *p++ = '\013'; break; /* VT */
1668        case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1669
1670        /* \OOO (octal) escapes */
1671        case '0': case '1': case '2': case '3':
1672        case '4': case '5': case '6': case '7':
1673            x = s[-1] - '0';
1674            if ('0' <= *s && *s <= '7') {
1675                x = (x<<3) + *s++ - '0';
1676                if ('0' <= *s && *s <= '7')
1677                    x = (x<<3) + *s++ - '0';
1678            }
1679            *p++ = x;
1680            break;
1681
1682        /* hex escapes */
1683        /* \xXX */
1684        case 'x':
1685            digits = 2;
1686            message = "truncated \\xXX escape";
1687            goto hexescape;
1688
1689        /* \uXXXX */
1690        case 'u':
1691            digits = 4;
1692            message = "truncated \\uXXXX escape";
1693            goto hexescape;
1694
1695        /* \UXXXXXXXX */
1696        case 'U':
1697            digits = 8;
1698            message = "truncated \\UXXXXXXXX escape";
1699        hexescape:
1700            chr = 0;
1701            outpos = p-PyUnicode_AS_UNICODE(v);
1702            if (s+digits>end) {
1703                endinpos = size;
1704                if (unicode_decode_call_errorhandler(
1705                    errors, &errorHandler,
1706                    "unicodeescape", "end of string in escape sequence",
1707                    starts, size, &startinpos, &endinpos, &exc, &s,
1708                    (PyObject **)&v, &outpos, &p))
1709                    goto onError;
1710                goto nextByte;
1711            }
1712            for (i = 0; i < digits; ++i) {
1713                c = (unsigned char) s[i];
1714                if (!isxdigit(c)) {
1715                    endinpos = (s+i+1)-starts;
1716                    if (unicode_decode_call_errorhandler(
1717                        errors, &errorHandler,
1718                        "unicodeescape", message,
1719                        starts, size, &startinpos, &endinpos, &exc, &s,
1720                        (PyObject **)&v, &outpos, &p))
1721                        goto onError;
1722                    goto nextByte;
1723                }
1724                chr = (chr<<4) & ~0xF;
1725                if (c >= '0' && c <= '9')
1726                    chr += c - '0';
1727                else if (c >= 'a' && c <= 'f')
1728                    chr += 10 + c - 'a';
1729                else
1730                    chr += 10 + c - 'A';
1731            }
1732            s += i;
1733            if (chr == 0xffffffff)
1734                /* _decoding_error will have already written into the
1735                   target buffer. */
1736                break;
1737        store:
1738            /* when we get here, chr is a 32-bit unicode character */
1739            if (chr <= 0xffff)
1740                /* UCS-2 character */
1741                *p++ = (Py_UNICODE) chr;
1742            else if (chr <= 0x10ffff) {
1743                /* UCS-4 character. Either store directly, or as
1744                   surrogate pair. */
1745#ifdef Py_UNICODE_WIDE
1746                *p++ = chr;
1747#else
1748                chr -= 0x10000L;
1749                *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1750                *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
1751#endif
1752            } else {
1753                endinpos = s-starts;
1754                outpos = p-PyUnicode_AS_UNICODE(v);
1755                if (unicode_decode_call_errorhandler(
1756                    errors, &errorHandler,
1757                    "unicodeescape", "illegal Unicode character",
1758                    starts, size, &startinpos, &endinpos, &exc, &s,
1759                    (PyObject **)&v, &outpos, &p))
1760                    goto onError;
1761            }
1762            break;
1763
1764        /* \N{name} */
1765        case 'N':
1766            message = "malformed \\N character escape";
1767            if (ucnhash_CAPI == NULL) {
1768                /* load the unicode data module */
1769                PyObject *m, *v;
1770                m = PyImport_ImportModule("unicodedata");
1771                if (m == NULL)
1772                    goto ucnhashError;
1773                v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1774                Py_DECREF(m);
1775                if (v == NULL)
1776                    goto ucnhashError;
1777                ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1778                Py_DECREF(v);
1779                if (ucnhash_CAPI == NULL)
1780                    goto ucnhashError;
1781            }
1782            if (*s == '{') {
1783                const char *start = s+1;
1784                /* look for the closing brace */
1785                while (*s != '}' && s < end)
1786                    s++;
1787                if (s > start && s < end && *s == '}') {
1788                    /* found a name.  look it up in the unicode database */
1789                    message = "unknown Unicode character name";
1790                    s++;
1791                    if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1792                        goto store;
1793                }
1794            }
1795            endinpos = s-starts;
1796            outpos = p-PyUnicode_AS_UNICODE(v);
1797            if (unicode_decode_call_errorhandler(
1798                errors, &errorHandler,
1799                "unicodeescape", message,
1800                starts, size, &startinpos, &endinpos, &exc, &s,
1801                (PyObject **)&v, &outpos, &p))
1802                goto onError;
1803            break;
1804
1805        default:
1806            if (s > end) {
1807                message = "\\ at end of string";
1808                s--;
1809                endinpos = s-starts;
1810                outpos = p-PyUnicode_AS_UNICODE(v);
1811                if (unicode_decode_call_errorhandler(
1812                    errors, &errorHandler,
1813                    "unicodeescape", message,
1814                    starts, size, &startinpos, &endinpos, &exc, &s,
1815                    (PyObject **)&v, &outpos, &p))
1816                    goto onError;
1817            }
1818            else {
1819                *p++ = '\\';
1820                *p++ = (unsigned char)s[-1];
1821            }
1822            break;
1823        }
1824        nextByte:
1825        ;
1826    }
1827    if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
1828        goto onError;
1829    return (PyObject *)v;
1830
1831ucnhashError:
1832    PyErr_SetString(
1833        PyExc_UnicodeError,
1834        "\\N escapes not supported (can't load unicodedata module)"
1835        );
1836    Py_XDECREF(errorHandler);
1837    Py_XDECREF(exc);
1838    return NULL;
1839
1840onError:
1841    Py_XDECREF(v);
1842    Py_XDECREF(errorHandler);
1843    Py_XDECREF(exc);
1844    return NULL;
1845}
1846
1847/* Return a Unicode-Escape string version of the Unicode object.
1848
1849   If quotes is true, the string is enclosed in u"" or u'' quotes as
1850   appropriate.
1851
1852*/
1853
1854static const Py_UNICODE *findchar(const Py_UNICODE *s,
1855				  int size,
1856				  Py_UNICODE ch);
1857
1858static
1859PyObject *unicodeescape_string(const Py_UNICODE *s,
1860                               int size,
1861                               int quotes)
1862{
1863    PyObject *repr;
1864    char *p;
1865
1866    static const char *hexdigit = "0123456789abcdef";
1867
1868    repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1869    if (repr == NULL)
1870        return NULL;
1871
1872    p = PyString_AS_STRING(repr);
1873
1874    if (quotes) {
1875        *p++ = 'u';
1876        *p++ = (findchar(s, size, '\'') &&
1877                !findchar(s, size, '"')) ? '"' : '\'';
1878    }
1879    while (size-- > 0) {
1880        Py_UNICODE ch = *s++;
1881
1882        /* Escape quotes */
1883        if (quotes &&
1884	    (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
1885            *p++ = '\\';
1886            *p++ = (char) ch;
1887	    continue;
1888        }
1889
1890#ifdef Py_UNICODE_WIDE
1891        /* Map 21-bit characters to '\U00xxxxxx' */
1892        else if (ch >= 0x10000) {
1893	    int offset = p - PyString_AS_STRING(repr);
1894
1895	    /* Resize the string if necessary */
1896	    if (offset + 12 > PyString_GET_SIZE(repr)) {
1897		if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
1898		    return NULL;
1899		p = PyString_AS_STRING(repr) + offset;
1900	    }
1901
1902            *p++ = '\\';
1903            *p++ = 'U';
1904            *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1905            *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1906            *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1907            *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1908            *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1909            *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1910            *p++ = hexdigit[(ch >> 4) & 0x0000000F];
1911            *p++ = hexdigit[ch & 0x0000000F];
1912	    continue;
1913        }
1914#endif
1915	/* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1916	else if (ch >= 0xD800 && ch < 0xDC00) {
1917	    Py_UNICODE ch2;
1918	    Py_UCS4 ucs;
1919
1920	    ch2 = *s++;
1921	    size--;
1922	    if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1923		ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1924		*p++ = '\\';
1925		*p++ = 'U';
1926		*p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1927		*p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1928		*p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1929		*p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1930		*p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1931		*p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1932		*p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1933		*p++ = hexdigit[ucs & 0x0000000F];
1934		continue;
1935	    }
1936	    /* Fall through: isolated surrogates are copied as-is */
1937	    s--;
1938	    size++;
1939	}
1940
1941        /* Map 16-bit characters to '\uxxxx' */
1942        if (ch >= 256) {
1943            *p++ = '\\';
1944            *p++ = 'u';
1945            *p++ = hexdigit[(ch >> 12) & 0x000F];
1946            *p++ = hexdigit[(ch >> 8) & 0x000F];
1947            *p++ = hexdigit[(ch >> 4) & 0x000F];
1948            *p++ = hexdigit[ch & 0x000F];
1949        }
1950
1951        /* Map special whitespace to '\t', \n', '\r' */
1952        else if (ch == '\t') {
1953            *p++ = '\\';
1954            *p++ = 't';
1955        }
1956        else if (ch == '\n') {
1957            *p++ = '\\';
1958            *p++ = 'n';
1959        }
1960        else if (ch == '\r') {
1961            *p++ = '\\';
1962            *p++ = 'r';
1963        }
1964
1965        /* Map non-printable US ASCII to '\xhh' */
1966        else if (ch < ' ' || ch >= 0x7F) {
1967            *p++ = '\\';
1968            *p++ = 'x';
1969            *p++ = hexdigit[(ch >> 4) & 0x000F];
1970            *p++ = hexdigit[ch & 0x000F];
1971        }
1972
1973        /* Copy everything else as-is */
1974        else
1975            *p++ = (char) ch;
1976    }
1977    if (quotes)
1978        *p++ = PyString_AS_STRING(repr)[1];
1979
1980    *p = '\0';
1981    _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
1982    return repr;
1983}
1984
1985PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1986					int size)
1987{
1988    return unicodeescape_string(s, size, 0);
1989}
1990
1991PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1992{
1993    if (!PyUnicode_Check(unicode)) {
1994        PyErr_BadArgument();
1995        return NULL;
1996    }
1997    return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1998					 PyUnicode_GET_SIZE(unicode));
1999}
2000
2001/* --- Raw Unicode Escape Codec ------------------------------------------- */
2002
2003PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2004					   int size,
2005					   const char *errors)
2006{
2007    const char *starts = s;
2008    int startinpos;
2009    int endinpos;
2010    int outpos;
2011    PyUnicodeObject *v;
2012    Py_UNICODE *p;
2013    const char *end;
2014    const char *bs;
2015    PyObject *errorHandler = NULL;
2016    PyObject *exc = NULL;
2017
2018    /* Escaped strings will always be longer than the resulting
2019       Unicode string, so we start with size here and then reduce the
2020       length after conversion to the true value. (But decoding error
2021       handler might have to resize the string) */
2022    v = _PyUnicode_New(size);
2023    if (v == NULL)
2024	goto onError;
2025    if (size == 0)
2026	return (PyObject *)v;
2027    p = PyUnicode_AS_UNICODE(v);
2028    end = s + size;
2029    while (s < end) {
2030	unsigned char c;
2031	Py_UCS4 x;
2032	int i;
2033        int count;
2034
2035	/* Non-escape characters are interpreted as Unicode ordinals */
2036	if (*s != '\\') {
2037	    *p++ = (unsigned char)*s++;
2038	    continue;
2039	}
2040	startinpos = s-starts;
2041
2042	/* \u-escapes are only interpreted iff the number of leading
2043	   backslashes if odd */
2044	bs = s;
2045	for (;s < end;) {
2046	    if (*s != '\\')
2047		break;
2048	    *p++ = (unsigned char)*s++;
2049	}
2050	if (((s - bs) & 1) == 0 ||
2051	    s >= end ||
2052	    (*s != 'u' && *s != 'U')) {
2053	    continue;
2054	}
2055	p--;
2056        count = *s=='u' ? 4 : 8;
2057	s++;
2058
2059	/* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
2060	outpos = p-PyUnicode_AS_UNICODE(v);
2061	for (x = 0, i = 0; i < count; ++i, ++s) {
2062	    c = (unsigned char)*s;
2063	    if (!isxdigit(c)) {
2064		endinpos = s-starts;
2065		if (unicode_decode_call_errorhandler(
2066		    errors, &errorHandler,
2067		    "rawunicodeescape", "truncated \\uXXXX",
2068		    starts, size, &startinpos, &endinpos, &exc, &s,
2069		    (PyObject **)&v, &outpos, &p))
2070		    goto onError;
2071		goto nextByte;
2072	    }
2073	    x = (x<<4) & ~0xF;
2074	    if (c >= '0' && c <= '9')
2075		x += c - '0';
2076	    else if (c >= 'a' && c <= 'f')
2077		x += 10 + c - 'a';
2078	    else
2079		x += 10 + c - 'A';
2080	}
2081#ifndef Py_UNICODE_WIDE
2082        if (x > 0x10000) {
2083            if (unicode_decode_call_errorhandler(
2084                    errors, &errorHandler,
2085                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
2086		    starts, size, &startinpos, &endinpos, &exc, &s,
2087		    (PyObject **)&v, &outpos, &p))
2088		    goto onError;
2089        }
2090#endif
2091	*p++ = x;
2092	nextByte:
2093	;
2094    }
2095    if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2096	goto onError;
2097    Py_XDECREF(errorHandler);
2098    Py_XDECREF(exc);
2099    return (PyObject *)v;
2100
2101 onError:
2102    Py_XDECREF(v);
2103    Py_XDECREF(errorHandler);
2104    Py_XDECREF(exc);
2105    return NULL;
2106}
2107
2108PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2109					   int size)
2110{
2111    PyObject *repr;
2112    char *p;
2113    char *q;
2114
2115    static const char *hexdigit = "0123456789abcdef";
2116
2117#ifdef Py_UNICODE_WIDE
2118    repr = PyString_FromStringAndSize(NULL, 10 * size);
2119#else
2120    repr = PyString_FromStringAndSize(NULL, 6 * size);
2121#endif
2122    if (repr == NULL)
2123        return NULL;
2124    if (size == 0)
2125	return repr;
2126
2127    p = q = PyString_AS_STRING(repr);
2128    while (size-- > 0) {
2129        Py_UNICODE ch = *s++;
2130#ifdef Py_UNICODE_WIDE
2131	/* Map 32-bit characters to '\Uxxxxxxxx' */
2132	if (ch >= 0x10000) {
2133            *p++ = '\\';
2134            *p++ = 'U';
2135            *p++ = hexdigit[(ch >> 28) & 0xf];
2136            *p++ = hexdigit[(ch >> 24) & 0xf];
2137            *p++ = hexdigit[(ch >> 20) & 0xf];
2138            *p++ = hexdigit[(ch >> 16) & 0xf];
2139            *p++ = hexdigit[(ch >> 12) & 0xf];
2140            *p++ = hexdigit[(ch >> 8) & 0xf];
2141            *p++ = hexdigit[(ch >> 4) & 0xf];
2142            *p++ = hexdigit[ch & 15];
2143        }
2144        else
2145#endif
2146	/* Map 16-bit characters to '\uxxxx' */
2147	if (ch >= 256) {
2148            *p++ = '\\';
2149            *p++ = 'u';
2150            *p++ = hexdigit[(ch >> 12) & 0xf];
2151            *p++ = hexdigit[(ch >> 8) & 0xf];
2152            *p++ = hexdigit[(ch >> 4) & 0xf];
2153            *p++ = hexdigit[ch & 15];
2154        }
2155	/* Copy everything else as-is */
2156	else
2157            *p++ = (char) ch;
2158    }
2159    *p = '\0';
2160    _PyString_Resize(&repr, p - q);
2161    return repr;
2162}
2163
2164PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2165{
2166    if (!PyUnicode_Check(unicode)) {
2167	PyErr_BadArgument();
2168	return NULL;
2169    }
2170    return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2171					    PyUnicode_GET_SIZE(unicode));
2172}
2173
2174/* --- Latin-1 Codec ------------------------------------------------------ */
2175
2176PyObject *PyUnicode_DecodeLatin1(const char *s,
2177				 int size,
2178				 const char *errors)
2179{
2180    PyUnicodeObject *v;
2181    Py_UNICODE *p;
2182
2183    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
2184    if (size == 1 && *(unsigned char*)s < 256) {
2185	Py_UNICODE r = *(unsigned char*)s;
2186	return PyUnicode_FromUnicode(&r, 1);
2187    }
2188
2189    v = _PyUnicode_New(size);
2190    if (v == NULL)
2191	goto onError;
2192    if (size == 0)
2193	return (PyObject *)v;
2194    p = PyUnicode_AS_UNICODE(v);
2195    while (size-- > 0)
2196	*p++ = (unsigned char)*s++;
2197    return (PyObject *)v;
2198
2199 onError:
2200    Py_XDECREF(v);
2201    return NULL;
2202}
2203
2204/* create or adjust a UnicodeEncodeError */
2205static void make_encode_exception(PyObject **exceptionObject,
2206    const char *encoding,
2207    const Py_UNICODE *unicode, int size,
2208    int startpos, int endpos,
2209    const char *reason)
2210{
2211    if (*exceptionObject == NULL) {
2212	*exceptionObject = PyUnicodeEncodeError_Create(
2213	    encoding, unicode, size, startpos, endpos, reason);
2214    }
2215    else {
2216	if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2217	    goto onError;
2218	if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2219	    goto onError;
2220	if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2221	    goto onError;
2222	return;
2223	onError:
2224	Py_DECREF(*exceptionObject);
2225	*exceptionObject = NULL;
2226    }
2227}
2228
2229/* raises a UnicodeEncodeError */
2230static void raise_encode_exception(PyObject **exceptionObject,
2231    const char *encoding,
2232    const Py_UNICODE *unicode, int size,
2233    int startpos, int endpos,
2234    const char *reason)
2235{
2236    make_encode_exception(exceptionObject,
2237	encoding, unicode, size, startpos, endpos, reason);
2238    if (*exceptionObject != NULL)
2239	PyCodec_StrictErrors(*exceptionObject);
2240}
2241
2242/* error handling callback helper:
2243   build arguments, call the callback and check the arguments,
2244   put the result into newpos and return the replacement string, which
2245   has to be freed by the caller */
2246static PyObject *unicode_encode_call_errorhandler(const char *errors,
2247    PyObject **errorHandler,
2248    const char *encoding, const char *reason,
2249    const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
2250    int startpos, int endpos,
2251    int *newpos)
2252{
2253    static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple";
2254
2255    PyObject *restuple;
2256    PyObject *resunicode;
2257
2258    if (*errorHandler == NULL) {
2259	*errorHandler = PyCodec_LookupError(errors);
2260        if (*errorHandler == NULL)
2261	    return NULL;
2262    }
2263
2264    make_encode_exception(exceptionObject,
2265	encoding, unicode, size, startpos, endpos, reason);
2266    if (*exceptionObject == NULL)
2267	return NULL;
2268
2269    restuple = PyObject_CallFunctionObjArgs(
2270	*errorHandler, *exceptionObject, NULL);
2271    if (restuple == NULL)
2272	return NULL;
2273    if (!PyTuple_Check(restuple)) {
2274	PyErr_Format(PyExc_TypeError, &argparse[4]);
2275	Py_DECREF(restuple);
2276	return NULL;
2277    }
2278    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2279	&resunicode, newpos)) {
2280	Py_DECREF(restuple);
2281	return NULL;
2282    }
2283    if (*newpos<0)
2284	*newpos = size+*newpos;
2285    if (*newpos<0 || *newpos>size) {
2286	PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
2287	Py_DECREF(restuple);
2288	return NULL;
2289    }
2290    Py_INCREF(resunicode);
2291    Py_DECREF(restuple);
2292    return resunicode;
2293}
2294
2295static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2296				 int size,
2297				 const char *errors,
2298				 int limit)
2299{
2300    /* output object */
2301    PyObject *res;
2302    /* pointers to the beginning and end+1 of input */
2303    const Py_UNICODE *startp = p;
2304    const Py_UNICODE *endp = p + size;
2305    /* pointer to the beginning of the unencodable characters */
2306    /* const Py_UNICODE *badp = NULL; */
2307    /* pointer into the output */
2308    char *str;
2309    /* current output position */
2310    int respos = 0;
2311    int ressize;
2312    char *encoding = (limit == 256) ? "latin-1" : "ascii";
2313    char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2314    PyObject *errorHandler = NULL;
2315    PyObject *exc = NULL;
2316    /* the following variable is used for caching string comparisons
2317     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2318    int known_errorHandler = -1;
2319
2320    /* allocate enough for a simple encoding without
2321       replacements, if we need more, we'll resize */
2322    res = PyString_FromStringAndSize(NULL, size);
2323    if (res == NULL)
2324        goto onError;
2325    if (size == 0)
2326	return res;
2327    str = PyString_AS_STRING(res);
2328    ressize = size;
2329
2330    while (p<endp) {
2331	Py_UNICODE c = *p;
2332
2333	/* can we encode this? */
2334	if (c<limit) {
2335	    /* no overflow check, because we know that the space is enough */
2336	    *str++ = (char)c;
2337	    ++p;
2338	}
2339	else {
2340	    int unicodepos = p-startp;
2341	    int requiredsize;
2342	    PyObject *repunicode;
2343	    int repsize;
2344	    int newpos;
2345	    int respos;
2346	    Py_UNICODE *uni2;
2347	    /* startpos for collecting unencodable chars */
2348	    const Py_UNICODE *collstart = p;
2349	    const Py_UNICODE *collend = p;
2350	    /* find all unecodable characters */
2351	    while ((collend < endp) && ((*collend)>=limit))
2352		++collend;
2353	    /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2354	    if (known_errorHandler==-1) {
2355		if ((errors==NULL) || (!strcmp(errors, "strict")))
2356		    known_errorHandler = 1;
2357		else if (!strcmp(errors, "replace"))
2358		    known_errorHandler = 2;
2359		else if (!strcmp(errors, "ignore"))
2360		    known_errorHandler = 3;
2361		else if (!strcmp(errors, "xmlcharrefreplace"))
2362		    known_errorHandler = 4;
2363		else
2364		    known_errorHandler = 0;
2365	    }
2366	    switch (known_errorHandler) {
2367		case 1: /* strict */
2368		    raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2369		    goto onError;
2370		case 2: /* replace */
2371		    while (collstart++<collend)
2372			*str++ = '?'; /* fall through */
2373		case 3: /* ignore */
2374		    p = collend;
2375		    break;
2376		case 4: /* xmlcharrefreplace */
2377		    respos = str-PyString_AS_STRING(res);
2378		    /* determine replacement size (temporarily (mis)uses p) */
2379		    for (p = collstart, repsize = 0; p < collend; ++p) {
2380			if (*p<10)
2381			    repsize += 2+1+1;
2382			else if (*p<100)
2383			    repsize += 2+2+1;
2384			else if (*p<1000)
2385			    repsize += 2+3+1;
2386			else if (*p<10000)
2387			    repsize += 2+4+1;
2388			else if (*p<100000)
2389			    repsize += 2+5+1;
2390			else if (*p<1000000)
2391			    repsize += 2+6+1;
2392			else
2393			    repsize += 2+7+1;
2394		    }
2395		    requiredsize = respos+repsize+(endp-collend);
2396		    if (requiredsize > ressize) {
2397			if (requiredsize<2*ressize)
2398			    requiredsize = 2*ressize;
2399			if (_PyString_Resize(&res, requiredsize))
2400			    goto onError;
2401			str = PyString_AS_STRING(res) + respos;
2402			ressize = requiredsize;
2403		    }
2404		    /* generate replacement (temporarily (mis)uses p) */
2405		    for (p = collstart; p < collend; ++p) {
2406			str += sprintf(str, "&#%d;", (int)*p);
2407		    }
2408		    p = collend;
2409		    break;
2410		default:
2411		    repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2412			encoding, reason, startp, size, &exc,
2413			collstart-startp, collend-startp, &newpos);
2414		    if (repunicode == NULL)
2415			goto onError;
2416		    /* need more space? (at least enough for what we
2417		       have+the replacement+the rest of the string, so
2418		       we won't have to check space for encodable characters) */
2419		    respos = str-PyString_AS_STRING(res);
2420		    repsize = PyUnicode_GET_SIZE(repunicode);
2421		    requiredsize = respos+repsize+(endp-collend);
2422		    if (requiredsize > ressize) {
2423			if (requiredsize<2*ressize)
2424			    requiredsize = 2*ressize;
2425			if (_PyString_Resize(&res, requiredsize)) {
2426			    Py_DECREF(repunicode);
2427			    goto onError;
2428			}
2429			str = PyString_AS_STRING(res) + respos;
2430			ressize = requiredsize;
2431		    }
2432		    /* check if there is anything unencodable in the replacement
2433		       and copy it to the output */
2434		    for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2435			c = *uni2;
2436			if (c >= limit) {
2437			    raise_encode_exception(&exc, encoding, startp, size,
2438				unicodepos, unicodepos+1, reason);
2439			    Py_DECREF(repunicode);
2440			    goto onError;
2441			}
2442			*str = (char)c;
2443		    }
2444		    p = startp + newpos;
2445		    Py_DECREF(repunicode);
2446	    }
2447	}
2448    }
2449    /* Resize if we allocated to much */
2450    respos = str-PyString_AS_STRING(res);
2451    if (respos<ressize)
2452       /* If this falls res will be NULL */
2453	_PyString_Resize(&res, respos);
2454    Py_XDECREF(errorHandler);
2455    Py_XDECREF(exc);
2456    return res;
2457
2458    onError:
2459    Py_XDECREF(res);
2460    Py_XDECREF(errorHandler);
2461    Py_XDECREF(exc);
2462    return NULL;
2463}
2464
2465PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2466				 int size,
2467				 const char *errors)
2468{
2469    return unicode_encode_ucs1(p, size, errors, 256);
2470}
2471
2472PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2473{
2474    if (!PyUnicode_Check(unicode)) {
2475	PyErr_BadArgument();
2476	return NULL;
2477    }
2478    return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2479				  PyUnicode_GET_SIZE(unicode),
2480				  NULL);
2481}
2482
2483/* --- 7-bit ASCII Codec -------------------------------------------------- */
2484
2485PyObject *PyUnicode_DecodeASCII(const char *s,
2486				int size,
2487				const char *errors)
2488{
2489    const char *starts = s;
2490    PyUnicodeObject *v;
2491    Py_UNICODE *p;
2492    int startinpos;
2493    int endinpos;
2494    int outpos;
2495    const char *e;
2496    PyObject *errorHandler = NULL;
2497    PyObject *exc = NULL;
2498
2499    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
2500    if (size == 1 && *(unsigned char*)s < 128) {
2501	Py_UNICODE r = *(unsigned char*)s;
2502	return PyUnicode_FromUnicode(&r, 1);
2503    }
2504
2505    v = _PyUnicode_New(size);
2506    if (v == NULL)
2507	goto onError;
2508    if (size == 0)
2509	return (PyObject *)v;
2510    p = PyUnicode_AS_UNICODE(v);
2511    e = s + size;
2512    while (s < e) {
2513	register unsigned char c = (unsigned char)*s;
2514	if (c < 128) {
2515	    *p++ = c;
2516	    ++s;
2517	}
2518	else {
2519	    startinpos = s-starts;
2520	    endinpos = startinpos + 1;
2521	    outpos = p-PyUnicode_AS_UNICODE(v);
2522	    if (unicode_decode_call_errorhandler(
2523		 errors, &errorHandler,
2524		 "ascii", "ordinal not in range(128)",
2525		 starts, size, &startinpos, &endinpos, &exc, &s,
2526		 (PyObject **)&v, &outpos, &p))
2527		goto onError;
2528	}
2529    }
2530    if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
2531	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2532	    goto onError;
2533    Py_XDECREF(errorHandler);
2534    Py_XDECREF(exc);
2535    return (PyObject *)v;
2536
2537 onError:
2538    Py_XDECREF(v);
2539    Py_XDECREF(errorHandler);
2540    Py_XDECREF(exc);
2541    return NULL;
2542}
2543
2544PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2545				int size,
2546				const char *errors)
2547{
2548    return unicode_encode_ucs1(p, size, errors, 128);
2549}
2550
2551PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2552{
2553    if (!PyUnicode_Check(unicode)) {
2554	PyErr_BadArgument();
2555	return NULL;
2556    }
2557    return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2558				 PyUnicode_GET_SIZE(unicode),
2559				 NULL);
2560}
2561
2562#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
2563
2564/* --- MBCS codecs for Windows -------------------------------------------- */
2565
2566PyObject *PyUnicode_DecodeMBCS(const char *s,
2567				int size,
2568				const char *errors)
2569{
2570    PyUnicodeObject *v;
2571    Py_UNICODE *p;
2572
2573    /* First get the size of the result */
2574    DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2575    if (size > 0 && usize==0)
2576        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2577
2578    v = _PyUnicode_New(usize);
2579    if (v == NULL)
2580        return NULL;
2581    if (usize == 0)
2582	return (PyObject *)v;
2583    p = PyUnicode_AS_UNICODE(v);
2584    if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2585        Py_DECREF(v);
2586        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2587    }
2588
2589    return (PyObject *)v;
2590}
2591
2592PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2593				int size,
2594				const char *errors)
2595{
2596    PyObject *repr;
2597    char *s;
2598    DWORD mbcssize;
2599
2600    /* If there are no characters, bail now! */
2601    if (size==0)
2602	    return PyString_FromString("");
2603
2604    /* First get the size of the result */
2605    mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
2606    if (mbcssize==0)
2607        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2608
2609    repr = PyString_FromStringAndSize(NULL, mbcssize);
2610    if (repr == NULL)
2611        return NULL;
2612    if (mbcssize == 0)
2613        return repr;
2614
2615    /* Do the conversion */
2616    s = PyString_AS_STRING(repr);
2617    if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2618        Py_DECREF(repr);
2619        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2620    }
2621    return repr;
2622}
2623
2624#endif /* MS_WINDOWS */
2625
2626/* --- Character Mapping Codec -------------------------------------------- */
2627
2628PyObject *PyUnicode_DecodeCharmap(const char *s,
2629				  int size,
2630				  PyObject *mapping,
2631				  const char *errors)
2632{
2633    const char *starts = s;
2634    int startinpos;
2635    int endinpos;
2636    int outpos;
2637    const char *e;
2638    PyUnicodeObject *v;
2639    Py_UNICODE *p;
2640    int extrachars = 0;
2641    PyObject *errorHandler = NULL;
2642    PyObject *exc = NULL;
2643
2644    /* Default to Latin-1 */
2645    if (mapping == NULL)
2646	return PyUnicode_DecodeLatin1(s, size, errors);
2647
2648    v = _PyUnicode_New(size);
2649    if (v == NULL)
2650	goto onError;
2651    if (size == 0)
2652	return (PyObject *)v;
2653    p = PyUnicode_AS_UNICODE(v);
2654    e = s + size;
2655    while (s < e) {
2656	unsigned char ch = *s;
2657	PyObject *w, *x;
2658
2659	/* Get mapping (char ordinal -> integer, Unicode char or None) */
2660	w = PyInt_FromLong((long)ch);
2661	if (w == NULL)
2662	    goto onError;
2663	x = PyObject_GetItem(mapping, w);
2664	Py_DECREF(w);
2665	if (x == NULL) {
2666	    if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2667		/* No mapping found means: mapping is undefined. */
2668		PyErr_Clear();
2669		x = Py_None;
2670		Py_INCREF(x);
2671	    } else
2672		goto onError;
2673	}
2674
2675	/* Apply mapping */
2676	if (PyInt_Check(x)) {
2677	    long value = PyInt_AS_LONG(x);
2678	    if (value < 0 || value > 65535) {
2679		PyErr_SetString(PyExc_TypeError,
2680				"character mapping must be in range(65536)");
2681		Py_DECREF(x);
2682		goto onError;
2683	    }
2684	    *p++ = (Py_UNICODE)value;
2685	}
2686	else if (x == Py_None) {
2687	    /* undefined mapping */
2688	    outpos = p-PyUnicode_AS_UNICODE(v);
2689	    startinpos = s-starts;
2690	    endinpos = startinpos+1;
2691	    if (unicode_decode_call_errorhandler(
2692		 errors, &errorHandler,
2693		 "charmap", "character maps to <undefined>",
2694		 starts, size, &startinpos, &endinpos, &exc, &s,
2695		 (PyObject **)&v, &outpos, &p)) {
2696		Py_DECREF(x);
2697		goto onError;
2698	    }
2699	    continue;
2700	}
2701	else if (PyUnicode_Check(x)) {
2702	    int targetsize = PyUnicode_GET_SIZE(x);
2703
2704	    if (targetsize == 1)
2705		/* 1-1 mapping */
2706		*p++ = *PyUnicode_AS_UNICODE(x);
2707
2708	    else if (targetsize > 1) {
2709		/* 1-n mapping */
2710		if (targetsize > extrachars) {
2711		    /* resize first */
2712		    int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2713		    int needed = (targetsize - extrachars) + \
2714			         (targetsize << 2);
2715		    extrachars += needed;
2716		    if (_PyUnicode_Resize(&v,
2717					 PyUnicode_GET_SIZE(v) + needed)) {
2718			Py_DECREF(x);
2719			goto onError;
2720		    }
2721		    p = PyUnicode_AS_UNICODE(v) + oldpos;
2722		}
2723		Py_UNICODE_COPY(p,
2724				PyUnicode_AS_UNICODE(x),
2725				targetsize);
2726		p += targetsize;
2727		extrachars -= targetsize;
2728	    }
2729	    /* 1-0 mapping: skip the character */
2730	}
2731	else {
2732	    /* wrong return value */
2733	    PyErr_SetString(PyExc_TypeError,
2734		  "character mapping must return integer, None or unicode");
2735	    Py_DECREF(x);
2736	    goto onError;
2737	}
2738	Py_DECREF(x);
2739	++s;
2740    }
2741    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2742	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2743	    goto onError;
2744    Py_XDECREF(errorHandler);
2745    Py_XDECREF(exc);
2746    return (PyObject *)v;
2747
2748 onError:
2749    Py_XDECREF(errorHandler);
2750    Py_XDECREF(exc);
2751    Py_XDECREF(v);
2752    return NULL;
2753}
2754
2755/* Lookup the character ch in the mapping. If the character
2756   can't be found, Py_None is returned (or NULL, if another
2757   error occured). */
2758static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
2759{
2760    PyObject *w = PyInt_FromLong((long)c);
2761    PyObject *x;
2762
2763    if (w == NULL)
2764	 return NULL;
2765    x = PyObject_GetItem(mapping, w);
2766    Py_DECREF(w);
2767    if (x == NULL) {
2768	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2769	    /* No mapping found means: mapping is undefined. */
2770	    PyErr_Clear();
2771	    x = Py_None;
2772	    Py_INCREF(x);
2773	    return x;
2774	} else
2775	    return NULL;
2776    }
2777    else if (x == Py_None)
2778	return x;
2779    else if (PyInt_Check(x)) {
2780	long value = PyInt_AS_LONG(x);
2781	if (value < 0 || value > 255) {
2782	    PyErr_SetString(PyExc_TypeError,
2783			     "character mapping must be in range(256)");
2784	    Py_DECREF(x);
2785	    return NULL;
2786	}
2787	return x;
2788    }
2789    else if (PyString_Check(x))
2790	return x;
2791    else {
2792	/* wrong return value */
2793	PyErr_SetString(PyExc_TypeError,
2794	      "character mapping must return integer, None or str");
2795	Py_DECREF(x);
2796	return NULL;
2797    }
2798}
2799
2800/* lookup the character, put the result in the output string and adjust
2801   various state variables. Reallocate the output string if not enough
2802   space is available. Return a new reference to the object that
2803   was put in the output buffer, or Py_None, if the mapping was undefined
2804   (in which case no character was written) or NULL, if a
2805   reallocation error ocurred. The called must decref the result */
2806static
2807PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
2808    PyObject **outobj, int *outpos)
2809{
2810    PyObject *rep = charmapencode_lookup(c, mapping);
2811
2812    if (rep==NULL)
2813	return NULL;
2814    else if (rep==Py_None)
2815	return rep;
2816    else {
2817	char *outstart = PyString_AS_STRING(*outobj);
2818	int outsize = PyString_GET_SIZE(*outobj);
2819	if (PyInt_Check(rep)) {
2820	    int requiredsize = *outpos+1;
2821	    if (outsize<requiredsize) {
2822		/* exponentially overallocate to minimize reallocations */
2823		if (requiredsize < 2*outsize)
2824		    requiredsize = 2*outsize;
2825		if (_PyString_Resize(outobj, requiredsize)) {
2826		    Py_DECREF(rep);
2827		    return NULL;
2828		}
2829		outstart = PyString_AS_STRING(*outobj);
2830	    }
2831	    outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
2832	}
2833	else {
2834	    const char *repchars = PyString_AS_STRING(rep);
2835	    int repsize = PyString_GET_SIZE(rep);
2836	    int requiredsize = *outpos+repsize;
2837	    if (outsize<requiredsize) {
2838		/* exponentially overallocate to minimize reallocations */
2839		if (requiredsize < 2*outsize)
2840		    requiredsize = 2*outsize;
2841		if (_PyString_Resize(outobj, requiredsize)) {
2842		    Py_DECREF(rep);
2843		    return NULL;
2844		}
2845		outstart = PyString_AS_STRING(*outobj);
2846	    }
2847	    memcpy(outstart + *outpos, repchars, repsize);
2848	    *outpos += repsize;
2849	}
2850    }
2851    return rep;
2852}
2853
2854/* handle an error in PyUnicode_EncodeCharmap
2855   Return 0 on success, -1 on error */
2856static
2857int charmap_encoding_error(
2858    const Py_UNICODE *p, int size, int *inpos, PyObject *mapping,
2859    PyObject **exceptionObject,
2860    int *known_errorHandler, PyObject *errorHandler, const char *errors,
2861    PyObject **res, int *respos)
2862{
2863    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
2864    int repsize;
2865    int newpos;
2866    Py_UNICODE *uni2;
2867    /* startpos for collecting unencodable chars */
2868    int collstartpos = *inpos;
2869    int collendpos = *inpos+1;
2870    int collpos;
2871    char *encoding = "charmap";
2872    char *reason = "character maps to <undefined>";
2873
2874    PyObject *x;
2875    /* find all unencodable characters */
2876    while (collendpos < size) {
2877	x = charmapencode_lookup(p[collendpos], mapping);
2878	if (x==NULL)
2879	    return -1;
2880	else if (x!=Py_None) {
2881	    Py_DECREF(x);
2882	    break;
2883	}
2884	Py_DECREF(x);
2885	++collendpos;
2886    }
2887    /* cache callback name lookup
2888     * (if not done yet, i.e. it's the first error) */
2889    if (*known_errorHandler==-1) {
2890	if ((errors==NULL) || (!strcmp(errors, "strict")))
2891	    *known_errorHandler = 1;
2892	else if (!strcmp(errors, "replace"))
2893	    *known_errorHandler = 2;
2894	else if (!strcmp(errors, "ignore"))
2895	    *known_errorHandler = 3;
2896	else if (!strcmp(errors, "xmlcharrefreplace"))
2897	    *known_errorHandler = 4;
2898	else
2899	    *known_errorHandler = 0;
2900    }
2901    switch (*known_errorHandler) {
2902	case 1: /* strict */
2903	    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2904	    return -1;
2905	case 2: /* replace */
2906	    for (collpos = collstartpos; collpos<collendpos; ++collpos) {
2907		x = charmapencode_output('?', mapping, res, respos);
2908		if (x==NULL) {
2909		    return -1;
2910		}
2911		else if (x==Py_None) {
2912		    Py_DECREF(x);
2913		    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2914		    return -1;
2915		}
2916		Py_DECREF(x);
2917	    }
2918	    /* fall through */
2919	case 3: /* ignore */
2920	    *inpos = collendpos;
2921	    break;
2922	case 4: /* xmlcharrefreplace */
2923	    /* generate replacement (temporarily (mis)uses p) */
2924	    for (collpos = collstartpos; collpos < collendpos; ++collpos) {
2925		char buffer[2+29+1+1];
2926		char *cp;
2927		sprintf(buffer, "&#%d;", (int)p[collpos]);
2928		for (cp = buffer; *cp; ++cp) {
2929		    x = charmapencode_output(*cp, mapping, res, respos);
2930		    if (x==NULL)
2931			return -1;
2932		    else if (x==Py_None) {
2933			Py_DECREF(x);
2934			raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2935			return -1;
2936		    }
2937		    Py_DECREF(x);
2938		}
2939	    }
2940	    *inpos = collendpos;
2941	    break;
2942	default:
2943	    repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2944		encoding, reason, p, size, exceptionObject,
2945		collstartpos, collendpos, &newpos);
2946	    if (repunicode == NULL)
2947		return -1;
2948	    /* generate replacement  */
2949	    repsize = PyUnicode_GET_SIZE(repunicode);
2950	    for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
2951		x = charmapencode_output(*uni2, mapping, res, respos);
2952		if (x==NULL) {
2953		    Py_DECREF(repunicode);
2954		    return -1;
2955		}
2956		else if (x==Py_None) {
2957		    Py_DECREF(repunicode);
2958		    Py_DECREF(x);
2959		    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2960		    return -1;
2961		}
2962		Py_DECREF(x);
2963	    }
2964	    *inpos = newpos;
2965	    Py_DECREF(repunicode);
2966    }
2967    return 0;
2968}
2969
2970PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2971				  int size,
2972				  PyObject *mapping,
2973				  const char *errors)
2974{
2975    /* output object */
2976    PyObject *res = NULL;
2977    /* current input position */
2978    int inpos = 0;
2979    /* current output position */
2980    int respos = 0;
2981    PyObject *errorHandler = NULL;
2982    PyObject *exc = NULL;
2983    /* the following variable is used for caching string comparisons
2984     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
2985     * 3=ignore, 4=xmlcharrefreplace */
2986    int known_errorHandler = -1;
2987
2988    /* Default to Latin-1 */
2989    if (mapping == NULL)
2990	return PyUnicode_EncodeLatin1(p, size, errors);
2991
2992    /* allocate enough for a simple encoding without
2993       replacements, if we need more, we'll resize */
2994    res = PyString_FromStringAndSize(NULL, size);
2995    if (res == NULL)
2996        goto onError;
2997    if (size == 0)
2998	return res;
2999
3000    while (inpos<size) {
3001	/* try to encode it */
3002	PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
3003	if (x==NULL) /* error */
3004	    goto onError;
3005	if (x==Py_None) { /* unencodable character */
3006	    if (charmap_encoding_error(p, size, &inpos, mapping,
3007		&exc,
3008		&known_errorHandler, errorHandler, errors,
3009		&res, &respos))
3010		goto onError;
3011	}
3012	else
3013	    /* done with this character => adjust input position */
3014	    ++inpos;
3015	Py_DECREF(x);
3016    }
3017
3018    /* Resize if we allocated to much */
3019    if (respos<PyString_GET_SIZE(res)) {
3020	if (_PyString_Resize(&res, respos))
3021	    goto onError;
3022    }
3023    Py_XDECREF(exc);
3024    Py_XDECREF(errorHandler);
3025    return res;
3026
3027    onError:
3028    Py_XDECREF(res);
3029    Py_XDECREF(exc);
3030    Py_XDECREF(errorHandler);
3031    return NULL;
3032}
3033
3034PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3035				    PyObject *mapping)
3036{
3037    if (!PyUnicode_Check(unicode) || mapping == NULL) {
3038	PyErr_BadArgument();
3039	return NULL;
3040    }
3041    return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3042				   PyUnicode_GET_SIZE(unicode),
3043				   mapping,
3044				   NULL);
3045}
3046
3047/* create or adjust a UnicodeTranslateError */
3048static void make_translate_exception(PyObject **exceptionObject,
3049    const Py_UNICODE *unicode, int size,
3050    int startpos, int endpos,
3051    const char *reason)
3052{
3053    if (*exceptionObject == NULL) {
3054    	*exceptionObject = PyUnicodeTranslateError_Create(
3055	    unicode, size, startpos, endpos, reason);
3056    }
3057    else {
3058	if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3059	    goto onError;
3060	if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3061	    goto onError;
3062	if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3063	    goto onError;
3064	return;
3065	onError:
3066	Py_DECREF(*exceptionObject);
3067	*exceptionObject = NULL;
3068    }
3069}
3070
3071/* raises a UnicodeTranslateError */
3072static void raise_translate_exception(PyObject **exceptionObject,
3073    const Py_UNICODE *unicode, int size,
3074    int startpos, int endpos,
3075    const char *reason)
3076{
3077    make_translate_exception(exceptionObject,
3078	unicode, size, startpos, endpos, reason);
3079    if (*exceptionObject != NULL)
3080	PyCodec_StrictErrors(*exceptionObject);
3081}
3082
3083/* error handling callback helper:
3084   build arguments, call the callback and check the arguments,
3085   put the result into newpos and return the replacement string, which
3086   has to be freed by the caller */
3087static PyObject *unicode_translate_call_errorhandler(const char *errors,
3088    PyObject **errorHandler,
3089    const char *reason,
3090    const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
3091    int startpos, int endpos,
3092    int *newpos)
3093{
3094    static char *argparse = "O!i;translating error handler must return (unicode, int) tuple";
3095
3096    PyObject *restuple;
3097    PyObject *resunicode;
3098
3099    if (*errorHandler == NULL) {
3100	*errorHandler = PyCodec_LookupError(errors);
3101        if (*errorHandler == NULL)
3102	    return NULL;
3103    }
3104
3105    make_translate_exception(exceptionObject,
3106	unicode, size, startpos, endpos, reason);
3107    if (*exceptionObject == NULL)
3108	return NULL;
3109
3110    restuple = PyObject_CallFunctionObjArgs(
3111	*errorHandler, *exceptionObject, NULL);
3112    if (restuple == NULL)
3113	return NULL;
3114    if (!PyTuple_Check(restuple)) {
3115	PyErr_Format(PyExc_TypeError, &argparse[4]);
3116	Py_DECREF(restuple);
3117	return NULL;
3118    }
3119    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3120	&resunicode, newpos)) {
3121	Py_DECREF(restuple);
3122	return NULL;
3123    }
3124    if (*newpos<0)
3125	*newpos = size+*newpos;
3126    if (*newpos<0 || *newpos>size) {
3127	PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
3128	Py_DECREF(restuple);
3129	return NULL;
3130    }
3131    Py_INCREF(resunicode);
3132    Py_DECREF(restuple);
3133    return resunicode;
3134}
3135
3136/* Lookup the character ch in the mapping and put the result in result,
3137   which must be decrefed by the caller.
3138   Return 0 on success, -1 on error */
3139static
3140int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3141{
3142    PyObject *w = PyInt_FromLong((long)c);
3143    PyObject *x;
3144
3145    if (w == NULL)
3146	 return -1;
3147    x = PyObject_GetItem(mapping, w);
3148    Py_DECREF(w);
3149    if (x == NULL) {
3150	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3151	    /* No mapping found means: use 1:1 mapping. */
3152	    PyErr_Clear();
3153	    *result = NULL;
3154	    return 0;
3155	} else
3156	    return -1;
3157    }
3158    else if (x == Py_None) {
3159	*result = x;
3160	return 0;
3161    }
3162    else if (PyInt_Check(x)) {
3163	long value = PyInt_AS_LONG(x);
3164	long max = PyUnicode_GetMax();
3165	if (value < 0 || value > max) {
3166	    PyErr_Format(PyExc_TypeError,
3167			     "character mapping must be in range(0x%lx)", max+1);
3168	    Py_DECREF(x);
3169	    return -1;
3170	}
3171	*result = x;
3172	return 0;
3173    }
3174    else if (PyUnicode_Check(x)) {
3175	*result = x;
3176	return 0;
3177    }
3178    else {
3179	/* wrong return value */
3180	PyErr_SetString(PyExc_TypeError,
3181	      "character mapping must return integer, None or unicode");
3182	return -1;
3183    }
3184}
3185/* ensure that *outobj is at least requiredsize characters long,
3186if not reallocate and adjust various state variables.
3187Return 0 on success, -1 on error */
3188static
3189int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, int *outsize,
3190    int requiredsize)
3191{
3192    if (requiredsize > *outsize) {
3193	/* remember old output position */
3194	int outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
3195	/* exponentially overallocate to minimize reallocations */
3196	if (requiredsize < 2 * *outsize)
3197	    requiredsize = 2 * *outsize;
3198	if (_PyUnicode_Resize(outobj, requiredsize))
3199	    return -1;
3200	*outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
3201	*outsize = requiredsize;
3202    }
3203    return 0;
3204}
3205/* lookup the character, put the result in the output string and adjust
3206   various state variables. Return a new reference to the object that
3207   was put in the output buffer in *result, or Py_None, if the mapping was
3208   undefined (in which case no character was written).
3209   The called must decref result.
3210   Return 0 on success, -1 on error. */
3211static
3212int charmaptranslate_output(Py_UNICODE c, PyObject *mapping,
3213    PyObject **outobj, int *outsize, Py_UNICODE **outp, PyObject **res)
3214{
3215    if (charmaptranslate_lookup(c, mapping, res))
3216	return -1;
3217    if (*res==NULL) {
3218	/* not found => default to 1:1 mapping */
3219	*(*outp)++ = (Py_UNICODE)c;
3220    }
3221    else if (*res==Py_None)
3222	;
3223    else if (PyInt_Check(*res)) {
3224	/* no overflow check, because we know that the space is enough */
3225	*(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3226    }
3227    else if (PyUnicode_Check(*res)) {
3228	int repsize = PyUnicode_GET_SIZE(*res);
3229	if (repsize==1) {
3230	    /* no overflow check, because we know that the space is enough */
3231	    *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3232	}
3233	else if (repsize!=0) {
3234	    /* more than one character */
3235	    int requiredsize = *outsize + repsize - 1;
3236	    if (charmaptranslate_makespace(outobj, outp, outsize, requiredsize))
3237		return -1;
3238	    memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3239	    *outp += repsize;
3240	}
3241    }
3242    else
3243	return -1;
3244    return 0;
3245}
3246
3247PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
3248				     int size,
3249				     PyObject *mapping,
3250				     const char *errors)
3251{
3252    /* output object */
3253    PyObject *res = NULL;
3254    /* pointers to the beginning and end+1 of input */
3255    const Py_UNICODE *startp = p;
3256    const Py_UNICODE *endp = p + size;
3257    /* pointer into the output */
3258    Py_UNICODE *str;
3259    /* current output position */
3260    int respos = 0;
3261    int ressize;
3262    char *reason = "character maps to <undefined>";
3263    PyObject *errorHandler = NULL;
3264    PyObject *exc = NULL;
3265    /* the following variable is used for caching string comparisons
3266     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3267     * 3=ignore, 4=xmlcharrefreplace */
3268    int known_errorHandler = -1;
3269
3270    if (mapping == NULL) {
3271	PyErr_BadArgument();
3272	return NULL;
3273    }
3274
3275    /* allocate enough for a simple 1:1 translation without
3276       replacements, if we need more, we'll resize */
3277    res = PyUnicode_FromUnicode(NULL, size);
3278    if (res == NULL)
3279        goto onError;
3280    if (size == 0)
3281	return res;
3282    str = PyUnicode_AS_UNICODE(res);
3283    ressize = size;
3284
3285    while (p<endp) {
3286	/* try to encode it */
3287	PyObject *x = NULL;
3288	if (charmaptranslate_output(*p, mapping, &res, &ressize, &str, &x)) {
3289	    Py_XDECREF(x);
3290	    goto onError;
3291	}
3292	Py_XDECREF(x);
3293	if (x!=Py_None) /* it worked => adjust input pointer */
3294	    ++p;
3295	else { /* untranslatable character */
3296	    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3297	    int repsize;
3298	    int newpos;
3299	    Py_UNICODE *uni2;
3300	    /* startpos for collecting untranslatable chars */
3301	    const Py_UNICODE *collstart = p;
3302	    const Py_UNICODE *collend = p+1;
3303	    const Py_UNICODE *coll;
3304
3305	    /* find all untranslatable characters */
3306	    while (collend < endp) {
3307	    	if (charmaptranslate_lookup(*collend, mapping, &x))
3308		    goto onError;
3309		Py_XDECREF(x);
3310		if (x!=Py_None)
3311		    break;
3312		++collend;
3313	    }
3314	    /* cache callback name lookup
3315	     * (if not done yet, i.e. it's the first error) */
3316	    if (known_errorHandler==-1) {
3317		if ((errors==NULL) || (!strcmp(errors, "strict")))
3318		    known_errorHandler = 1;
3319		else if (!strcmp(errors, "replace"))
3320		    known_errorHandler = 2;
3321		else if (!strcmp(errors, "ignore"))
3322		    known_errorHandler = 3;
3323		else if (!strcmp(errors, "xmlcharrefreplace"))
3324		    known_errorHandler = 4;
3325		else
3326		    known_errorHandler = 0;
3327	    }
3328	    switch (known_errorHandler) {
3329		case 1: /* strict */
3330		    raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3331		    goto onError;
3332		case 2: /* replace */
3333		    /* No need to check for space, this is a 1:1 replacement */
3334		    for (coll = collstart; coll<collend; ++coll)
3335			*str++ = '?';
3336		    /* fall through */
3337		case 3: /* ignore */
3338		    p = collend;
3339		    break;
3340		case 4: /* xmlcharrefreplace */
3341		    /* generate replacement (temporarily (mis)uses p) */
3342		    for (p = collstart; p < collend; ++p) {
3343			char buffer[2+29+1+1];
3344			char *cp;
3345			sprintf(buffer, "&#%d;", (int)*p);
3346			if (charmaptranslate_makespace(&res, &str, &ressize,
3347			    (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3348			    goto onError;
3349			for (cp = buffer; *cp; ++cp)
3350			    *str++ = *cp;
3351		    }
3352		    p = collend;
3353		    break;
3354		default:
3355		    repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3356			reason, startp, size, &exc,
3357			collstart-startp, collend-startp, &newpos);
3358		    if (repunicode == NULL)
3359			goto onError;
3360		    /* generate replacement  */
3361		    repsize = PyUnicode_GET_SIZE(repunicode);
3362		    if (charmaptranslate_makespace(&res, &str, &ressize,
3363			(str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3364			Py_DECREF(repunicode);
3365			goto onError;
3366		    }
3367		    for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3368			*str++ = *uni2;
3369		    p = startp + newpos;
3370		    Py_DECREF(repunicode);
3371	    }
3372	}
3373    }
3374    /* Resize if we allocated to much */
3375    respos = str-PyUnicode_AS_UNICODE(res);
3376    if (respos<ressize) {
3377	if (_PyUnicode_Resize(&res, respos))
3378	    goto onError;
3379    }
3380    Py_XDECREF(exc);
3381    Py_XDECREF(errorHandler);
3382    return res;
3383
3384    onError:
3385    Py_XDECREF(res);
3386    Py_XDECREF(exc);
3387    Py_XDECREF(errorHandler);
3388    return NULL;
3389}
3390
3391PyObject *PyUnicode_Translate(PyObject *str,
3392			      PyObject *mapping,
3393			      const char *errors)
3394{
3395    PyObject *result;
3396
3397    str = PyUnicode_FromObject(str);
3398    if (str == NULL)
3399	goto onError;
3400    result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3401					PyUnicode_GET_SIZE(str),
3402					mapping,
3403					errors);
3404    Py_DECREF(str);
3405    return result;
3406
3407 onError:
3408    Py_XDECREF(str);
3409    return NULL;
3410}
3411
3412/* --- Decimal Encoder ---------------------------------------------------- */
3413
3414int PyUnicode_EncodeDecimal(Py_UNICODE *s,
3415			    int length,
3416			    char *output,
3417			    const char *errors)
3418{
3419    Py_UNICODE *p, *end;
3420    PyObject *errorHandler = NULL;
3421    PyObject *exc = NULL;
3422    const char *encoding = "decimal";
3423    const char *reason = "invalid decimal Unicode string";
3424    /* the following variable is used for caching string comparisons
3425     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3426    int known_errorHandler = -1;
3427
3428    if (output == NULL) {
3429	PyErr_BadArgument();
3430	return -1;
3431    }
3432
3433    p = s;
3434    end = s + length;
3435    while (p < end) {
3436	register Py_UNICODE ch = *p;
3437	int decimal;
3438	PyObject *repunicode;
3439	int repsize;
3440	int newpos;
3441	Py_UNICODE *uni2;
3442	Py_UNICODE *collstart;
3443	Py_UNICODE *collend;
3444
3445	if (Py_UNICODE_ISSPACE(ch)) {
3446	    *output++ = ' ';
3447	    ++p;
3448	    continue;
3449	}
3450	decimal = Py_UNICODE_TODECIMAL(ch);
3451	if (decimal >= 0) {
3452	    *output++ = '0' + decimal;
3453	    ++p;
3454	    continue;
3455	}
3456	if (0 < ch && ch < 256) {
3457	    *output++ = (char)ch;
3458	    ++p;
3459	    continue;
3460	}
3461	/* All other characters are considered unencodable */
3462	collstart = p;
3463	collend = p+1;
3464	while (collend < end) {
3465	    if ((0 < *collend && *collend < 256) ||
3466	        !Py_UNICODE_ISSPACE(*collend) ||
3467	        Py_UNICODE_TODECIMAL(*collend))
3468		break;
3469	}
3470	/* cache callback name lookup
3471	 * (if not done yet, i.e. it's the first error) */
3472	if (known_errorHandler==-1) {
3473	    if ((errors==NULL) || (!strcmp(errors, "strict")))
3474		known_errorHandler = 1;
3475	    else if (!strcmp(errors, "replace"))
3476		known_errorHandler = 2;
3477	    else if (!strcmp(errors, "ignore"))
3478		known_errorHandler = 3;
3479	    else if (!strcmp(errors, "xmlcharrefreplace"))
3480		known_errorHandler = 4;
3481	    else
3482		known_errorHandler = 0;
3483	}
3484	switch (known_errorHandler) {
3485	    case 1: /* strict */
3486		raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3487		goto onError;
3488	    case 2: /* replace */
3489		for (p = collstart; p < collend; ++p)
3490		    *output++ = '?';
3491		/* fall through */
3492	    case 3: /* ignore */
3493		p = collend;
3494		break;
3495	    case 4: /* xmlcharrefreplace */
3496		/* generate replacement (temporarily (mis)uses p) */
3497		for (p = collstart; p < collend; ++p)
3498		    output += sprintf(output, "&#%d;", (int)*p);
3499		p = collend;
3500		break;
3501	    default:
3502		repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3503		    encoding, reason, s, length, &exc,
3504		    collstart-s, collend-s, &newpos);
3505		if (repunicode == NULL)
3506		    goto onError;
3507		/* generate replacement  */
3508		repsize = PyUnicode_GET_SIZE(repunicode);
3509		for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3510		    Py_UNICODE ch = *uni2;
3511		    if (Py_UNICODE_ISSPACE(ch))
3512			*output++ = ' ';
3513		    else {
3514			decimal = Py_UNICODE_TODECIMAL(ch);
3515			if (decimal >= 0)
3516			    *output++ = '0' + decimal;
3517			else if (0 < ch && ch < 256)
3518			    *output++ = (char)ch;
3519			else {
3520			    Py_DECREF(repunicode);
3521			    raise_encode_exception(&exc, encoding,
3522				s, length, collstart-s, collend-s, reason);
3523			    goto onError;
3524			}
3525		    }
3526		}
3527		p = s + newpos;
3528		Py_DECREF(repunicode);
3529	}
3530    }
3531    /* 0-terminate the output string */
3532    *output++ = '\0';
3533    Py_XDECREF(exc);
3534    Py_XDECREF(errorHandler);
3535    return 0;
3536
3537 onError:
3538    Py_XDECREF(exc);
3539    Py_XDECREF(errorHandler);
3540    return -1;
3541}
3542
3543/* --- Helpers ------------------------------------------------------------ */
3544
3545static
3546int count(PyUnicodeObject *self,
3547	  int start,
3548	  int end,
3549	  PyUnicodeObject *substring)
3550{
3551    int count = 0;
3552
3553    if (start < 0)
3554        start += self->length;
3555    if (start < 0)
3556        start = 0;
3557    if (end > self->length)
3558        end = self->length;
3559    if (end < 0)
3560        end += self->length;
3561    if (end < 0)
3562        end = 0;
3563
3564    if (substring->length == 0)
3565	return (end - start + 1);
3566
3567    end -= substring->length;
3568
3569    while (start <= end)
3570        if (Py_UNICODE_MATCH(self, start, substring)) {
3571            count++;
3572            start += substring->length;
3573        } else
3574            start++;
3575
3576    return count;
3577}
3578
3579int PyUnicode_Count(PyObject *str,
3580		    PyObject *substr,
3581		    int start,
3582		    int end)
3583{
3584    int result;
3585
3586    str = PyUnicode_FromObject(str);
3587    if (str == NULL)
3588	return -1;
3589    substr = PyUnicode_FromObject(substr);
3590    if (substr == NULL) {
3591	Py_DECREF(str);
3592	return -1;
3593    }
3594
3595    result = count((PyUnicodeObject *)str,
3596		   start, end,
3597		   (PyUnicodeObject *)substr);
3598
3599    Py_DECREF(str);
3600    Py_DECREF(substr);
3601    return result;
3602}
3603
3604static
3605int findstring(PyUnicodeObject *self,
3606	       PyUnicodeObject *substring,
3607	       int start,
3608	       int end,
3609	       int direction)
3610{
3611    if (start < 0)
3612        start += self->length;
3613    if (start < 0)
3614        start = 0;
3615
3616    if (end > self->length)
3617        end = self->length;
3618    if (end < 0)
3619        end += self->length;
3620    if (end < 0)
3621        end = 0;
3622
3623    if (substring->length == 0)
3624	return (direction > 0) ? start : end;
3625
3626    end -= substring->length;
3627
3628    if (direction < 0) {
3629        for (; end >= start; end--)
3630            if (Py_UNICODE_MATCH(self, end, substring))
3631                return end;
3632    } else {
3633        for (; start <= end; start++)
3634            if (Py_UNICODE_MATCH(self, start, substring))
3635                return start;
3636    }
3637
3638    return -1;
3639}
3640
3641int PyUnicode_Find(PyObject *str,
3642		   PyObject *substr,
3643		   int start,
3644		   int end,
3645		   int direction)
3646{
3647    int result;
3648
3649    str = PyUnicode_FromObject(str);
3650    if (str == NULL)
3651	return -2;
3652    substr = PyUnicode_FromObject(substr);
3653    if (substr == NULL) {
3654	Py_DECREF(str);
3655	return -2;
3656    }
3657
3658    result = findstring((PyUnicodeObject *)str,
3659			(PyUnicodeObject *)substr,
3660			start, end, direction);
3661    Py_DECREF(str);
3662    Py_DECREF(substr);
3663    return result;
3664}
3665
3666static
3667int tailmatch(PyUnicodeObject *self,
3668	      PyUnicodeObject *substring,
3669	      int start,
3670	      int end,
3671	      int direction)
3672{
3673    if (start < 0)
3674        start += self->length;
3675    if (start < 0)
3676        start = 0;
3677
3678    if (substring->length == 0)
3679        return 1;
3680
3681    if (end > self->length)
3682        end = self->length;
3683    if (end < 0)
3684        end += self->length;
3685    if (end < 0)
3686        end = 0;
3687
3688    end -= substring->length;
3689    if (end < start)
3690	return 0;
3691
3692    if (direction > 0) {
3693	if (Py_UNICODE_MATCH(self, end, substring))
3694	    return 1;
3695    } else {
3696        if (Py_UNICODE_MATCH(self, start, substring))
3697	    return 1;
3698    }
3699
3700    return 0;
3701}
3702
3703int PyUnicode_Tailmatch(PyObject *str,
3704			PyObject *substr,
3705			int start,
3706			int end,
3707			int direction)
3708{
3709    int result;
3710
3711    str = PyUnicode_FromObject(str);
3712    if (str == NULL)
3713	return -1;
3714    substr = PyUnicode_FromObject(substr);
3715    if (substr == NULL) {
3716	Py_DECREF(substr);
3717	return -1;
3718    }
3719
3720    result = tailmatch((PyUnicodeObject *)str,
3721		       (PyUnicodeObject *)substr,
3722		       start, end, direction);
3723    Py_DECREF(str);
3724    Py_DECREF(substr);
3725    return result;
3726}
3727
3728static
3729const Py_UNICODE *findchar(const Py_UNICODE *s,
3730		     int size,
3731		     Py_UNICODE ch)
3732{
3733    /* like wcschr, but doesn't stop at NULL characters */
3734
3735    while (size-- > 0) {
3736        if (*s == ch)
3737            return s;
3738        s++;
3739    }
3740
3741    return NULL;
3742}
3743
3744/* Apply fixfct filter to the Unicode object self and return a
3745   reference to the modified object */
3746
3747static
3748PyObject *fixup(PyUnicodeObject *self,
3749		int (*fixfct)(PyUnicodeObject *s))
3750{
3751
3752    PyUnicodeObject *u;
3753
3754    u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
3755    if (u == NULL)
3756	return NULL;
3757
3758    Py_UNICODE_COPY(u->str, self->str, self->length);
3759
3760    if (!fixfct(u) && PyUnicode_CheckExact(self)) {
3761	/* fixfct should return TRUE if it modified the buffer. If
3762	   FALSE, return a reference to the original buffer instead
3763	   (to save space, not time) */
3764	Py_INCREF(self);
3765	Py_DECREF(u);
3766	return (PyObject*) self;
3767    }
3768    return (PyObject*) u;
3769}
3770
3771static
3772int fixupper(PyUnicodeObject *self)
3773{
3774    int len = self->length;
3775    Py_UNICODE *s = self->str;
3776    int status = 0;
3777
3778    while (len-- > 0) {
3779	register Py_UNICODE ch;
3780
3781	ch = Py_UNICODE_TOUPPER(*s);
3782	if (ch != *s) {
3783            status = 1;
3784	    *s = ch;
3785	}
3786        s++;
3787    }
3788
3789    return status;
3790}
3791
3792static
3793int fixlower(PyUnicodeObject *self)
3794{
3795    int len = self->length;
3796    Py_UNICODE *s = self->str;
3797    int status = 0;
3798
3799    while (len-- > 0) {
3800	register Py_UNICODE ch;
3801
3802	ch = Py_UNICODE_TOLOWER(*s);
3803	if (ch != *s) {
3804            status = 1;
3805	    *s = ch;
3806	}
3807        s++;
3808    }
3809
3810    return status;
3811}
3812
3813static
3814int fixswapcase(PyUnicodeObject *self)
3815{
3816    int len = self->length;
3817    Py_UNICODE *s = self->str;
3818    int status = 0;
3819
3820    while (len-- > 0) {
3821        if (Py_UNICODE_ISUPPER(*s)) {
3822            *s = Py_UNICODE_TOLOWER(*s);
3823            status = 1;
3824        } else if (Py_UNICODE_ISLOWER(*s)) {
3825            *s = Py_UNICODE_TOUPPER(*s);
3826            status = 1;
3827        }
3828        s++;
3829    }
3830
3831    return status;
3832}
3833
3834static
3835int fixcapitalize(PyUnicodeObject *self)
3836{
3837    int len = self->length;
3838    Py_UNICODE *s = self->str;
3839    int status = 0;
3840
3841    if (len == 0)
3842	return 0;
3843    if (Py_UNICODE_ISLOWER(*s)) {
3844	*s = Py_UNICODE_TOUPPER(*s);
3845	status = 1;
3846    }
3847    s++;
3848    while (--len > 0) {
3849        if (Py_UNICODE_ISUPPER(*s)) {
3850            *s = Py_UNICODE_TOLOWER(*s);
3851            status = 1;
3852        }
3853        s++;
3854    }
3855    return status;
3856}
3857
3858static
3859int fixtitle(PyUnicodeObject *self)
3860{
3861    register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3862    register Py_UNICODE *e;
3863    int previous_is_cased;
3864
3865    /* Shortcut for single character strings */
3866    if (PyUnicode_GET_SIZE(self) == 1) {
3867	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3868	if (*p != ch) {
3869	    *p = ch;
3870	    return 1;
3871	}
3872	else
3873	    return 0;
3874    }
3875
3876    e = p + PyUnicode_GET_SIZE(self);
3877    previous_is_cased = 0;
3878    for (; p < e; p++) {
3879	register const Py_UNICODE ch = *p;
3880
3881	if (previous_is_cased)
3882	    *p = Py_UNICODE_TOLOWER(ch);
3883	else
3884	    *p = Py_UNICODE_TOTITLE(ch);
3885
3886	if (Py_UNICODE_ISLOWER(ch) ||
3887	    Py_UNICODE_ISUPPER(ch) ||
3888	    Py_UNICODE_ISTITLE(ch))
3889	    previous_is_cased = 1;
3890	else
3891	    previous_is_cased = 0;
3892    }
3893    return 1;
3894}
3895
3896PyObject *PyUnicode_Join(PyObject *separator,
3897			 PyObject *seq)
3898{
3899    Py_UNICODE *sep;
3900    int seplen;
3901    PyUnicodeObject *res = NULL;
3902    int reslen = 0;
3903    Py_UNICODE *p;
3904    int sz = 100;
3905    int i;
3906    PyObject *it;
3907
3908    it = PyObject_GetIter(seq);
3909    if (it == NULL)
3910        return NULL;
3911
3912    if (separator == NULL) {
3913	Py_UNICODE blank = ' ';
3914	sep = &blank;
3915	seplen = 1;
3916    }
3917    else {
3918	separator = PyUnicode_FromObject(separator);
3919	if (separator == NULL)
3920	    goto onError;
3921	sep = PyUnicode_AS_UNICODE(separator);
3922	seplen = PyUnicode_GET_SIZE(separator);
3923    }
3924
3925    res = _PyUnicode_New(sz);
3926    if (res == NULL)
3927	goto onError;
3928    p = PyUnicode_AS_UNICODE(res);
3929    reslen = 0;
3930
3931    for (i = 0; ; ++i) {
3932	int itemlen;
3933	PyObject *item = PyIter_Next(it);
3934	if (item == NULL) {
3935	    if (PyErr_Occurred())
3936		goto onError;
3937	    break;
3938	}
3939	if (!PyUnicode_Check(item)) {
3940	    PyObject *v;
3941	    if (!PyString_Check(item)) {
3942		PyErr_Format(PyExc_TypeError,
3943			     "sequence item %i: expected string or Unicode,"
3944			     " %.80s found",
3945			     i, item->ob_type->tp_name);
3946		Py_DECREF(item);
3947		goto onError;
3948	    }
3949	    v = PyUnicode_FromObject(item);
3950	    Py_DECREF(item);
3951	    item = v;
3952	    if (item == NULL)
3953		goto onError;
3954	}
3955	itemlen = PyUnicode_GET_SIZE(item);
3956	while (reslen + itemlen + seplen >= sz) {
3957	    if (_PyUnicode_Resize(&res, sz*2)) {
3958		Py_DECREF(item);
3959		goto onError;
3960	    }
3961	    sz *= 2;
3962	    p = PyUnicode_AS_UNICODE(res) + reslen;
3963	}
3964	if (i > 0) {
3965	    Py_UNICODE_COPY(p, sep, seplen);
3966	    p += seplen;
3967	    reslen += seplen;
3968	}
3969	Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
3970	p += itemlen;
3971	reslen += itemlen;
3972	Py_DECREF(item);
3973    }
3974    if (_PyUnicode_Resize(&res, reslen))
3975	goto onError;
3976
3977    Py_XDECREF(separator);
3978    Py_DECREF(it);
3979    return (PyObject *)res;
3980
3981 onError:
3982    Py_XDECREF(separator);
3983    Py_XDECREF(res);
3984    Py_DECREF(it);
3985    return NULL;
3986}
3987
3988static
3989PyUnicodeObject *pad(PyUnicodeObject *self,
3990		     int left,
3991		     int right,
3992		     Py_UNICODE fill)
3993{
3994    PyUnicodeObject *u;
3995
3996    if (left < 0)
3997        left = 0;
3998    if (right < 0)
3999        right = 0;
4000
4001    if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
4002        Py_INCREF(self);
4003        return self;
4004    }
4005
4006    u = _PyUnicode_New(left + self->length + right);
4007    if (u) {
4008        if (left)
4009            Py_UNICODE_FILL(u->str, fill, left);
4010        Py_UNICODE_COPY(u->str + left, self->str, self->length);
4011        if (right)
4012            Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4013    }
4014
4015    return u;
4016}
4017
/* Append the slice data[left:right] to list as a new Unicode object.

   The expanding scope must provide a PyObject *str (used as scratch), a
   PyObject *list and an onError label, which is jumped to when either
   creating the slice or appending it fails.  The temporary reference in
   str is released in both the success and the failure case.

   Wrapped in do { } while (0) so that the expansion is a single statement
   wherever it is used, and with the arguments parenthesized so that
   arbitrary expressions can be passed. */
#define SPLIT_APPEND(data, left, right)					\
    do {								\
	str = PyUnicode_FromUnicode((data) + (left), (right) - (left));	\
	if (!str)							\
	    goto onError;						\
	if (PyList_Append(list, str)) {					\
	    Py_DECREF(str);						\
	    goto onError;						\
	}								\
	Py_DECREF(str);							\
    } while (0)
4028
4029static
4030PyObject *split_whitespace(PyUnicodeObject *self,
4031			   PyObject *list,
4032			   int maxcount)
4033{
4034    register int i;
4035    register int j;
4036    int len = self->length;
4037    PyObject *str;
4038
4039    for (i = j = 0; i < len; ) {
4040	/* find a token */
4041	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4042	    i++;
4043	j = i;
4044	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4045	    i++;
4046	if (j < i) {
4047	    if (maxcount-- <= 0)
4048		break;
4049	    SPLIT_APPEND(self->str, j, i);
4050	    while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4051		i++;
4052	    j = i;
4053	}
4054    }
4055    if (j < len) {
4056	SPLIT_APPEND(self->str, j, len);
4057    }
4058    return list;
4059
4060 onError:
4061    Py_DECREF(list);
4062    return NULL;
4063}
4064
4065PyObject *PyUnicode_Splitlines(PyObject *string,
4066			       int keepends)
4067{
4068    register int i;
4069    register int j;
4070    int len;
4071    PyObject *list;
4072    PyObject *str;
4073    Py_UNICODE *data;
4074
4075    string = PyUnicode_FromObject(string);
4076    if (string == NULL)
4077	return NULL;
4078    data = PyUnicode_AS_UNICODE(string);
4079    len = PyUnicode_GET_SIZE(string);
4080
4081    list = PyList_New(0);
4082    if (!list)
4083        goto onError;
4084
4085    for (i = j = 0; i < len; ) {
4086	int eol;
4087
4088	/* Find a line and append it */
4089	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
4090	    i++;
4091
4092	/* Skip the line break reading CRLF as one line break */
4093	eol = i;
4094	if (i < len) {
4095	    if (data[i] == '\r' && i + 1 < len &&
4096		data[i+1] == '\n')
4097		i += 2;
4098	    else
4099		i++;
4100	    if (keepends)
4101		eol = i;
4102	}
4103	SPLIT_APPEND(data, j, eol);
4104	j = i;
4105    }
4106    if (j < len) {
4107	SPLIT_APPEND(data, j, len);
4108    }
4109
4110    Py_DECREF(string);
4111    return list;
4112
4113 onError:
4114    Py_DECREF(list);
4115    Py_DECREF(string);
4116    return NULL;
4117}
4118
4119static
4120PyObject *split_char(PyUnicodeObject *self,
4121		     PyObject *list,
4122		     Py_UNICODE ch,
4123		     int maxcount)
4124{
4125    register int i;
4126    register int j;
4127    int len = self->length;
4128    PyObject *str;
4129
4130    for (i = j = 0; i < len; ) {
4131	if (self->str[i] == ch) {
4132	    if (maxcount-- <= 0)
4133		break;
4134	    SPLIT_APPEND(self->str, j, i);
4135	    i = j = i + 1;
4136	} else
4137	    i++;
4138    }
4139    if (j <= len) {
4140	SPLIT_APPEND(self->str, j, len);
4141    }
4142    return list;
4143
4144 onError:
4145    Py_DECREF(list);
4146    return NULL;
4147}
4148
4149static
4150PyObject *split_substring(PyUnicodeObject *self,
4151			  PyObject *list,
4152			  PyUnicodeObject *substring,
4153			  int maxcount)
4154{
4155    register int i;
4156    register int j;
4157    int len = self->length;
4158    int sublen = substring->length;
4159    PyObject *str;
4160
4161    for (i = j = 0; i <= len - sublen; ) {
4162	if (Py_UNICODE_MATCH(self, i, substring)) {
4163	    if (maxcount-- <= 0)
4164		break;
4165	    SPLIT_APPEND(self->str, j, i);
4166	    i = j = i + sublen;
4167	} else
4168	    i++;
4169    }
4170    if (j <= len) {
4171	SPLIT_APPEND(self->str, j, len);
4172    }
4173    return list;
4174
4175 onError:
4176    Py_DECREF(list);
4177    return NULL;
4178}
4179
4180#undef SPLIT_APPEND
4181
4182static
4183PyObject *split(PyUnicodeObject *self,
4184		PyUnicodeObject *substring,
4185		int maxcount)
4186{
4187    PyObject *list;
4188
4189    if (maxcount < 0)
4190        maxcount = INT_MAX;
4191
4192    list = PyList_New(0);
4193    if (!list)
4194        return NULL;
4195
4196    if (substring == NULL)
4197	return split_whitespace(self,list,maxcount);
4198
4199    else if (substring->length == 1)
4200	return split_char(self,list,substring->str[0],maxcount);
4201
4202    else if (substring->length == 0) {
4203	Py_DECREF(list);
4204	PyErr_SetString(PyExc_ValueError, "empty separator");
4205	return NULL;
4206    }
4207    else
4208	return split_substring(self,list,substring,maxcount);
4209}
4210
4211static
4212PyObject *replace(PyUnicodeObject *self,
4213		  PyUnicodeObject *str1,
4214		  PyUnicodeObject *str2,
4215		  int maxcount)
4216{
4217    PyUnicodeObject *u;
4218
4219    if (maxcount < 0)
4220	maxcount = INT_MAX;
4221
4222    if (str1->length == 1 && str2->length == 1) {
4223        int i;
4224
4225        /* replace characters */
4226        if (!findchar(self->str, self->length, str1->str[0]) &&
4227            PyUnicode_CheckExact(self)) {
4228            /* nothing to replace, return original string */
4229            Py_INCREF(self);
4230            u = self;
4231        } else {
4232	    Py_UNICODE u1 = str1->str[0];
4233	    Py_UNICODE u2 = str2->str[0];
4234
4235            u = (PyUnicodeObject*) PyUnicode_FromUnicode(
4236                NULL,
4237                self->length
4238                );
4239            if (u != NULL) {
4240		Py_UNICODE_COPY(u->str, self->str,
4241				self->length);
4242                for (i = 0; i < u->length; i++)
4243                    if (u->str[i] == u1) {
4244                        if (--maxcount < 0)
4245                            break;
4246                        u->str[i] = u2;
4247                    }
4248        }
4249        }
4250
4251    } else {
4252        int n, i;
4253        Py_UNICODE *p;
4254
4255        /* replace strings */
4256        n = count(self, 0, self->length, str1);
4257        if (n > maxcount)
4258            n = maxcount;
4259        if (n == 0) {
4260            /* nothing to replace, return original string */
4261            if (PyUnicode_CheckExact(self)) {
4262                Py_INCREF(self);
4263                u = self;
4264            }
4265            else {
4266                u = (PyUnicodeObject *)
4267                    PyUnicode_FromUnicode(self->str, self->length);
4268	    }
4269        } else {
4270            u = _PyUnicode_New(
4271                self->length + n * (str2->length - str1->length));
4272            if (u) {
4273                i = 0;
4274                p = u->str;
4275                if (str1->length > 0) {
4276                    while (i <= self->length - str1->length)
4277                        if (Py_UNICODE_MATCH(self, i, str1)) {
4278                            /* replace string segment */
4279                            Py_UNICODE_COPY(p, str2->str, str2->length);
4280                            p += str2->length;
4281                            i += str1->length;
4282                            if (--n <= 0) {
4283                                /* copy remaining part */
4284                                Py_UNICODE_COPY(p, self->str+i, self->length-i);
4285                                break;
4286                            }
4287                        } else
4288                            *p++ = self->str[i++];
4289                } else {
4290                    while (n > 0) {
4291                        Py_UNICODE_COPY(p, str2->str, str2->length);
4292                        p += str2->length;
4293                        if (--n <= 0)
4294                            break;
4295                        *p++ = self->str[i++];
4296                    }
4297                    Py_UNICODE_COPY(p, self->str+i, self->length-i);
4298                }
4299            }
4300        }
4301    }
4302
4303    return (PyObject *) u;
4304}
4305
4306/* --- Unicode Object Methods --------------------------------------------- */
4307
4308PyDoc_STRVAR(title__doc__,
4309"S.title() -> unicode\n\
4310\n\
4311Return a titlecased version of S, i.e. words start with title case\n\
4312characters, all remaining cased characters have lower case.");
4313
4314static PyObject*
4315unicode_title(PyUnicodeObject *self)
4316{
4317    return fixup(self, fixtitle);
4318}
4319
4320PyDoc_STRVAR(capitalize__doc__,
4321"S.capitalize() -> unicode\n\
4322\n\
4323Return a capitalized version of S, i.e. make the first character\n\
4324have upper case.");
4325
4326static PyObject*
4327unicode_capitalize(PyUnicodeObject *self)
4328{
4329    return fixup(self, fixcapitalize);
4330}
4331
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled: split self on whitespace, capitalize every word in place in
   the resulting list, then join the words again.  Returns NULL if any
   step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    int idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list slot */
    for (idx = 0; idx < PyList_GET_SIZE(words); ++idx) {
        PyObject *old = PyList_GET_ITEM(words, idx);

        result = fixup((PyUnicodeObject *)old, fixcapitalize);
        if (result == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(old);
        PyList_SET_ITEM(words, idx, result);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
4369
4370PyDoc_STRVAR(center__doc__,
4371"S.center(width) -> unicode\n\
4372\n\
4373Return S centered in a Unicode string of length width. Padding is done\n\
4374using spaces.");
4375
4376static PyObject *
4377unicode_center(PyUnicodeObject *self, PyObject *args)
4378{
4379    int marg, left;
4380    int width;
4381
4382    if (!PyArg_ParseTuple(args, "i:center", &width))
4383        return NULL;
4384
4385    if (self->length >= width && PyUnicode_CheckExact(self)) {
4386        Py_INCREF(self);
4387        return (PyObject*) self;
4388    }
4389
4390    marg = width - self->length;
4391    left = marg / 2 + (marg & width & 1);
4392
4393    return (PyObject*) pad(self, left, marg - left, ' ');
4394}
4395
4396#if 0
4397
4398/* This code should go into some future Unicode collation support
4399   module. The basic comparison should compare ordinals on a naive
4400   basis (this is what Java does and thus JPython too). */
4401
4402/* speedy UTF-16 code point order comparison */
4403/* gleaned from: */
4404/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4405
4406static short utf16Fixup[32] =
4407{
4408    0, 0, 0, 0, 0, 0, 0, 0,
4409    0, 0, 0, 0, 0, 0, 0, 0,
4410    0, 0, 0, 0, 0, 0, 0, 0,
4411    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
4412};
4413
4414static int
4415unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4416{
4417    int len1, len2;
4418
4419    Py_UNICODE *s1 = str1->str;
4420    Py_UNICODE *s2 = str2->str;
4421
4422    len1 = str1->length;
4423    len2 = str2->length;
4424
4425    while (len1 > 0 && len2 > 0) {
4426        Py_UNICODE c1, c2;
4427
4428        c1 = *s1++;
4429        c2 = *s2++;
4430
4431	if (c1 > (1<<11) * 26)
4432	    c1 += utf16Fixup[c1>>11];
4433	if (c2 > (1<<11) * 26)
4434            c2 += utf16Fixup[c2>>11];
4435        /* now c1 and c2 are in UTF-32-compatible order */
4436
4437        if (c1 != c2)
4438            return (c1 < c2) ? -1 : 1;
4439
4440        len1--; len2--;
4441    }
4442
4443    return (len1 < len2) ? -1 : (len1 != len2);
4444}
4445
4446#else
4447
4448static int
4449unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4450{
4451    register int len1, len2;
4452
4453    Py_UNICODE *s1 = str1->str;
4454    Py_UNICODE *s2 = str2->str;
4455
4456    len1 = str1->length;
4457    len2 = str2->length;
4458
4459    while (len1 > 0 && len2 > 0) {
4460        Py_UNICODE c1, c2;
4461
4462        c1 = *s1++;
4463        c2 = *s2++;
4464
4465        if (c1 != c2)
4466            return (c1 < c2) ? -1 : 1;
4467
4468        len1--; len2--;
4469    }
4470
4471    return (len1 < len2) ? -1 : (len1 != len2);
4472}
4473
4474#endif
4475
4476int PyUnicode_Compare(PyObject *left,
4477		      PyObject *right)
4478{
4479    PyUnicodeObject *u = NULL, *v = NULL;
4480    int result;
4481
4482    /* Coerce the two arguments */
4483    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4484    if (u == NULL)
4485	goto onError;
4486    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4487    if (v == NULL)
4488	goto onError;
4489
4490    /* Shortcut for empty or interned objects */
4491    if (v == u) {
4492	Py_DECREF(u);
4493	Py_DECREF(v);
4494	return 0;
4495    }
4496
4497    result = unicode_compare(u, v);
4498
4499    Py_DECREF(u);
4500    Py_DECREF(v);
4501    return result;
4502
4503onError:
4504    Py_XDECREF(u);
4505    Py_XDECREF(v);
4506    return -1;
4507}
4508
4509int PyUnicode_Contains(PyObject *container,
4510		       PyObject *element)
4511{
4512    PyUnicodeObject *u = NULL, *v = NULL;
4513    int result, size;
4514    register const Py_UNICODE *lhs, *end, *rhs;
4515
4516    /* Coerce the two arguments */
4517    v = (PyUnicodeObject *)PyUnicode_FromObject(element);
4518    if (v == NULL) {
4519	PyErr_SetString(PyExc_TypeError,
4520	    "'in <string>' requires string as left operand");
4521	goto onError;
4522    }
4523    u = (PyUnicodeObject *)PyUnicode_FromObject(container);
4524    if (u == NULL)
4525	goto onError;
4526
4527    size = PyUnicode_GET_SIZE(v);
4528    rhs = PyUnicode_AS_UNICODE(v);
4529    lhs = PyUnicode_AS_UNICODE(u);
4530
4531    result = 0;
4532    if (size == 1) {
4533	end = lhs + PyUnicode_GET_SIZE(u);
4534	while (lhs < end) {
4535	    if (*lhs++ == *rhs) {
4536		result = 1;
4537		break;
4538	    }
4539	}
4540    }
4541    else {
4542	end = lhs + (PyUnicode_GET_SIZE(u) - size);
4543	while (lhs <= end) {
4544	    if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
4545		result = 1;
4546		break;
4547	    }
4548	}
4549    }
4550
4551    Py_DECREF(u);
4552    Py_DECREF(v);
4553    return result;
4554
4555onError:
4556    Py_XDECREF(u);
4557    Py_XDECREF(v);
4558    return -1;
4559}
4560
4561/* Concat to string or Unicode object giving a new Unicode object. */
4562
4563PyObject *PyUnicode_Concat(PyObject *left,
4564			   PyObject *right)
4565{
4566    PyUnicodeObject *u = NULL, *v = NULL, *w;
4567
4568    /* Coerce the two arguments */
4569    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4570    if (u == NULL)
4571	goto onError;
4572    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4573    if (v == NULL)
4574	goto onError;
4575
4576    /* Shortcuts */
4577    if (v == unicode_empty) {
4578	Py_DECREF(v);
4579	return (PyObject *)u;
4580    }
4581    if (u == unicode_empty) {
4582	Py_DECREF(u);
4583	return (PyObject *)v;
4584    }
4585
4586    /* Concat the two Unicode strings */
4587    w = _PyUnicode_New(u->length + v->length);
4588    if (w == NULL)
4589	goto onError;
4590    Py_UNICODE_COPY(w->str, u->str, u->length);
4591    Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
4592
4593    Py_DECREF(u);
4594    Py_DECREF(v);
4595    return (PyObject *)w;
4596
4597onError:
4598    Py_XDECREF(u);
4599    Py_XDECREF(v);
4600    return NULL;
4601}
4602
4603PyDoc_STRVAR(count__doc__,
4604"S.count(sub[, start[, end]]) -> int\n\
4605\n\
4606Return the number of occurrences of substring sub in Unicode string\n\
4607S[start:end].  Optional arguments start and end are\n\
4608interpreted as in slice notation.");
4609
4610static PyObject *
4611unicode_count(PyUnicodeObject *self, PyObject *args)
4612{
4613    PyUnicodeObject *substring;
4614    int start = 0;
4615    int end = INT_MAX;
4616    PyObject *result;
4617
4618    if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
4619		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4620        return NULL;
4621
4622    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4623						(PyObject *)substring);
4624    if (substring == NULL)
4625	return NULL;
4626
4627    if (start < 0)
4628        start += self->length;
4629    if (start < 0)
4630        start = 0;
4631    if (end > self->length)
4632        end = self->length;
4633    if (end < 0)
4634        end += self->length;
4635    if (end < 0)
4636        end = 0;
4637
4638    result = PyInt_FromLong((long) count(self, start, end, substring));
4639
4640    Py_DECREF(substring);
4641    return result;
4642}
4643
4644PyDoc_STRVAR(encode__doc__,
4645"S.encode([encoding[,errors]]) -> string\n\
4646\n\
4647Return an encoded string version of S. Default encoding is the current\n\
4648default string encoding. errors may be given to set a different error\n\
4649handling scheme. Default is 'strict' meaning that encoding errors raise\n\
4650a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
4651'xmlcharrefreplace' as well as any other name registered with\n\
4652codecs.register_error that can handle UnicodeEncodeErrors.");
4653
4654static PyObject *
4655unicode_encode(PyUnicodeObject *self, PyObject *args)
4656{
4657    char *encoding = NULL;
4658    char *errors = NULL;
4659    if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
4660        return NULL;
4661    return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
4662}
4663
4664PyDoc_STRVAR(expandtabs__doc__,
4665"S.expandtabs([tabsize]) -> unicode\n\
4666\n\
4667Return a copy of S where all tab characters are expanded using spaces.\n\
4668If tabsize is not given, a tab size of 8 characters is assumed.");
4669
4670static PyObject*
4671unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
4672{
4673    Py_UNICODE *e;
4674    Py_UNICODE *p;
4675    Py_UNICODE *q;
4676    int i, j;
4677    PyUnicodeObject *u;
4678    int tabsize = 8;
4679
4680    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
4681	return NULL;
4682
4683    /* First pass: determine size of output string */
4684    i = j = 0;
4685    e = self->str + self->length;
4686    for (p = self->str; p < e; p++)
4687        if (*p == '\t') {
4688	    if (tabsize > 0)
4689		j += tabsize - (j % tabsize);
4690	}
4691        else {
4692            j++;
4693            if (*p == '\n' || *p == '\r') {
4694                i += j;
4695                j = 0;
4696            }
4697        }
4698
4699    /* Second pass: create output string and fill it */
4700    u = _PyUnicode_New(i + j);
4701    if (!u)
4702        return NULL;
4703
4704    j = 0;
4705    q = u->str;
4706
4707    for (p = self->str; p < e; p++)
4708        if (*p == '\t') {
4709	    if (tabsize > 0) {
4710		i = tabsize - (j % tabsize);
4711		j += i;
4712		while (i--)
4713		    *q++ = ' ';
4714	    }
4715	}
4716	else {
4717            j++;
4718	    *q++ = *p;
4719            if (*p == '\n' || *p == '\r')
4720                j = 0;
4721        }
4722
4723    return (PyObject*) u;
4724}
4725
4726PyDoc_STRVAR(find__doc__,
4727"S.find(sub [,start [,end]]) -> int\n\
4728\n\
4729Return the lowest index in S where substring sub is found,\n\
4730such that sub is contained within s[start,end].  Optional\n\
4731arguments start and end are interpreted as in slice notation.\n\
4732\n\
4733Return -1 on failure.");
4734
4735static PyObject *
4736unicode_find(PyUnicodeObject *self, PyObject *args)
4737{
4738    PyUnicodeObject *substring;
4739    int start = 0;
4740    int end = INT_MAX;
4741    PyObject *result;
4742
4743    if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4744		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4745        return NULL;
4746    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4747						(PyObject *)substring);
4748    if (substring == NULL)
4749	return NULL;
4750
4751    result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4752
4753    Py_DECREF(substring);
4754    return result;
4755}
4756
4757static PyObject *
4758unicode_getitem(PyUnicodeObject *self, int index)
4759{
4760    if (index < 0 || index >= self->length) {
4761        PyErr_SetString(PyExc_IndexError, "string index out of range");
4762        return NULL;
4763    }
4764
4765    return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4766}
4767
4768static long
4769unicode_hash(PyUnicodeObject *self)
4770{
4771    /* Since Unicode objects compare equal to their ASCII string
4772       counterparts, they should use the individual character values
4773       as basis for their hash value.  This is needed to assure that
4774       strings and Unicode objects behave in the same way as
4775       dictionary keys. */
4776
4777    register int len;
4778    register Py_UNICODE *p;
4779    register long x;
4780
4781    if (self->hash != -1)
4782	return self->hash;
4783    len = PyUnicode_GET_SIZE(self);
4784    p = PyUnicode_AS_UNICODE(self);
4785    x = *p << 7;
4786    while (--len >= 0)
4787	x = (1000003*x) ^ *p++;
4788    x ^= PyUnicode_GET_SIZE(self);
4789    if (x == -1)
4790	x = -2;
4791    self->hash = x;
4792    return x;
4793}
4794
4795PyDoc_STRVAR(index__doc__,
4796"S.index(sub [,start [,end]]) -> int\n\
4797\n\
4798Like S.find() but raise ValueError when the substring is not found.");
4799
4800static PyObject *
4801unicode_index(PyUnicodeObject *self, PyObject *args)
4802{
4803    int result;
4804    PyUnicodeObject *substring;
4805    int start = 0;
4806    int end = INT_MAX;
4807
4808    if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4809		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4810        return NULL;
4811
4812    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4813						(PyObject *)substring);
4814    if (substring == NULL)
4815	return NULL;
4816
4817    result = findstring(self, substring, start, end, 1);
4818
4819    Py_DECREF(substring);
4820    if (result < 0) {
4821        PyErr_SetString(PyExc_ValueError, "substring not found");
4822        return NULL;
4823    }
4824    return PyInt_FromLong(result);
4825}
4826
4827PyDoc_STRVAR(islower__doc__,
4828"S.islower() -> bool\n\
4829\n\
4830Return True if all cased characters in S are lowercase and there is\n\
4831at least one cased character in S, False otherwise.");
4832
4833static PyObject*
4834unicode_islower(PyUnicodeObject *self)
4835{
4836    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4837    register const Py_UNICODE *e;
4838    int cased;
4839
4840    /* Shortcut for single character strings */
4841    if (PyUnicode_GET_SIZE(self) == 1)
4842	return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
4843
4844    /* Special case for empty strings */
4845    if (PyString_GET_SIZE(self) == 0)
4846	return PyBool_FromLong(0);
4847
4848    e = p + PyUnicode_GET_SIZE(self);
4849    cased = 0;
4850    for (; p < e; p++) {
4851	register const Py_UNICODE ch = *p;
4852
4853	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
4854	    return PyBool_FromLong(0);
4855	else if (!cased && Py_UNICODE_ISLOWER(ch))
4856	    cased = 1;
4857    }
4858    return PyBool_FromLong(cased);
4859}
4860
4861PyDoc_STRVAR(isupper__doc__,
4862"S.isupper() -> bool\n\
4863\n\
4864Return True if  all cased characters in S are uppercase and there is\n\
4865at least one cased character in S, False otherwise.");
4866
4867static PyObject*
4868unicode_isupper(PyUnicodeObject *self)
4869{
4870    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4871    register const Py_UNICODE *e;
4872    int cased;
4873
4874    /* Shortcut for single character strings */
4875    if (PyUnicode_GET_SIZE(self) == 1)
4876	return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
4877
4878    /* Special case for empty strings */
4879    if (PyString_GET_SIZE(self) == 0)
4880	return PyBool_FromLong(0);
4881
4882    e = p + PyUnicode_GET_SIZE(self);
4883    cased = 0;
4884    for (; p < e; p++) {
4885	register const Py_UNICODE ch = *p;
4886
4887	if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
4888	    return PyBool_FromLong(0);
4889	else if (!cased && Py_UNICODE_ISUPPER(ch))
4890	    cased = 1;
4891    }
4892    return PyBool_FromLong(cased);
4893}
4894
4895PyDoc_STRVAR(istitle__doc__,
4896"S.istitle() -> bool\n\
4897\n\
4898Return True if S is a titlecased string, i.e. upper- and titlecase\n\
4899characters may only follow uncased characters and lowercase characters\n\
4900only cased ones. Return False otherwise.");
4901
4902static PyObject*
4903unicode_istitle(PyUnicodeObject *self)
4904{
4905    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4906    register const Py_UNICODE *e;
4907    int cased, previous_is_cased;
4908
4909    /* Shortcut for single character strings */
4910    if (PyUnicode_GET_SIZE(self) == 1)
4911	return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4912			       (Py_UNICODE_ISUPPER(*p) != 0));
4913
4914    /* Special case for empty strings */
4915    if (PyString_GET_SIZE(self) == 0)
4916	return PyBool_FromLong(0);
4917
4918    e = p + PyUnicode_GET_SIZE(self);
4919    cased = 0;
4920    previous_is_cased = 0;
4921    for (; p < e; p++) {
4922	register const Py_UNICODE ch = *p;
4923
4924	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4925	    if (previous_is_cased)
4926		return PyBool_FromLong(0);
4927	    previous_is_cased = 1;
4928	    cased = 1;
4929	}
4930	else if (Py_UNICODE_ISLOWER(ch)) {
4931	    if (!previous_is_cased)
4932		return PyBool_FromLong(0);
4933	    previous_is_cased = 1;
4934	    cased = 1;
4935	}
4936	else
4937	    previous_is_cased = 0;
4938    }
4939    return PyBool_FromLong(cased);
4940}
4941
4942PyDoc_STRVAR(isspace__doc__,
4943"S.isspace() -> bool\n\
4944\n\
4945Return True if there are only whitespace characters in S,\n\
4946False otherwise.");
4947
4948static PyObject*
4949unicode_isspace(PyUnicodeObject *self)
4950{
4951    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4952    register const Py_UNICODE *e;
4953
4954    /* Shortcut for single character strings */
4955    if (PyUnicode_GET_SIZE(self) == 1 &&
4956	Py_UNICODE_ISSPACE(*p))
4957	return PyBool_FromLong(1);
4958
4959    /* Special case for empty strings */
4960    if (PyString_GET_SIZE(self) == 0)
4961	return PyBool_FromLong(0);
4962
4963    e = p + PyUnicode_GET_SIZE(self);
4964    for (; p < e; p++) {
4965	if (!Py_UNICODE_ISSPACE(*p))
4966	    return PyBool_FromLong(0);
4967    }
4968    return PyBool_FromLong(1);
4969}
4970
4971PyDoc_STRVAR(isalpha__doc__,
4972"S.isalpha() -> bool\n\
4973\n\
4974Return True if  all characters in S are alphabetic\n\
4975and there is at least one character in S, False otherwise.");
4976
4977static PyObject*
4978unicode_isalpha(PyUnicodeObject *self)
4979{
4980    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4981    register const Py_UNICODE *e;
4982
4983    /* Shortcut for single character strings */
4984    if (PyUnicode_GET_SIZE(self) == 1 &&
4985	Py_UNICODE_ISALPHA(*p))
4986	return PyBool_FromLong(1);
4987
4988    /* Special case for empty strings */
4989    if (PyString_GET_SIZE(self) == 0)
4990	return PyBool_FromLong(0);
4991
4992    e = p + PyUnicode_GET_SIZE(self);
4993    for (; p < e; p++) {
4994	if (!Py_UNICODE_ISALPHA(*p))
4995	    return PyBool_FromLong(0);
4996    }
4997    return PyBool_FromLong(1);
4998}
4999
5000PyDoc_STRVAR(isalnum__doc__,
5001"S.isalnum() -> bool\n\
5002\n\
5003Return True if  all characters in S are alphanumeric\n\
5004and there is at least one character in S, False otherwise.");
5005
5006static PyObject*
5007unicode_isalnum(PyUnicodeObject *self)
5008{
5009    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5010    register const Py_UNICODE *e;
5011
5012    /* Shortcut for single character strings */
5013    if (PyUnicode_GET_SIZE(self) == 1 &&
5014	Py_UNICODE_ISALNUM(*p))
5015	return PyBool_FromLong(1);
5016
5017    /* Special case for empty strings */
5018    if (PyString_GET_SIZE(self) == 0)
5019	return PyBool_FromLong(0);
5020
5021    e = p + PyUnicode_GET_SIZE(self);
5022    for (; p < e; p++) {
5023	if (!Py_UNICODE_ISALNUM(*p))
5024	    return PyBool_FromLong(0);
5025    }
5026    return PyBool_FromLong(1);
5027}
5028
5029PyDoc_STRVAR(isdecimal__doc__,
5030"S.isdecimal() -> bool\n\
5031\n\
5032Return True if there are only decimal characters in S,\n\
5033False otherwise.");
5034
5035static PyObject*
5036unicode_isdecimal(PyUnicodeObject *self)
5037{
5038    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5039    register const Py_UNICODE *e;
5040
5041    /* Shortcut for single character strings */
5042    if (PyUnicode_GET_SIZE(self) == 1 &&
5043	Py_UNICODE_ISDECIMAL(*p))
5044	return PyBool_FromLong(1);
5045
5046    /* Special case for empty strings */
5047    if (PyString_GET_SIZE(self) == 0)
5048	return PyBool_FromLong(0);
5049
5050    e = p + PyUnicode_GET_SIZE(self);
5051    for (; p < e; p++) {
5052	if (!Py_UNICODE_ISDECIMAL(*p))
5053	    return PyBool_FromLong(0);
5054    }
5055    return PyBool_FromLong(1);
5056}
5057
5058PyDoc_STRVAR(isdigit__doc__,
5059"S.isdigit() -> bool\n\
5060\n\
5061Return True if there are only digit characters in S,\n\
5062False otherwise.");
5063
5064static PyObject*
5065unicode_isdigit(PyUnicodeObject *self)
5066{
5067    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5068    register const Py_UNICODE *e;
5069
5070    /* Shortcut for single character strings */
5071    if (PyUnicode_GET_SIZE(self) == 1 &&
5072	Py_UNICODE_ISDIGIT(*p))
5073	return PyBool_FromLong(1);
5074
5075    /* Special case for empty strings */
5076    if (PyString_GET_SIZE(self) == 0)
5077	return PyBool_FromLong(0);
5078
5079    e = p + PyUnicode_GET_SIZE(self);
5080    for (; p < e; p++) {
5081	if (!Py_UNICODE_ISDIGIT(*p))
5082	    return PyBool_FromLong(0);
5083    }
5084    return PyBool_FromLong(1);
5085}
5086
5087PyDoc_STRVAR(isnumeric__doc__,
5088"S.isnumeric() -> bool\n\
5089\n\
5090Return True if there are only numeric characters in S,\n\
5091False otherwise.");
5092
5093static PyObject*
5094unicode_isnumeric(PyUnicodeObject *self)
5095{
5096    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5097    register const Py_UNICODE *e;
5098
5099    /* Shortcut for single character strings */
5100    if (PyUnicode_GET_SIZE(self) == 1 &&
5101	Py_UNICODE_ISNUMERIC(*p))
5102	return PyBool_FromLong(1);
5103
5104    /* Special case for empty strings */
5105    if (PyString_GET_SIZE(self) == 0)
5106	return PyBool_FromLong(0);
5107
5108    e = p + PyUnicode_GET_SIZE(self);
5109    for (; p < e; p++) {
5110	if (!Py_UNICODE_ISNUMERIC(*p))
5111	    return PyBool_FromLong(0);
5112    }
5113    return PyBool_FromLong(1);
5114}
5115
5116PyDoc_STRVAR(join__doc__,
5117"S.join(sequence) -> unicode\n\
5118\n\
5119Return a string which is the concatenation of the strings in the\n\
5120sequence.  The separator between elements is S.");
5121
5122static PyObject*
5123unicode_join(PyObject *self, PyObject *data)
5124{
5125    return PyUnicode_Join(self, data);
5126}
5127
5128static int
5129unicode_length(PyUnicodeObject *self)
5130{
5131    return self->length;
5132}
5133
5134PyDoc_STRVAR(ljust__doc__,
5135"S.ljust(width) -> unicode\n\
5136\n\
5137Return S left justified in a Unicode string of length width. Padding is\n\
5138done using spaces.");
5139
5140static PyObject *
5141unicode_ljust(PyUnicodeObject *self, PyObject *args)
5142{
5143    int width;
5144    if (!PyArg_ParseTuple(args, "i:ljust", &width))
5145        return NULL;
5146
5147    if (self->length >= width && PyUnicode_CheckExact(self)) {
5148        Py_INCREF(self);
5149        return (PyObject*) self;
5150    }
5151
5152    return (PyObject*) pad(self, 0, width - self->length, ' ');
5153}
5154
5155PyDoc_STRVAR(lower__doc__,
5156"S.lower() -> unicode\n\
5157\n\
5158Return a copy of the string S converted to lowercase.");
5159
5160static PyObject*
5161unicode_lower(PyUnicodeObject *self)
5162{
5163    return fixup(self, fixlower);
5164}
5165
/* Strip modes; they double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, one per strip mode above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for a strip mode: the format string with its three
   leading characters ("|O:") skipped. */
#define STRIPNAME(i) (&stripformat[i][3])
5174
5175static const Py_UNICODE *
5176unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
5177{
5178	size_t i;
5179	for (i = 0; i < n; ++i)
5180		if (s[i] == c)
5181			return s+i;
5182	return NULL;
5183}
5184
5185/* externally visible for str.strip(unicode) */
5186PyObject *
5187_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5188{
5189	Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5190	int len = PyUnicode_GET_SIZE(self);
5191	Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
5192	int seplen = PyUnicode_GET_SIZE(sepobj);
5193	int i, j;
5194
5195	i = 0;
5196	if (striptype != RIGHTSTRIP) {
5197		while (i < len && unicode_memchr(sep, s[i], seplen)) {
5198			i++;
5199		}
5200	}
5201
5202	j = len;
5203	if (striptype != LEFTSTRIP) {
5204		do {
5205			j--;
5206		} while (j >= i && unicode_memchr(sep, s[j], seplen));
5207		j++;
5208	}
5209
5210	if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5211		Py_INCREF(self);
5212		return (PyObject*)self;
5213	}
5214	else
5215		return PyUnicode_FromUnicode(s+i, j-i);
5216}
5217
5218
5219static PyObject *
5220do_strip(PyUnicodeObject *self, int striptype)
5221{
5222	Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5223	int len = PyUnicode_GET_SIZE(self), i, j;
5224
5225	i = 0;
5226	if (striptype != RIGHTSTRIP) {
5227		while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5228			i++;
5229		}
5230	}
5231
5232	j = len;
5233	if (striptype != LEFTSTRIP) {
5234		do {
5235			j--;
5236		} while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5237		j++;
5238	}
5239
5240	if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5241		Py_INCREF(self);
5242		return (PyObject*)self;
5243	}
5244	else
5245		return PyUnicode_FromUnicode(s+i, j-i);
5246}
5247
5248
5249static PyObject *
5250do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5251{
5252	PyObject *sep = NULL;
5253
5254	if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5255		return NULL;
5256
5257	if (sep != NULL && sep != Py_None) {
5258		if (PyUnicode_Check(sep))
5259			return _PyUnicode_XStrip(self, striptype, sep);
5260		else if (PyString_Check(sep)) {
5261			PyObject *res;
5262			sep = PyUnicode_FromObject(sep);
5263			if (sep==NULL)
5264				return NULL;
5265			res = _PyUnicode_XStrip(self, striptype, sep);
5266			Py_DECREF(sep);
5267			return res;
5268		}
5269		else {
5270			PyErr_Format(PyExc_TypeError,
5271				     "%s arg must be None, unicode or str",
5272				     STRIPNAME(striptype));
5273			return NULL;
5274		}
5275	}
5276
5277	return do_strip(self, striptype);
5278}
5279
5280
5281PyDoc_STRVAR(strip__doc__,
5282"S.strip([chars]) -> unicode\n\
5283\n\
5284Return a copy of the string S with leading and trailing\n\
5285whitespace removed.\n\
5286If chars is given and not None, remove characters in chars instead.\n\
5287If chars is a str, it will be converted to unicode before stripping");
5288
5289static PyObject *
5290unicode_strip(PyUnicodeObject *self, PyObject *args)
5291{
5292	if (PyTuple_GET_SIZE(args) == 0)
5293		return do_strip(self, BOTHSTRIP); /* Common case */
5294	else
5295		return do_argstrip(self, BOTHSTRIP, args);
5296}
5297
5298
5299PyDoc_STRVAR(lstrip__doc__,
5300"S.lstrip([chars]) -> unicode\n\
5301\n\
5302Return a copy of the string S with leading whitespace removed.\n\
5303If chars is given and not None, remove characters in chars instead.\n\
5304If chars is a str, it will be converted to unicode before stripping");
5305
5306static PyObject *
5307unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5308{
5309	if (PyTuple_GET_SIZE(args) == 0)
5310		return do_strip(self, LEFTSTRIP); /* Common case */
5311	else
5312		return do_argstrip(self, LEFTSTRIP, args);
5313}
5314
5315
5316PyDoc_STRVAR(rstrip__doc__,
5317"S.rstrip([chars]) -> unicode\n\
5318\n\
5319Return a copy of the string S with trailing whitespace removed.\n\
5320If chars is given and not None, remove characters in chars instead.\n\
5321If chars is a str, it will be converted to unicode before stripping");
5322
5323static PyObject *
5324unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5325{
5326	if (PyTuple_GET_SIZE(args) == 0)
5327		return do_strip(self, RIGHTSTRIP); /* Common case */
5328	else
5329		return do_argstrip(self, RIGHTSTRIP, args);
5330}
5331
5332
5333static PyObject*
5334unicode_repeat(PyUnicodeObject *str, int len)
5335{
5336    PyUnicodeObject *u;
5337    Py_UNICODE *p;
5338    int nchars;
5339    size_t nbytes;
5340
5341    if (len < 0)
5342        len = 0;
5343
5344    if (len == 1 && PyUnicode_CheckExact(str)) {
5345        /* no repeat, return original string */
5346        Py_INCREF(str);
5347        return (PyObject*) str;
5348    }
5349
5350    /* ensure # of chars needed doesn't overflow int and # of bytes
5351     * needed doesn't overflow size_t
5352     */
5353    nchars = len * str->length;
5354    if (len && nchars / len != str->length) {
5355        PyErr_SetString(PyExc_OverflowError,
5356                        "repeated string is too long");
5357        return NULL;
5358    }
5359    nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5360    if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5361        PyErr_SetString(PyExc_OverflowError,
5362                        "repeated string is too long");
5363        return NULL;
5364    }
5365    u = _PyUnicode_New(nchars);
5366    if (!u)
5367        return NULL;
5368
5369    p = u->str;
5370
5371    while (len-- > 0) {
5372        Py_UNICODE_COPY(p, str->str, str->length);
5373        p += str->length;
5374    }
5375
5376    return (PyObject*) u;
5377}
5378
5379PyObject *PyUnicode_Replace(PyObject *obj,
5380			    PyObject *subobj,
5381			    PyObject *replobj,
5382			    int maxcount)
5383{
5384    PyObject *self;
5385    PyObject *str1;
5386    PyObject *str2;
5387    PyObject *result;
5388
5389    self = PyUnicode_FromObject(obj);
5390    if (self == NULL)
5391	return NULL;
5392    str1 = PyUnicode_FromObject(subobj);
5393    if (str1 == NULL) {
5394	Py_DECREF(self);
5395	return NULL;
5396    }
5397    str2 = PyUnicode_FromObject(replobj);
5398    if (str2 == NULL) {
5399	Py_DECREF(self);
5400	Py_DECREF(str1);
5401	return NULL;
5402    }
5403    result = replace((PyUnicodeObject *)self,
5404		     (PyUnicodeObject *)str1,
5405		     (PyUnicodeObject *)str2,
5406		     maxcount);
5407    Py_DECREF(self);
5408    Py_DECREF(str1);
5409    Py_DECREF(str2);
5410    return result;
5411}
5412
5413PyDoc_STRVAR(replace__doc__,
5414"S.replace (old, new[, maxsplit]) -> unicode\n\
5415\n\
5416Return a copy of S with all occurrences of substring\n\
5417old replaced by new.  If the optional argument maxsplit is\n\
5418given, only the first maxsplit occurrences are replaced.");
5419
5420static PyObject*
5421unicode_replace(PyUnicodeObject *self, PyObject *args)
5422{
5423    PyUnicodeObject *str1;
5424    PyUnicodeObject *str2;
5425    int maxcount = -1;
5426    PyObject *result;
5427
5428    if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
5429        return NULL;
5430    str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5431    if (str1 == NULL)
5432	return NULL;
5433    str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
5434    if (str2 == NULL) {
5435	Py_DECREF(str1);
5436	return NULL;
5437    }
5438
5439    result = replace(self, str1, str2, maxcount);
5440
5441    Py_DECREF(str1);
5442    Py_DECREF(str2);
5443    return result;
5444}
5445
5446static
5447PyObject *unicode_repr(PyObject *unicode)
5448{
5449    return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5450				PyUnicode_GET_SIZE(unicode),
5451				1);
5452}
5453
5454PyDoc_STRVAR(rfind__doc__,
5455"S.rfind(sub [,start [,end]]) -> int\n\
5456\n\
5457Return the highest index in S where substring sub is found,\n\
5458such that sub is contained within s[start,end].  Optional\n\
5459arguments start and end are interpreted as in slice notation.\n\
5460\n\
5461Return -1 on failure.");
5462
5463static PyObject *
5464unicode_rfind(PyUnicodeObject *self, PyObject *args)
5465{
5466    PyUnicodeObject *substring;
5467    int start = 0;
5468    int end = INT_MAX;
5469    PyObject *result;
5470
5471    if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
5472		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5473        return NULL;
5474    substring = (PyUnicodeObject *)PyUnicode_FromObject(
5475						(PyObject *)substring);
5476    if (substring == NULL)
5477	return NULL;
5478
5479    result = PyInt_FromLong(findstring(self, substring, start, end, -1));
5480
5481    Py_DECREF(substring);
5482    return result;
5483}
5484
5485PyDoc_STRVAR(rindex__doc__,
5486"S.rindex(sub [,start [,end]]) -> int\n\
5487\n\
5488Like S.rfind() but raise ValueError when the substring is not found.");
5489
5490static PyObject *
5491unicode_rindex(PyUnicodeObject *self, PyObject *args)
5492{
5493    int result;
5494    PyUnicodeObject *substring;
5495    int start = 0;
5496    int end = INT_MAX;
5497
5498    if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
5499		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5500        return NULL;
5501    substring = (PyUnicodeObject *)PyUnicode_FromObject(
5502						(PyObject *)substring);
5503    if (substring == NULL)
5504	return NULL;
5505
5506    result = findstring(self, substring, start, end, -1);
5507
5508    Py_DECREF(substring);
5509    if (result < 0) {
5510        PyErr_SetString(PyExc_ValueError, "substring not found");
5511        return NULL;
5512    }
5513    return PyInt_FromLong(result);
5514}
5515
5516PyDoc_STRVAR(rjust__doc__,
5517"S.rjust(width) -> unicode\n\
5518\n\
5519Return S right justified in a Unicode string of length width. Padding is\n\
5520done using spaces.");
5521
5522static PyObject *
5523unicode_rjust(PyUnicodeObject *self, PyObject *args)
5524{
5525    int width;
5526    if (!PyArg_ParseTuple(args, "i:rjust", &width))
5527        return NULL;
5528
5529    if (self->length >= width && PyUnicode_CheckExact(self)) {
5530        Py_INCREF(self);
5531        return (PyObject*) self;
5532    }
5533
5534    return (PyObject*) pad(self, width - self->length, 0, ' ');
5535}
5536
5537static PyObject*
5538unicode_slice(PyUnicodeObject *self, int start, int end)
5539{
5540    /* standard clamping */
5541    if (start < 0)
5542        start = 0;
5543    if (end < 0)
5544        end = 0;
5545    if (end > self->length)
5546        end = self->length;
5547    if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
5548        /* full slice, return original string */
5549        Py_INCREF(self);
5550        return (PyObject*) self;
5551    }
5552    if (start > end)
5553        start = end;
5554    /* copy slice */
5555    return (PyObject*) PyUnicode_FromUnicode(self->str + start,
5556					     end - start);
5557}
5558
5559PyObject *PyUnicode_Split(PyObject *s,
5560			  PyObject *sep,
5561			  int maxsplit)
5562{
5563    PyObject *result;
5564
5565    s = PyUnicode_FromObject(s);
5566    if (s == NULL)
5567	return NULL;
5568    if (sep != NULL) {
5569	sep = PyUnicode_FromObject(sep);
5570	if (sep == NULL) {
5571	    Py_DECREF(s);
5572	    return NULL;
5573	}
5574    }
5575
5576    result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
5577
5578    Py_DECREF(s);
5579    Py_XDECREF(sep);
5580    return result;
5581}
5582
5583PyDoc_STRVAR(split__doc__,
5584"S.split([sep [,maxsplit]]) -> list of strings\n\
5585\n\
5586Return a list of the words in S, using sep as the\n\
5587delimiter string.  If maxsplit is given, at most maxsplit\n\
5588splits are done. If sep is not specified, any whitespace string\n\
5589is a separator.");
5590
5591static PyObject*
5592unicode_split(PyUnicodeObject *self, PyObject *args)
5593{
5594    PyObject *substring = Py_None;
5595    int maxcount = -1;
5596
5597    if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
5598        return NULL;
5599
5600    if (substring == Py_None)
5601	return split(self, NULL, maxcount);
5602    else if (PyUnicode_Check(substring))
5603	return split(self, (PyUnicodeObject *)substring, maxcount);
5604    else
5605	return PyUnicode_Split((PyObject *)self, substring, maxcount);
5606}
5607
5608PyDoc_STRVAR(splitlines__doc__,
5609"S.splitlines([keepends]]) -> list of strings\n\
5610\n\
5611Return a list of the lines in S, breaking at line boundaries.\n\
5612Line breaks are not included in the resulting list unless keepends\n\
5613is given and true.");
5614
5615static PyObject*
5616unicode_splitlines(PyUnicodeObject *self, PyObject *args)
5617{
5618    int keepends = 0;
5619
5620    if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
5621        return NULL;
5622
5623    return PyUnicode_Splitlines((PyObject *)self, keepends);
5624}
5625
5626static
5627PyObject *unicode_str(PyUnicodeObject *self)
5628{
5629    return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
5630}
5631
5632PyDoc_STRVAR(swapcase__doc__,
5633"S.swapcase() -> unicode\n\
5634\n\
5635Return a copy of S with uppercase characters converted to lowercase\n\
5636and vice versa.");
5637
5638static PyObject*
5639unicode_swapcase(PyUnicodeObject *self)
5640{
5641    return fixup(self, fixswapcase);
5642}
5643
5644PyDoc_STRVAR(translate__doc__,
5645"S.translate(table) -> unicode\n\
5646\n\
5647Return a copy of the string S, where all characters have been mapped\n\
5648through the given translation table, which must be a mapping of\n\
5649Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
5650Unmapped characters are left untouched. Characters mapped to None\n\
5651are deleted.");
5652
5653static PyObject*
5654unicode_translate(PyUnicodeObject *self, PyObject *table)
5655{
5656    return PyUnicode_TranslateCharmap(self->str,
5657				      self->length,
5658				      table,
5659				      "ignore");
5660}
5661
5662PyDoc_STRVAR(upper__doc__,
5663"S.upper() -> unicode\n\
5664\n\
5665Return a copy of S converted to uppercase.");
5666
5667static PyObject*
5668unicode_upper(PyUnicodeObject *self)
5669{
5670    return fixup(self, fixupper);
5671}
5672
5673PyDoc_STRVAR(zfill__doc__,
5674"S.zfill(width) -> unicode\n\
5675\n\
5676Pad a numeric string x with zeros on the left, to fill a field\n\
5677of the specified width. The string x is never truncated.");
5678
5679static PyObject *
5680unicode_zfill(PyUnicodeObject *self, PyObject *args)
5681{
5682    int fill;
5683    PyUnicodeObject *u;
5684
5685    int width;
5686    if (!PyArg_ParseTuple(args, "i:zfill", &width))
5687        return NULL;
5688
5689    if (self->length >= width) {
5690        if (PyUnicode_CheckExact(self)) {
5691            Py_INCREF(self);
5692            return (PyObject*) self;
5693        }
5694        else
5695            return PyUnicode_FromUnicode(
5696                PyUnicode_AS_UNICODE(self),
5697                PyUnicode_GET_SIZE(self)
5698            );
5699    }
5700
5701    fill = width - self->length;
5702
5703    u = pad(self, fill, 0, '0');
5704
5705    if (u == NULL)
5706        return NULL;
5707
5708    if (u->str[fill] == '+' || u->str[fill] == '-') {
5709        /* move sign to beginning of string */
5710        u->str[0] = u->str[fill];
5711        u->str[fill] = '0';
5712    }
5713
5714    return (PyObject*) u;
5715}
5716
#if 0
/* Debugging helper, compiled out by the surrounding #if 0: returns the
   current value of unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyInt_FromLong(unicode_freelist_size);
}
#endif
5724
5725PyDoc_STRVAR(startswith__doc__,
5726"S.startswith(prefix[, start[, end]]) -> bool\n\
5727\n\
5728Return True if S starts with the specified prefix, False otherwise.\n\
5729With optional start, test S beginning at that position.\n\
5730With optional end, stop comparing S at that position.");
5731
5732static PyObject *
5733unicode_startswith(PyUnicodeObject *self,
5734		   PyObject *args)
5735{
5736    PyUnicodeObject *substring;
5737    int start = 0;
5738    int end = INT_MAX;
5739    PyObject *result;
5740
5741    if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
5742		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5743	return NULL;
5744    substring = (PyUnicodeObject *)PyUnicode_FromObject(
5745						(PyObject *)substring);
5746    if (substring == NULL)
5747	return NULL;
5748
5749    result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
5750
5751    Py_DECREF(substring);
5752    return result;
5753}
5754
5755
5756PyDoc_STRVAR(endswith__doc__,
5757"S.endswith(suffix[, start[, end]]) -> bool\n\
5758\n\
5759Return True if S ends with the specified suffix, False otherwise.\n\
5760With optional start, test S beginning at that position.\n\
5761With optional end, stop comparing S at that position.");
5762
5763static PyObject *
5764unicode_endswith(PyUnicodeObject *self,
5765		 PyObject *args)
5766{
5767    PyUnicodeObject *substring;
5768    int start = 0;
5769    int end = INT_MAX;
5770    PyObject *result;
5771
5772    if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
5773		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5774	return NULL;
5775    substring = (PyUnicodeObject *)PyUnicode_FromObject(
5776						(PyObject *)substring);
5777    if (substring == NULL)
5778	return NULL;
5779
5780    result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
5781
5782    Py_DECREF(substring);
5783    return result;
5784}
5785
5786
5787
5788static PyObject *
5789unicode_getnewargs(PyUnicodeObject *v)
5790{
5791	return Py_BuildValue("(u#)", v->str, v->length);
5792}
5793
5794
/* Method table for the unicode type.  Each entry gives the Python-level
   name, the C implementation, its calling convention (METH_VARARGS,
   METH_O or METH_NOARGS) and, where present, the docstring.  The table
   is terminated by the {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
#if 0
    /* Disabled entry. */
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
#endif

    /* Registered without a docstring. */
    {"__getnewargs__",	(PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
5848
5849static PyObject *
5850unicode_mod(PyObject *v, PyObject *w)
5851{
5852       if (!PyUnicode_Check(v)) {
5853               Py_INCREF(Py_NotImplemented);
5854               return Py_NotImplemented;
5855       }
5856       return PyUnicode_Format(v, w);
5857}
5858
/* Number protocol slots for the unicode type.  Only nb_remainder is
   filled in here (with unicode_mod); the slots listed before it are 0
   and the remaining ones are left to default initialization. */
static PyNumberMethods unicode_as_number = {
	0,				/*nb_add*/
	0,				/*nb_subtract*/
	0,				/*nb_multiply*/
	0,				/*nb_divide*/
	unicode_mod,			/*nb_remainder*/
};
5866
/* Sequence protocol slots for the unicode type.  The two assignment
   slots (sq_ass_item, sq_ass_slice) are 0. */
static PySequenceMethods unicode_as_sequence = {
    (inquiry) unicode_length, 		/* sq_length */
    (binaryfunc) PyUnicode_Concat, 	/* sq_concat */
    (intargfunc) unicode_repeat, 	/* sq_repeat */
    (intargfunc) unicode_getitem, 	/* sq_item */
    (intintargfunc) unicode_slice, 	/* sq_slice */
    0, 					/* sq_ass_item */
    0, 					/* sq_ass_slice */
    (objobjproc)PyUnicode_Contains, 	/*sq_contains*/
};
5877
5878static PyObject*
5879unicode_subscript(PyUnicodeObject* self, PyObject* item)
5880{
5881    if (PyInt_Check(item)) {
5882        long i = PyInt_AS_LONG(item);
5883        if (i < 0)
5884            i += PyString_GET_SIZE(self);
5885        return unicode_getitem(self, i);
5886    } else if (PyLong_Check(item)) {
5887        long i = PyLong_AsLong(item);
5888        if (i == -1 && PyErr_Occurred())
5889            return NULL;
5890        if (i < 0)
5891            i += PyString_GET_SIZE(self);
5892        return unicode_getitem(self, i);
5893    } else if (PySlice_Check(item)) {
5894        int start, stop, step, slicelength, cur, i;
5895        Py_UNICODE* source_buf;
5896        Py_UNICODE* result_buf;
5897        PyObject* result;
5898
5899        if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self),
5900				 &start, &stop, &step, &slicelength) < 0) {
5901            return NULL;
5902        }
5903
5904        if (slicelength <= 0) {
5905            return PyUnicode_FromUnicode(NULL, 0);
5906        } else {
5907            source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
5908            result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
5909
5910            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
5911                result_buf[i] = source_buf[cur];
5912            }
5913
5914            result = PyUnicode_FromUnicode(result_buf, slicelength);
5915            PyMem_FREE(result_buf);
5916            return result;
5917        }
5918    } else {
5919        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
5920        return NULL;
5921    }
5922}
5923
/* Mapping protocol slots for the unicode type: length and subscript are
   provided; the assignment slot is 0. */
static PyMappingMethods unicode_as_mapping = {
    (inquiry)unicode_length,		/* mp_length */
    (binaryfunc)unicode_subscript,	/* mp_subscript */
    (objobjargproc)0,			/* mp_ass_subscript */
};
5929
5930static int
5931unicode_buffer_getreadbuf(PyUnicodeObject *self,
5932			  int index,
5933			  const void **ptr)
5934{
5935    if (index != 0) {
5936        PyErr_SetString(PyExc_SystemError,
5937			"accessing non-existent unicode segment");
5938        return -1;
5939    }
5940    *ptr = (void *) self->str;
5941    return PyUnicode_GET_DATA_SIZE(self);
5942}
5943
5944static int
5945unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
5946			   const void **ptr)
5947{
5948    PyErr_SetString(PyExc_TypeError,
5949		    "cannot use unicode as modifiable buffer");
5950    return -1;
5951}
5952
5953static int
5954unicode_buffer_getsegcount(PyUnicodeObject *self,
5955			   int *lenp)
5956{
5957    if (lenp)
5958        *lenp = PyUnicode_GET_DATA_SIZE(self);
5959    return 1;
5960}
5961
5962static int
5963unicode_buffer_getcharbuf(PyUnicodeObject *self,
5964			  int index,
5965			  const void **ptr)
5966{
5967    PyObject *str;
5968
5969    if (index != 0) {
5970        PyErr_SetString(PyExc_SystemError,
5971			"accessing non-existent unicode segment");
5972        return -1;
5973    }
5974    str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
5975    if (str == NULL)
5976	return -1;
5977    *ptr = (void *) PyString_AS_STRING(str);
5978    return PyString_GET_SIZE(str);
5979}
5980
5981/* Helpers for PyUnicode_Format() */
5982
5983static PyObject *
5984getnextarg(PyObject *args, int arglen, int *p_argidx)
5985{
5986    int argidx = *p_argidx;
5987    if (argidx < arglen) {
5988	(*p_argidx)++;
5989	if (arglen < 0)
5990	    return args;
5991	else
5992	    return PyTuple_GetItem(args, argidx);
5993    }
5994    PyErr_SetString(PyExc_TypeError,
5995		    "not enough arguments for format string");
5996    return NULL;
5997}
5998
/* Flag bits collected while PyUnicode_Format() parses the flag
   characters of a format specifier. */
#define F_LJUST (1<<0)	/* '-' flag; also set for a negative '*' width */
#define F_SIGN	(1<<1)	/* '+' flag */
#define F_BLANK (1<<2)	/* ' ' flag */
#define F_ALT	(1<<3)	/* '#' flag */
#define F_ZERO	(1<<4)	/* '0' flag */
6004
6005static
6006int usprintf(register Py_UNICODE *buffer, char *format, ...)
6007{
6008    register int i;
6009    int len;
6010    va_list va;
6011    char *charbuffer;
6012    va_start(va, format);
6013
6014    /* First, format the string as char array, then expand to Py_UNICODE
6015       array. */
6016    charbuffer = (char *)buffer;
6017    len = vsprintf(charbuffer, format, va);
6018    for (i = len - 1; i >= 0; i--)
6019	buffer[i] = (Py_UNICODE) charbuffer[i];
6020
6021    va_end(va);
6022    return len;
6023}
6024
6025/* XXX To save some code duplication, formatfloat/long/int could have been
6026   shared with stringobject.c, converting from 8-bit to Unicode after the
6027   formatting is done. */
6028
6029static int
6030formatfloat(Py_UNICODE *buf,
6031	    size_t buflen,
6032	    int flags,
6033	    int prec,
6034	    int type,
6035	    PyObject *v)
6036{
6037    /* fmt = '%#.' + `prec` + `type`
6038       worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
6039    char fmt[20];
6040    double x;
6041
6042    x = PyFloat_AsDouble(v);
6043    if (x == -1.0 && PyErr_Occurred())
6044	return -1;
6045    if (prec < 0)
6046	prec = 6;
6047    if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6048	type = 'g';
6049    /* Worst case length calc to ensure no buffer overrun:
6050
6051       'g' formats:
6052	 fmt = %#.<prec>g
6053	 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6054	    for any double rep.)
6055	 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6056
6057       'f' formats:
6058	 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6059	 len = 1 + 50 + 1 + prec = 52 + prec
6060
6061       If prec=0 the effective precision is 1 (the leading digit is
6062       always given), therefore increase the length by one.
6063
6064    */
6065    if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6066	(type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
6067	PyErr_SetString(PyExc_OverflowError,
6068			"formatted float is too long (precision too large?)");
6069	return -1;
6070    }
6071    PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6072		  (flags&F_ALT) ? "#" : "",
6073		  prec, type);
6074    return usprintf(buf, fmt, x);
6075}
6076
6077static PyObject*
6078formatlong(PyObject *val, int flags, int prec, int type)
6079{
6080	char *buf;
6081	int i, len;
6082	PyObject *str; /* temporary string object. */
6083	PyUnicodeObject *result;
6084
6085	str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6086	if (!str)
6087		return NULL;
6088	result = _PyUnicode_New(len);
6089	for (i = 0; i < len; i++)
6090		result->str[i] = buf[i];
6091	result->str[len] = 0;
6092	Py_DECREF(str);
6093	return (PyObject*)result;
6094}
6095
6096static int
6097formatint(Py_UNICODE *buf,
6098	  size_t buflen,
6099	  int flags,
6100	  int prec,
6101	  int type,
6102	  PyObject *v)
6103{
6104    /* fmt = '%#.' + `prec` + 'l' + `type`
6105     * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6106     *                     + 1 + 1
6107     *                   = 24
6108     */
6109    char fmt[64]; /* plenty big enough! */
6110    long x;
6111
6112    x = PyInt_AsLong(v);
6113    if (x == -1 && PyErr_Occurred())
6114        return -1;
6115    if (x < 0 && type != 'd' && type != 'i') {
6116	if (PyErr_Warn(PyExc_FutureWarning,
6117		       "%u/%o/%x/%X of negative int will return "
6118		       "a signed string in Python 2.4 and up") < 0)
6119	    return -1;
6120    }
6121    if (prec < 0)
6122        prec = 1;
6123
6124    /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
6125     * worst case buf = '0x' + [0-9]*prec, where prec >= 11
6126     */
6127    if (buflen <= 13 || buflen <= (size_t)2 + (size_t)prec) {
6128        PyErr_SetString(PyExc_OverflowError,
6129    	        "formatted integer is too long (precision too large?)");
6130        return -1;
6131    }
6132
6133    if ((flags & F_ALT) &&
6134        (type == 'x' || type == 'X')) {
6135        /* When converting under %#x or %#X, there are a number
6136         * of issues that cause pain:
6137         * - when 0 is being converted, the C standard leaves off
6138         *   the '0x' or '0X', which is inconsistent with other
6139         *   %#x/%#X conversions and inconsistent with Python's
6140         *   hex() function
6141         * - there are platforms that violate the standard and
6142         *   convert 0 with the '0x' or '0X'
6143         *   (Metrowerks, Compaq Tru64)
6144         * - there are platforms that give '0x' when converting
6145         *   under %#X, but convert 0 in accordance with the
6146         *   standard (OS/2 EMX)
6147         *
6148         * We can achieve the desired consistency by inserting our
6149         * own '0x' or '0X' prefix, and substituting %x/%X in place
6150         * of %#x/%#X.
6151         *
6152         * Note that this is the same approach as used in
6153         * formatint() in stringobject.c
6154         */
6155        PyOS_snprintf(fmt, sizeof(fmt), "0%c%%.%dl%c",
6156                      type, prec, type);
6157    }
6158    else {
6159        PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c",
6160                      (flags&F_ALT) ? "#" : "",
6161                      prec, type);
6162    }
6163    return usprintf(buf, fmt, x);
6164}
6165
6166static int
6167formatchar(Py_UNICODE *buf,
6168           size_t buflen,
6169           PyObject *v)
6170{
6171    /* presume that the buffer is at least 2 characters long */
6172    if (PyUnicode_Check(v)) {
6173	if (PyUnicode_GET_SIZE(v) != 1)
6174	    goto onError;
6175	buf[0] = PyUnicode_AS_UNICODE(v)[0];
6176    }
6177
6178    else if (PyString_Check(v)) {
6179	if (PyString_GET_SIZE(v) != 1)
6180	    goto onError;
6181	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6182    }
6183
6184    else {
6185	/* Integer input truncated to a character */
6186        long x;
6187	x = PyInt_AsLong(v);
6188	if (x == -1 && PyErr_Occurred())
6189	    goto onError;
6190#ifdef Py_UNICODE_WIDE
6191	if (x < 0 || x > 0x10ffff) {
6192	    PyErr_SetString(PyExc_OverflowError,
6193			    "%c arg not in range(0x110000) "
6194			    "(wide Python build)");
6195	    return -1;
6196	}
6197#else
6198	if (x < 0 || x > 0xffff) {
6199	    PyErr_SetString(PyExc_OverflowError,
6200			    "%c arg not in range(0x10000) "
6201			    "(narrow Python build)");
6202	    return -1;
6203	}
6204#endif
6205	buf[0] = (Py_UNICODE) x;
6206    }
6207    buf[1] = '\0';
6208    return 1;
6209
6210 onError:
6211    PyErr_SetString(PyExc_TypeError,
6212		    "%c requires int or char");
6213    return -1;
6214}
6215
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted. It is counted in Py_UNICODE elements, since it
   sizes the formatbuf array in PyUnicode_Format(). XXX This is a magic
   number. Each formatting routine does bounds checking to ensure no
   overflow, but a better solution may be to malloc a buffer of
   appropriate size for each format. For now, the current solution is
   sufficient.
*/
#define FORMATBUFLEN (size_t)120
6225
6226PyObject *PyUnicode_Format(PyObject *format,
6227			   PyObject *args)
6228{
6229    Py_UNICODE *fmt, *res;
6230    int fmtcnt, rescnt, reslen, arglen, argidx;
6231    int args_owned = 0;
6232    PyUnicodeObject *result = NULL;
6233    PyObject *dict = NULL;
6234    PyObject *uformat;
6235
6236    if (format == NULL || args == NULL) {
6237	PyErr_BadInternalCall();
6238	return NULL;
6239    }
6240    uformat = PyUnicode_FromObject(format);
6241    if (uformat == NULL)
6242	return NULL;
6243    fmt = PyUnicode_AS_UNICODE(uformat);
6244    fmtcnt = PyUnicode_GET_SIZE(uformat);
6245
6246    reslen = rescnt = fmtcnt + 100;
6247    result = _PyUnicode_New(reslen);
6248    if (result == NULL)
6249	goto onError;
6250    res = PyUnicode_AS_UNICODE(result);
6251
6252    if (PyTuple_Check(args)) {
6253	arglen = PyTuple_Size(args);
6254	argidx = 0;
6255    }
6256    else {
6257	arglen = -1;
6258	argidx = -2;
6259    }
6260    if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6261        !PyObject_TypeCheck(args, &PyBaseString_Type))
6262	dict = args;
6263
6264    while (--fmtcnt >= 0) {
6265	if (*fmt != '%') {
6266	    if (--rescnt < 0) {
6267		rescnt = fmtcnt + 100;
6268		reslen += rescnt;
6269		if (_PyUnicode_Resize(&result, reslen) < 0)
6270		    return NULL;
6271		res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6272		--rescnt;
6273	    }
6274	    *res++ = *fmt++;
6275	}
6276	else {
6277	    /* Got a format specifier */
6278	    int flags = 0;
6279	    int width = -1;
6280	    int prec = -1;
6281	    Py_UNICODE c = '\0';
6282	    Py_UNICODE fill;
6283	    PyObject *v = NULL;
6284	    PyObject *temp = NULL;
6285	    Py_UNICODE *pbuf;
6286	    Py_UNICODE sign;
6287	    int len;
6288	    Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
6289
6290	    fmt++;
6291	    if (*fmt == '(') {
6292		Py_UNICODE *keystart;
6293		int keylen;
6294		PyObject *key;
6295		int pcount = 1;
6296
6297		if (dict == NULL) {
6298		    PyErr_SetString(PyExc_TypeError,
6299				    "format requires a mapping");
6300		    goto onError;
6301		}
6302		++fmt;
6303		--fmtcnt;
6304		keystart = fmt;
6305		/* Skip over balanced parentheses */
6306		while (pcount > 0 && --fmtcnt >= 0) {
6307		    if (*fmt == ')')
6308			--pcount;
6309		    else if (*fmt == '(')
6310			++pcount;
6311		    fmt++;
6312		}
6313		keylen = fmt - keystart - 1;
6314		if (fmtcnt < 0 || pcount > 0) {
6315		    PyErr_SetString(PyExc_ValueError,
6316				    "incomplete format key");
6317		    goto onError;
6318		}
6319#if 0
6320		/* keys are converted to strings using UTF-8 and
6321		   then looked up since Python uses strings to hold
6322		   variables names etc. in its namespaces and we
6323		   wouldn't want to break common idioms. */
6324		key = PyUnicode_EncodeUTF8(keystart,
6325					   keylen,
6326					   NULL);
6327#else
6328		key = PyUnicode_FromUnicode(keystart, keylen);
6329#endif
6330		if (key == NULL)
6331		    goto onError;
6332		if (args_owned) {
6333		    Py_DECREF(args);
6334		    args_owned = 0;
6335		}
6336		args = PyObject_GetItem(dict, key);
6337		Py_DECREF(key);
6338		if (args == NULL) {
6339		    goto onError;
6340		}
6341		args_owned = 1;
6342		arglen = -1;
6343		argidx = -2;
6344	    }
6345	    while (--fmtcnt >= 0) {
6346		switch (c = *fmt++) {
6347		case '-': flags |= F_LJUST; continue;
6348		case '+': flags |= F_SIGN; continue;
6349		case ' ': flags |= F_BLANK; continue;
6350		case '#': flags |= F_ALT; continue;
6351		case '0': flags |= F_ZERO; continue;
6352		}
6353		break;
6354	    }
6355	    if (c == '*') {
6356		v = getnextarg(args, arglen, &argidx);
6357		if (v == NULL)
6358		    goto onError;
6359		if (!PyInt_Check(v)) {
6360		    PyErr_SetString(PyExc_TypeError,
6361				    "* wants int");
6362		    goto onError;
6363		}
6364		width = PyInt_AsLong(v);
6365		if (width < 0) {
6366		    flags |= F_LJUST;
6367		    width = -width;
6368		}
6369		if (--fmtcnt >= 0)
6370		    c = *fmt++;
6371	    }
6372	    else if (c >= '0' && c <= '9') {
6373		width = c - '0';
6374		while (--fmtcnt >= 0) {
6375		    c = *fmt++;
6376		    if (c < '0' || c > '9')
6377			break;
6378		    if ((width*10) / 10 != width) {
6379			PyErr_SetString(PyExc_ValueError,
6380					"width too big");
6381			goto onError;
6382		    }
6383		    width = width*10 + (c - '0');
6384		}
6385	    }
6386	    if (c == '.') {
6387		prec = 0;
6388		if (--fmtcnt >= 0)
6389		    c = *fmt++;
6390		if (c == '*') {
6391		    v = getnextarg(args, arglen, &argidx);
6392		    if (v == NULL)
6393			goto onError;
6394		    if (!PyInt_Check(v)) {
6395			PyErr_SetString(PyExc_TypeError,
6396					"* wants int");
6397			goto onError;
6398		    }
6399		    prec = PyInt_AsLong(v);
6400		    if (prec < 0)
6401			prec = 0;
6402		    if (--fmtcnt >= 0)
6403			c = *fmt++;
6404		}
6405		else if (c >= '0' && c <= '9') {
6406		    prec = c - '0';
6407		    while (--fmtcnt >= 0) {
6408			c = Py_CHARMASK(*fmt++);
6409			if (c < '0' || c > '9')
6410			    break;
6411			if ((prec*10) / 10 != prec) {
6412			    PyErr_SetString(PyExc_ValueError,
6413					    "prec too big");
6414			    goto onError;
6415			}
6416			prec = prec*10 + (c - '0');
6417		    }
6418		}
6419	    } /* prec */
6420	    if (fmtcnt >= 0) {
6421		if (c == 'h' || c == 'l' || c == 'L') {
6422		    if (--fmtcnt >= 0)
6423			c = *fmt++;
6424		}
6425	    }
6426	    if (fmtcnt < 0) {
6427		PyErr_SetString(PyExc_ValueError,
6428				"incomplete format");
6429		goto onError;
6430	    }
6431	    if (c != '%') {
6432		v = getnextarg(args, arglen, &argidx);
6433		if (v == NULL)
6434		    goto onError;
6435	    }
6436	    sign = 0;
6437	    fill = ' ';
6438	    switch (c) {
6439
6440	    case '%':
6441		pbuf = formatbuf;
6442		/* presume that buffer length is at least 1 */
6443		pbuf[0] = '%';
6444		len = 1;
6445		break;
6446
6447	    case 's':
6448	    case 'r':
6449		if (PyUnicode_Check(v) && c == 's') {
6450		    temp = v;
6451		    Py_INCREF(temp);
6452		}
6453		else {
6454		    PyObject *unicode;
6455		    if (c == 's')
6456			temp = PyObject_Str(v);
6457		    else
6458			temp = PyObject_Repr(v);
6459		    if (temp == NULL)
6460			goto onError;
6461		    if (!PyString_Check(temp)) {
6462			/* XXX Note: this should never happen, since
6463   			       PyObject_Repr() and PyObject_Str() assure
6464			       this */
6465			Py_DECREF(temp);
6466			PyErr_SetString(PyExc_TypeError,
6467					"%s argument has non-string str()");
6468			goto onError;
6469		    }
6470		    unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
6471						   PyString_GET_SIZE(temp),
6472					       NULL,
6473						   "strict");
6474		    Py_DECREF(temp);
6475		    temp = unicode;
6476		    if (temp == NULL)
6477			goto onError;
6478		}
6479		pbuf = PyUnicode_AS_UNICODE(temp);
6480		len = PyUnicode_GET_SIZE(temp);
6481		if (prec >= 0 && len > prec)
6482		    len = prec;
6483		break;
6484
6485	    case 'i':
6486	    case 'd':
6487	    case 'u':
6488	    case 'o':
6489	    case 'x':
6490	    case 'X':
6491		if (c == 'i')
6492		    c = 'd';
6493		if (PyLong_Check(v)) {
6494		    temp = formatlong(v, flags, prec, c);
6495		    if (!temp)
6496			goto onError;
6497		    pbuf = PyUnicode_AS_UNICODE(temp);
6498		    len = PyUnicode_GET_SIZE(temp);
6499		    /* unbounded ints can always produce
6500		       a sign character! */
6501		    sign = 1;
6502		}
6503		else {
6504		    pbuf = formatbuf;
6505		    len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6506				    flags, prec, c, v);
6507		    if (len < 0)
6508			goto onError;
6509		    /* only d conversion is signed */
6510		    sign = c == 'd';
6511		}
6512		if (flags & F_ZERO)
6513		    fill = '0';
6514		break;
6515
6516	    case 'e':
6517	    case 'E':
6518	    case 'f':
6519	    case 'g':
6520	    case 'G':
6521		pbuf = formatbuf;
6522		len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6523			flags, prec, c, v);
6524		if (len < 0)
6525		    goto onError;
6526		sign = 1;
6527		if (flags & F_ZERO)
6528		    fill = '0';
6529		break;
6530
6531	    case 'c':
6532		pbuf = formatbuf;
6533		len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
6534		if (len < 0)
6535		    goto onError;
6536		break;
6537
6538	    default:
6539		PyErr_Format(PyExc_ValueError,
6540			     "unsupported format character '%c' (0x%x) "
6541			     "at index %i",
6542			     (31<=c && c<=126) ? (char)c : '?',
6543                             (int)c,
6544			     (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
6545		goto onError;
6546	    }
6547	    if (sign) {
6548		if (*pbuf == '-' || *pbuf == '+') {
6549		    sign = *pbuf++;
6550		    len--;
6551		}
6552		else if (flags & F_SIGN)
6553		    sign = '+';
6554		else if (flags & F_BLANK)
6555		    sign = ' ';
6556		else
6557		    sign = 0;
6558	    }
6559	    if (width < len)
6560		width = len;
6561	    if (rescnt - (sign != 0) < width) {
6562		reslen -= rescnt;
6563		rescnt = width + fmtcnt + 100;
6564		reslen += rescnt;
6565		if (reslen < 0) {
6566		    Py_DECREF(result);
6567		    return PyErr_NoMemory();
6568		}
6569		if (_PyUnicode_Resize(&result, reslen) < 0)
6570		    return NULL;
6571		res = PyUnicode_AS_UNICODE(result)
6572		    + reslen - rescnt;
6573	    }
6574	    if (sign) {
6575		if (fill != ' ')
6576		    *res++ = sign;
6577		rescnt--;
6578		if (width > len)
6579		    width--;
6580	    }
6581	    if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
6582		assert(pbuf[0] == '0');
6583		assert(pbuf[1] == c);
6584		if (fill != ' ') {
6585		    *res++ = *pbuf++;
6586		    *res++ = *pbuf++;
6587		}
6588		rescnt -= 2;
6589		width -= 2;
6590		if (width < 0)
6591		    width = 0;
6592		len -= 2;
6593	    }
6594	    if (width > len && !(flags & F_LJUST)) {
6595		do {
6596		    --rescnt;
6597		    *res++ = fill;
6598		} while (--width > len);
6599	    }
6600	    if (fill == ' ') {
6601		if (sign)
6602		    *res++ = sign;
6603		if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
6604		    assert(pbuf[0] == '0');
6605		    assert(pbuf[1] == c);
6606		    *res++ = *pbuf++;
6607		    *res++ = *pbuf++;
6608		}
6609	    }
6610	    Py_UNICODE_COPY(res, pbuf, len);
6611	    res += len;
6612	    rescnt -= len;
6613	    while (--width >= len) {
6614		--rescnt;
6615		*res++ = ' ';
6616	    }
6617	    if (dict && (argidx < arglen) && c != '%') {
6618		PyErr_SetString(PyExc_TypeError,
6619				"not all arguments converted during string formatting");
6620		goto onError;
6621	    }
6622	    Py_XDECREF(temp);
6623	} /* '%' */
6624    } /* until end */
6625    if (argidx < arglen && !dict) {
6626	PyErr_SetString(PyExc_TypeError,
6627			"not all arguments converted during string formatting");
6628	goto onError;
6629    }
6630
6631    if (args_owned) {
6632	Py_DECREF(args);
6633    }
6634    Py_DECREF(uformat);
6635    if (_PyUnicode_Resize(&result, reslen - rescnt))
6636	goto onError;
6637    return (PyObject *)result;
6638
6639 onError:
6640    Py_XDECREF(result);
6641    Py_DECREF(uformat);
6642    if (args_owned) {
6643	Py_DECREF(args);
6644    }
6645    return NULL;
6646}
6647
/* Buffer-interface function table for unicode objects.  It is installed
   as the tp_as_buffer slot of PyUnicode_Type. */
static PyBufferProcs unicode_as_buffer = {
    (getreadbufferproc) unicode_buffer_getreadbuf,	/* bf_getreadbuffer */
    (getwritebufferproc) unicode_buffer_getwritebuf,	/* bf_getwritebuffer */
    (getsegcountproc) unicode_buffer_getsegcount,	/* bf_getsegcount */
    (getcharbufferproc) unicode_buffer_getcharbuf,	/* bf_getcharbuffer */
};
6654
6655static PyObject *
6656unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
6657
6658static PyObject *
6659unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
6660{
6661        PyObject *x = NULL;
6662	static char *kwlist[] = {"string", "encoding", "errors", 0};
6663	char *encoding = NULL;
6664	char *errors = NULL;
6665
6666	if (type != &PyUnicode_Type)
6667		return unicode_subtype_new(type, args, kwds);
6668	if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
6669					  kwlist, &x, &encoding, &errors))
6670	    return NULL;
6671	if (x == NULL)
6672		return (PyObject *)_PyUnicode_New(0);
6673	if (encoding == NULL && errors == NULL)
6674	    return PyObject_Unicode(x);
6675	else
6676	return PyUnicode_FromEncodedObject(x, encoding, errors);
6677}
6678
6679static PyObject *
6680unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
6681{
6682	PyUnicodeObject *tmp, *pnew;
6683	int n;
6684
6685	assert(PyType_IsSubtype(type, &PyUnicode_Type));
6686	tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
6687	if (tmp == NULL)
6688		return NULL;
6689	assert(PyUnicode_Check(tmp));
6690	pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
6691	if (pnew == NULL)
6692		return NULL;
6693	pnew->str = PyMem_NEW(Py_UNICODE, n+1);
6694	if (pnew->str == NULL) {
6695		_Py_ForgetReference((PyObject *)pnew);
6696		PyObject_Del(pnew);
6697		return PyErr_NoMemory();
6698	}
6699	Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
6700	pnew->length = n;
6701	pnew->hash = tmp->hash;
6702	Py_DECREF(tmp);
6703	return (PyObject *)pnew;
6704}
6705
/* Docstring of the unicode type; referenced as the tp_doc slot of
   PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"unicode(string [, encoding[, errors]]) -> object\n\
\n\
Create a new Unicode object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
6712
/* Type object for the "unicode" type.

   The type derives from PyBaseString_Type (tp_base), may itself be
   subclassed (Py_TPFLAGS_BASETYPE) and creates its instances through
   unicode_new (tp_new).  Slots set to 0 are left unset in this
   initializer. */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0, 					/* ob_size */
    "unicode", 				/* tp_name */
    sizeof(PyUnicodeObject), 		/* tp_basicsize */
    0, 					/* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc, 	/* tp_dealloc */
    0, 					/* tp_print */
    0,				 	/* tp_getattr */
    0, 					/* tp_setattr */
    (cmpfunc) unicode_compare, 		/* tp_compare */
    (reprfunc) unicode_repr, 		/* tp_repr */
    &unicode_as_number, 		/* tp_as_number */
    &unicode_as_sequence, 		/* tp_as_sequence */
    &unicode_as_mapping, 		/* tp_as_mapping */
    (hashfunc) unicode_hash, 		/* tp_hash*/
    0, 					/* tp_call*/
    (reprfunc) unicode_str,	 	/* tp_str */
    PyObject_GenericGetAttr, 		/* tp_getattro */
    0,			 		/* tp_setattro */
    &unicode_as_buffer,			/* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
	    Py_TPFLAGS_BASETYPE,	/* tp_flags */
    unicode_doc,			/* tp_doc */
    0,					/* tp_traverse */
    0,					/* tp_clear */
    0,					/* tp_richcompare */
    0,					/* tp_weaklistoffset */
    0,					/* tp_iter */
    0,					/* tp_iternext */
    unicode_methods,			/* tp_methods */
    0,					/* tp_members */
    0,					/* tp_getset */
    &PyBaseString_Type,			/* tp_base */
    0,					/* tp_dict */
    0,					/* tp_descr_get */
    0,					/* tp_descr_set */
    0,					/* tp_dictoffset */
    0,					/* tp_init */
    0,					/* tp_alloc */
    unicode_new,			/* tp_new */
    PyObject_Del,      		/* tp_free */
};
6757
6758/* Initialize the Unicode implementation */
6759
6760void _PyUnicode_Init(void)
6761{
6762    int i;
6763
6764    /* Init the implementation */
6765    unicode_freelist = NULL;
6766    unicode_freelist_size = 0;
6767    unicode_empty = _PyUnicode_New(0);
6768    strcpy(unicode_default_encoding, "ascii");
6769    for (i = 0; i < 256; i++)
6770	unicode_latin1[i] = NULL;
6771    if (PyType_Ready(&PyUnicode_Type) < 0)
6772	Py_FatalError("Can't initialize 'unicode'");
6773}
6774
6775/* Finalize the Unicode implementation */
6776
6777void
6778_PyUnicode_Fini(void)
6779{
6780    PyUnicodeObject *u;
6781    int i;
6782
6783    Py_XDECREF(unicode_empty);
6784    unicode_empty = NULL;
6785
6786    for (i = 0; i < 256; i++) {
6787	if (unicode_latin1[i]) {
6788	    Py_DECREF(unicode_latin1[i]);
6789	    unicode_latin1[i] = NULL;
6790	}
6791    }
6792
6793    for (u = unicode_freelist; u != NULL;) {
6794	PyUnicodeObject *v = u;
6795	u = *(PyUnicodeObject **)u;
6796	if (v->str)
6797	    PyMem_DEL(v->str);
6798	Py_XDECREF(v->defenc);
6799	PyObject_Del(v);
6800    }
6801    unicode_freelist = NULL;
6802    unicode_freelist_size = 0;
6803}
6804
6805/*
6806Local variables:
6807c-basic-offset: 4
6808indent-tabs-mode: nil
6809End:
6810*/
6811