unicodeobject.c revision 782afc5927c5d37c3de1a082b6363a79e4bd5962
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5Unicode Integration Proposal (see file Misc/unicode.txt).
6
7Copyright (c) Corporation for National Research Initiatives.
8
9--------------------------------------------------------------------
10The original string type implementation is:
11
12    Copyright (c) 1999 by Secret Labs AB
13    Copyright (c) 1999 by Fredrik Lundh
14
15By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
38
39#include "Python.h"
40
41#include "unicodeobject.h"
42#include "ucnhash.h"
43
44#ifdef MS_WINDOWS
45#include <windows.h>
46#endif
47
/* Upper bound on the number of dead Unicode objects that are parked on
   the free list for later reuse. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Keep-alive threshold for objects on the free list.

   A Unicode object that lands on the free list keeps its character
   buffer allocated as long as its length is below this limit, which
   saves a malloc() when the object is reused for another small string.

   Worst case, this leaves MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused memory lying around.

   A limit of 0 effectively switches the feature off.

   Note: the feature is experimental.  If Unicode objects cause core
   dumps, switch it off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; little endian unless WORDS_BIGENDIAN is defined */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
78
79/* --- Globals ------------------------------------------------------------
80
81   The globals are initialized by the _PyUnicode_Init() API and should
82   not be used before calling that API.
83
84*/
85
86/* Free list for Unicode objects */
87static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
89
90/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94   shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
97/* Default encoding to use and assume when NULL is passed as encoding
98   parameter; it is initialized by _PyUnicode_Init().
99
100   Always use the PyUnicode_SetDefaultEncoding() and
101   PyUnicode_GetDefaultEncoding() APIs to access this global.
102
103*/
104static char unicode_default_encoding[100];
105
106Py_UNICODE
107PyUnicode_GetMax(void)
108{
109#ifdef Py_UNICODE_WIDE
110	return 0x10FFFF;
111#else
112	/* This is actually an illegal character, so it should
113	   not be passed to unichr. */
114	return 0xFFFF;
115#endif
116}
117
118/* --- Unicode Object ----------------------------------------------------- */
119
120static
121int unicode_resize(register PyUnicodeObject *unicode,
122                      int length)
123{
124    void *oldstr;
125
126    /* Shortcut if there's nothing much to do. */
127    if (unicode->length == length)
128	goto reset;
129
130    /* Resizing shared object (unicode_empty or single character
131       objects) in-place is not allowed. Use PyUnicode_Resize()
132       instead ! */
133    if (unicode == unicode_empty ||
134	(unicode->length == 1 &&
135         /* MvL said unicode->str[] may be signed.  Python generally assumes
136          * an int contains at least 32 bits, and we don't use more than
137          * 32 bits even in a UCS4 build, so casting to unsigned int should
138          * be correct.
139          */
140	 (unsigned int)unicode->str[0] < 256U &&
141	 unicode_latin1[unicode->str[0]] == unicode)) {
142        PyErr_SetString(PyExc_SystemError,
143                        "can't resize shared unicode objects");
144        return -1;
145    }
146
147    /* We allocate one more byte to make sure the string is
148       Ux0000 terminated -- XXX is this needed ? */
149    oldstr = unicode->str;
150    PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
151    if (!unicode->str) {
152	unicode->str = oldstr;
153        PyErr_NoMemory();
154        return -1;
155    }
156    unicode->str[length] = 0;
157    unicode->length = length;
158
159 reset:
160    /* Reset the object caches */
161    if (unicode->defenc) {
162        Py_DECREF(unicode->defenc);
163        unicode->defenc = NULL;
164    }
165    unicode->hash = -1;
166
167    return 0;
168}
169
170/* We allocate one more byte to make sure the string is
171   Ux0000 terminated -- XXX is this needed ?
172
173   XXX This allocator could further be enhanced by assuring that the
174       free list never reduces its size below 1.
175
176*/
177
178static
179PyUnicodeObject *_PyUnicode_New(int length)
180{
181    register PyUnicodeObject *unicode;
182
183    /* Optimization fo empty strings */
184    if (length == 0 && unicode_empty != NULL) {
185        Py_INCREF(unicode_empty);
186        return unicode_empty;
187    }
188
189    /* Unicode freelist & memory allocation */
190    if (unicode_freelist) {
191        unicode = unicode_freelist;
192        unicode_freelist = *(PyUnicodeObject **)unicode;
193        unicode_freelist_size--;
194	if (unicode->str) {
195	    /* Keep-Alive optimization: we only upsize the buffer,
196	       never downsize it. */
197	    if ((unicode->length < length) &&
198                unicode_resize(unicode, length) < 0) {
199		PyMem_DEL(unicode->str);
200		goto onError;
201	    }
202	}
203        else {
204	    unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
205        }
206        PyObject_INIT(unicode, &PyUnicode_Type);
207    }
208    else {
209        unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
210        if (unicode == NULL)
211            return NULL;
212	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
213    }
214
215    if (!unicode->str) {
216	PyErr_NoMemory();
217	goto onError;
218    }
219    /* Initialize the first element to guard against cases where
220     * the caller fails before initializing str -- unicode_resize()
221     * reads str[0], and the Keep-Alive optimization can keep memory
222     * allocated for str alive across a call to unicode_dealloc(unicode).
223     * We don't want unicode_resize to read uninitialized memory in
224     * that case.
225     */
226    unicode->str[0] = 0;
227    unicode->str[length] = 0;
228    unicode->length = length;
229    unicode->hash = -1;
230    unicode->defenc = NULL;
231    return unicode;
232
233 onError:
234    _Py_ForgetReference((PyObject *)unicode);
235    PyObject_Del(unicode);
236    return NULL;
237}
238
239static
240void unicode_dealloc(register PyUnicodeObject *unicode)
241{
242    if (PyUnicode_CheckExact(unicode) &&
243	unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
244        /* Keep-Alive optimization */
245	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
246	    PyMem_DEL(unicode->str);
247	    unicode->str = NULL;
248	    unicode->length = 0;
249	}
250	if (unicode->defenc) {
251	    Py_DECREF(unicode->defenc);
252	    unicode->defenc = NULL;
253	}
254	/* Add to free list */
255        *(PyUnicodeObject **)unicode = unicode_freelist;
256        unicode_freelist = unicode;
257        unicode_freelist_size++;
258    }
259    else {
260	PyMem_DEL(unicode->str);
261	Py_XDECREF(unicode->defenc);
262	unicode->ob_type->tp_free((PyObject *)unicode);
263    }
264}
265
266int PyUnicode_Resize(PyObject **unicode, int length)
267{
268    register PyUnicodeObject *v;
269
270    /* Argument checks */
271    if (unicode == NULL) {
272	PyErr_BadInternalCall();
273	return -1;
274    }
275    v = (PyUnicodeObject *)*unicode;
276    if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
277	PyErr_BadInternalCall();
278	return -1;
279    }
280
281    /* Resizing unicode_empty and single character objects is not
282       possible since these are being shared. We simply return a fresh
283       copy with the same Unicode content. */
284    if (v->length != length &&
285	(v == unicode_empty || v->length == 1)) {
286	PyUnicodeObject *w = _PyUnicode_New(length);
287	if (w == NULL)
288	    return -1;
289	Py_UNICODE_COPY(w->str, v->str,
290			length < v->length ? length : v->length);
291	Py_DECREF(*unicode);
292	*unicode = (PyObject *)w;
293	return 0;
294    }
295
296    /* Note that we don't have to modify *unicode for unshared Unicode
297       objects, since we can modify them in-place. */
298    return unicode_resize(v, length);
299}
300
/* For use inside unicodeobject.c only: casts the address of a
   Unicode-object pointer variable to PyObject ** and forwards to
   PyUnicode_Resize(). */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize((PyObject **)(unicodevar), (length))
304
305PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
306				int size)
307{
308    PyUnicodeObject *unicode;
309
310    /* If the Unicode data is known at construction time, we can apply
311       some optimizations which share commonly used objects. */
312    if (u != NULL) {
313
314	/* Optimization for empty strings */
315	if (size == 0 && unicode_empty != NULL) {
316	    Py_INCREF(unicode_empty);
317	    return (PyObject *)unicode_empty;
318	}
319
320	/* Single character Unicode objects in the Latin-1 range are
321	   shared when using this constructor */
322	if (size == 1 && *u < 256) {
323	    unicode = unicode_latin1[*u];
324	    if (!unicode) {
325		unicode = _PyUnicode_New(1);
326		if (!unicode)
327		    return NULL;
328		unicode->str[0] = *u;
329		unicode_latin1[*u] = unicode;
330	    }
331	    Py_INCREF(unicode);
332	    return (PyObject *)unicode;
333	}
334    }
335
336    unicode = _PyUnicode_New(size);
337    if (!unicode)
338        return NULL;
339
340    /* Copy the Unicode data into the new object */
341    if (u != NULL)
342	Py_UNICODE_COPY(unicode->str, u, size);
343
344    return (PyObject *)unicode;
345}
346
347#ifdef HAVE_WCHAR_H
348
349PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
350				 int size)
351{
352    PyUnicodeObject *unicode;
353
354    if (w == NULL) {
355	PyErr_BadInternalCall();
356	return NULL;
357    }
358
359    unicode = _PyUnicode_New(size);
360    if (!unicode)
361        return NULL;
362
363    /* Copy the wchar_t data into the new object */
364#ifdef HAVE_USABLE_WCHAR_T
365    memcpy(unicode->str, w, size * sizeof(wchar_t));
366#else
367    {
368	register Py_UNICODE *u;
369	register int i;
370	u = PyUnicode_AS_UNICODE(unicode);
371	for (i = size; i >= 0; i--)
372	    *u++ = *w++;
373    }
374#endif
375
376    return (PyObject *)unicode;
377}
378
379int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
380			 register wchar_t *w,
381			 int size)
382{
383    if (unicode == NULL) {
384	PyErr_BadInternalCall();
385	return -1;
386    }
387    if (size > PyUnicode_GET_SIZE(unicode))
388	size = PyUnicode_GET_SIZE(unicode);
389#ifdef HAVE_USABLE_WCHAR_T
390    memcpy(w, unicode->str, size * sizeof(wchar_t));
391#else
392    {
393	register Py_UNICODE *u;
394	register int i;
395	u = PyUnicode_AS_UNICODE(unicode);
396	for (i = size; i >= 0; i--)
397	    *w++ = *u++;
398    }
399#endif
400
401    return size;
402}
403
404#endif
405
406PyObject *PyUnicode_FromOrdinal(int ordinal)
407{
408    Py_UNICODE s[1];
409
410#ifdef Py_UNICODE_WIDE
411    if (ordinal < 0 || ordinal > 0x10ffff) {
412	PyErr_SetString(PyExc_ValueError,
413			"unichr() arg not in range(0x110000) "
414			"(wide Python build)");
415	return NULL;
416    }
417#else
418    if (ordinal < 0 || ordinal > 0xffff) {
419	PyErr_SetString(PyExc_ValueError,
420			"unichr() arg not in range(0x10000) "
421			"(narrow Python build)");
422	return NULL;
423    }
424#endif
425
426    s[0] = (Py_UNICODE)ordinal;
427    return PyUnicode_FromUnicode(s, 1);
428}
429
430PyObject *PyUnicode_FromObject(register PyObject *obj)
431{
432    /* XXX Perhaps we should make this API an alias of
433           PyObject_Unicode() instead ?! */
434    if (PyUnicode_CheckExact(obj)) {
435	Py_INCREF(obj);
436	return obj;
437    }
438    if (PyUnicode_Check(obj)) {
439	/* For a Unicode subtype that's not a Unicode object,
440	   return a true Unicode object with the same data. */
441	return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
442				     PyUnicode_GET_SIZE(obj));
443    }
444    return PyUnicode_FromEncodedObject(obj, NULL, "strict");
445}
446
447PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
448				      const char *encoding,
449				      const char *errors)
450{
451    const char *s = NULL;
452    int len;
453    PyObject *v;
454
455    if (obj == NULL) {
456	PyErr_BadInternalCall();
457	return NULL;
458    }
459
460#if 0
461    /* For b/w compatibility we also accept Unicode objects provided
462       that no encodings is given and then redirect to
463       PyObject_Unicode() which then applies the additional logic for
464       Unicode subclasses.
465
466       NOTE: This API should really only be used for object which
467             represent *encoded* Unicode !
468
469    */
470	if (PyUnicode_Check(obj)) {
471	    if (encoding) {
472		PyErr_SetString(PyExc_TypeError,
473				"decoding Unicode is not supported");
474	    return NULL;
475	    }
476	return PyObject_Unicode(obj);
477	    }
478#else
479    if (PyUnicode_Check(obj)) {
480	PyErr_SetString(PyExc_TypeError,
481			"decoding Unicode is not supported");
482	return NULL;
483	}
484#endif
485
486    /* Coerce object */
487    if (PyString_Check(obj)) {
488	    s = PyString_AS_STRING(obj);
489	    len = PyString_GET_SIZE(obj);
490	    }
491    else if (PyObject_AsCharBuffer(obj, &s, &len)) {
492	/* Overwrite the error message with something more useful in
493	   case of a TypeError. */
494	if (PyErr_ExceptionMatches(PyExc_TypeError))
495	PyErr_Format(PyExc_TypeError,
496			 "coercing to Unicode: need string or buffer, "
497			 "%.80s found",
498		     obj->ob_type->tp_name);
499	goto onError;
500    }
501
502    /* Convert to Unicode */
503    if (len == 0) {
504	Py_INCREF(unicode_empty);
505	v = (PyObject *)unicode_empty;
506    }
507    else
508	v = PyUnicode_Decode(s, len, encoding, errors);
509
510    return v;
511
512 onError:
513    return NULL;
514}
515
516PyObject *PyUnicode_Decode(const char *s,
517			   int size,
518			   const char *encoding,
519			   const char *errors)
520{
521    PyObject *buffer = NULL, *unicode;
522
523    if (encoding == NULL)
524	encoding = PyUnicode_GetDefaultEncoding();
525
526    /* Shortcuts for common default encodings */
527    if (strcmp(encoding, "utf-8") == 0)
528        return PyUnicode_DecodeUTF8(s, size, errors);
529    else if (strcmp(encoding, "latin-1") == 0)
530        return PyUnicode_DecodeLatin1(s, size, errors);
531#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
532    else if (strcmp(encoding, "mbcs") == 0)
533        return PyUnicode_DecodeMBCS(s, size, errors);
534#endif
535    else if (strcmp(encoding, "ascii") == 0)
536        return PyUnicode_DecodeASCII(s, size, errors);
537
538    /* Decode via the codec registry */
539    buffer = PyBuffer_FromMemory((void *)s, size);
540    if (buffer == NULL)
541        goto onError;
542    unicode = PyCodec_Decode(buffer, encoding, errors);
543    if (unicode == NULL)
544        goto onError;
545    if (!PyUnicode_Check(unicode)) {
546        PyErr_Format(PyExc_TypeError,
547                     "decoder did not return an unicode object (type=%.400s)",
548                     unicode->ob_type->tp_name);
549        Py_DECREF(unicode);
550        goto onError;
551    }
552    Py_DECREF(buffer);
553    return unicode;
554
555 onError:
556    Py_XDECREF(buffer);
557    return NULL;
558}
559
560PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
561                                    const char *encoding,
562                                    const char *errors)
563{
564    PyObject *v;
565
566    if (!PyUnicode_Check(unicode)) {
567        PyErr_BadArgument();
568        goto onError;
569    }
570
571    if (encoding == NULL)
572	encoding = PyUnicode_GetDefaultEncoding();
573
574    /* Decode via the codec registry */
575    v = PyCodec_Decode(unicode, encoding, errors);
576    if (v == NULL)
577        goto onError;
578    return v;
579
580 onError:
581    return NULL;
582}
583
584PyObject *PyUnicode_Encode(const Py_UNICODE *s,
585			   int size,
586			   const char *encoding,
587			   const char *errors)
588{
589    PyObject *v, *unicode;
590
591    unicode = PyUnicode_FromUnicode(s, size);
592    if (unicode == NULL)
593	return NULL;
594    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
595    Py_DECREF(unicode);
596    return v;
597}
598
599PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
600                                    const char *encoding,
601                                    const char *errors)
602{
603    PyObject *v;
604
605    if (!PyUnicode_Check(unicode)) {
606        PyErr_BadArgument();
607        goto onError;
608    }
609
610    if (encoding == NULL)
611	encoding = PyUnicode_GetDefaultEncoding();
612
613    /* Encode via the codec registry */
614    v = PyCodec_Encode(unicode, encoding, errors);
615    if (v == NULL)
616        goto onError;
617    return v;
618
619 onError:
620    return NULL;
621}
622
623PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
624                                    const char *encoding,
625                                    const char *errors)
626{
627    PyObject *v;
628
629    if (!PyUnicode_Check(unicode)) {
630        PyErr_BadArgument();
631        goto onError;
632    }
633
634    if (encoding == NULL)
635	encoding = PyUnicode_GetDefaultEncoding();
636
637    /* Shortcuts for common default encodings */
638    if (errors == NULL) {
639	if (strcmp(encoding, "utf-8") == 0)
640	    return PyUnicode_AsUTF8String(unicode);
641	else if (strcmp(encoding, "latin-1") == 0)
642	    return PyUnicode_AsLatin1String(unicode);
643#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
644	else if (strcmp(encoding, "mbcs") == 0)
645	    return PyUnicode_AsMBCSString(unicode);
646#endif
647	else if (strcmp(encoding, "ascii") == 0)
648	    return PyUnicode_AsASCIIString(unicode);
649    }
650
651    /* Encode via the codec registry */
652    v = PyCodec_Encode(unicode, encoding, errors);
653    if (v == NULL)
654        goto onError;
655    if (!PyString_Check(v)) {
656        PyErr_Format(PyExc_TypeError,
657                     "encoder did not return a string object (type=%.400s)",
658                     v->ob_type->tp_name);
659        Py_DECREF(v);
660        goto onError;
661    }
662    return v;
663
664 onError:
665    return NULL;
666}
667
668PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
669					    const char *errors)
670{
671    PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
672
673    if (v)
674        return v;
675    v = PyUnicode_AsEncodedString(unicode, NULL, errors);
676    if (v && errors == NULL)
677        ((PyUnicodeObject *)unicode)->defenc = v;
678    return v;
679}
680
681Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
682{
683    if (!PyUnicode_Check(unicode)) {
684        PyErr_BadArgument();
685        goto onError;
686    }
687    return PyUnicode_AS_UNICODE(unicode);
688
689 onError:
690    return NULL;
691}
692
693int PyUnicode_GetSize(PyObject *unicode)
694{
695    if (!PyUnicode_Check(unicode)) {
696        PyErr_BadArgument();
697        goto onError;
698    }
699    return PyUnicode_GET_SIZE(unicode);
700
701 onError:
702    return -1;
703}
704
705const char *PyUnicode_GetDefaultEncoding(void)
706{
707    return unicode_default_encoding;
708}
709
710int PyUnicode_SetDefaultEncoding(const char *encoding)
711{
712    PyObject *v;
713
714    /* Make sure the encoding is valid. As side effect, this also
715       loads the encoding into the codec registry cache. */
716    v = _PyCodec_Lookup(encoding);
717    if (v == NULL)
718	goto onError;
719    Py_DECREF(v);
720    strncpy(unicode_default_encoding,
721	    encoding,
722	    sizeof(unicode_default_encoding));
723    return 0;
724
725 onError:
726    return -1;
727}
728
729/* error handling callback helper:
730   build arguments, call the callback and check the arguments,
731   if no exception occured, copy the replacement to the output
732   and adjust various state variables.
733   return 0 on success, -1 on error
734*/
735
736static
737int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
738                 const char *encoding, const char *reason,
739                 const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr,
740                 PyObject **output, int *outpos, Py_UNICODE **outptr)
741{
742    static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple";
743
744    PyObject *restuple = NULL;
745    PyObject *repunicode = NULL;
746    int outsize = PyUnicode_GET_SIZE(*output);
747    int requiredsize;
748    int newpos;
749    Py_UNICODE *repptr;
750    int repsize;
751    int res = -1;
752
753    if (*errorHandler == NULL) {
754	*errorHandler = PyCodec_LookupError(errors);
755	if (*errorHandler == NULL)
756	   goto onError;
757    }
758
759    if (*exceptionObject == NULL) {
760    	*exceptionObject = PyUnicodeDecodeError_Create(
761	    encoding, input, insize, *startinpos, *endinpos, reason);
762	if (*exceptionObject == NULL)
763	   goto onError;
764    }
765    else {
766	if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
767	    goto onError;
768	if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
769	    goto onError;
770	if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
771	    goto onError;
772    }
773
774    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
775    if (restuple == NULL)
776	goto onError;
777    if (!PyTuple_Check(restuple)) {
778	PyErr_Format(PyExc_TypeError, &argparse[4]);
779	goto onError;
780    }
781    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
782	goto onError;
783    if (newpos<0)
784	newpos = insize+newpos;
785    if (newpos<0 || newpos>insize) {
786	PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", newpos);
787	goto onError;
788    }
789
790    /* need more space? (at least enough for what we
791       have+the replacement+the rest of the string (starting
792       at the new input position), so we won't have to check space
793       when there are no errors in the rest of the string) */
794    repptr = PyUnicode_AS_UNICODE(repunicode);
795    repsize = PyUnicode_GET_SIZE(repunicode);
796    requiredsize = *outpos + repsize + insize-newpos;
797    if (requiredsize > outsize) {
798	if (requiredsize<2*outsize)
799	    requiredsize = 2*outsize;
800	if (PyUnicode_Resize(output, requiredsize) < 0)
801	    goto onError;
802	*outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
803    }
804    *endinpos = newpos;
805    *inptr = input + newpos;
806    Py_UNICODE_COPY(*outptr, repptr, repsize);
807    *outptr += repsize;
808    *outpos += repsize;
809    /* we made it! */
810    res = 0;
811
812    onError:
813    Py_XDECREF(restuple);
814    return res;
815}
816
817/* --- UTF-7 Codec -------------------------------------------------------- */
818
819/* see RFC2152 for details */
820
/* Classification of the 128 ASCII characters for the UTF-7 codec: tells
   whether a character is special, i.e. cannot be directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static
char utf7_special[128] = {
    /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
839
/* SPECIAL(c, encodeO, encodeWS): true if character c cannot be written
   directly in UTF-7.  That is the case for everything outside ASCII and
   for table class 1, plus whitespace (class 2) when encodeWS is set and
   RFC2152 Set O (class 3) when encodeO is set.

   The range test is done on the unsigned value: the character type may
   be signed, and a negative value must not be used as a table index. */
#define SPECIAL(c, encodeO, encodeWS) \
    (((unsigned long)(c) > 127UL || utf7_special[(c)] == 1) || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* B64(n): the base64 digit for the low six bits of n. */
#define B64(n)  ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true if c is a base64 digit.  isalnum() is only defined
   for values representable as unsigned char (or EOF), so it is asked
   about ASCII values only; anything else is never a base64 digit. */
#define B64CHAR(c) \
    (((unsigned long)(c) < 128UL && isalnum((int)(c))) || \
     (c) == '+' || (c) == '/')

/* UB64(c): the six-bit value of base64 digit c.  The result is only
   meaningful when B64CHAR(c) holds. */
#define UB64(c)        ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                        (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)
849
/* ENCODE(out, ch, bits): while at least six bits are pending in ch,
   write the topmost six pending bits to *out as a base64 digit, advance
   out and reduce bits by six.  Bits that were already written are not
   cleared from ch. */
#define ENCODE(out, ch, bits) \
    while ((bits) >= 6) { \
        *(out)++ = B64((ch) >> ((bits) - 6)); \
        (bits) -= 6; \
    }

/* DECODE(out, ch, bits, surrogate): while at least sixteen bits are
   pending in ch, take the topmost sixteen as one code unit and append
   it to *out.  A unit in the range 0xDC00..0xDFFF is not stored: it
   sets the surrogate flag and reports an error, and the unit that
   follows it is dropped.

   The expanding function must provide a variable `errmsg` and a label
   `utf7Error`. */
#define DECODE(out, ch, bits, surrogate) \
    while ((bits) >= 16) { \
        Py_UNICODE utf7_unit = (Py_UNICODE) (((ch) >> ((bits) - 16)) & 0xffff); \
        (bits) -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct \
               or not */ \
            (surrogate) = 0; \
        } \
        else if (0xDC00 <= utf7_unit && utf7_unit <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            (surrogate) = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } \
        else { \
            *(out)++ = utf7_unit; \
        } \
    }

874
875PyObject *PyUnicode_DecodeUTF7(const char *s,
876			       int size,
877			       const char *errors)
878{
879    const char *starts = s;
880    int startinpos;
881    int endinpos;
882    int outpos;
883    const char *e;
884    PyUnicodeObject *unicode;
885    Py_UNICODE *p;
886    const char *errmsg = "";
887    int inShift = 0;
888    unsigned int bitsleft = 0;
889    unsigned long charsleft = 0;
890    int surrogate = 0;
891    PyObject *errorHandler = NULL;
892    PyObject *exc = NULL;
893
894    unicode = _PyUnicode_New(size);
895    if (!unicode)
896        return NULL;
897    if (size == 0)
898        return (PyObject *)unicode;
899
900    p = unicode->str;
901    e = s + size;
902
903    while (s < e) {
904        Py_UNICODE ch;
905        restart:
906        ch = *s;
907
908        if (inShift) {
909            if ((ch == '-') || !B64CHAR(ch)) {
910                inShift = 0;
911                s++;
912
913                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
914                if (bitsleft >= 6) {
915                    /* The shift sequence has a partial character in it. If
916                       bitsleft < 6 then we could just classify it as padding
917                       but that is not the case here */
918
919                    errmsg = "partial character in shift sequence";
920                    goto utf7Error;
921                }
922                /* According to RFC2152 the remaining bits should be zero. We
923                   choose to signal an error/insert a replacement character
924                   here so indicate the potential of a misencoded character. */
925
926                /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
927                if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
928                    errmsg = "non-zero padding bits in shift sequence";
929                    goto utf7Error;
930                }
931
932                if (ch == '-') {
933                    if ((s < e) && (*(s) == '-')) {
934                        *p++ = '-';
935                        inShift = 1;
936                    }
937                } else if (SPECIAL(ch,0,0)) {
938                    errmsg = "unexpected special character";
939	                goto utf7Error;
940                } else  {
941                    *p++ = ch;
942                }
943            } else {
944                charsleft = (charsleft << 6) | UB64(ch);
945                bitsleft += 6;
946                s++;
947                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
948            }
949        }
950        else if ( ch == '+' ) {
951            startinpos = s-starts;
952            s++;
953            if (s < e && *s == '-') {
954                s++;
955                *p++ = '+';
956            } else
957            {
958                inShift = 1;
959                bitsleft = 0;
960            }
961        }
962        else if (SPECIAL(ch,0,0)) {
963            errmsg = "unexpected special character";
964            s++;
965	        goto utf7Error;
966        }
967        else {
968            *p++ = ch;
969            s++;
970        }
971        continue;
972    utf7Error:
973        outpos = p-PyUnicode_AS_UNICODE(unicode);
974        endinpos = s-starts;
975        if (unicode_decode_call_errorhandler(
976             errors, &errorHandler,
977             "utf7", errmsg,
978             starts, size, &startinpos, &endinpos, &exc, &s,
979             (PyObject **)&unicode, &outpos, &p))
980        goto onError;
981    }
982
983    if (inShift) {
984        outpos = p-PyUnicode_AS_UNICODE(unicode);
985        endinpos = size;
986        if (unicode_decode_call_errorhandler(
987             errors, &errorHandler,
988             "utf7", "unterminated shift sequence",
989             starts, size, &startinpos, &endinpos, &exc, &s,
990             (PyObject **)&unicode, &outpos, &p))
991            goto onError;
992        if (s < e)
993           goto restart;
994    }
995
996    if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
997        goto onError;
998
999    Py_XDECREF(errorHandler);
1000    Py_XDECREF(exc);
1001    return (PyObject *)unicode;
1002
1003onError:
1004    Py_XDECREF(errorHandler);
1005    Py_XDECREF(exc);
1006    Py_DECREF(unicode);
1007    return NULL;
1008}
1009
1010
1011PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1012                   int size,
1013                   int encodeSetO,
1014                   int encodeWhiteSpace,
1015                   const char *errors)
1016{
1017    PyObject *v;
1018    /* It might be possible to tighten this worst case */
1019    unsigned int cbAllocated = 5 * size;
1020    int inShift = 0;
1021    int i = 0;
1022    unsigned int bitsleft = 0;
1023    unsigned long charsleft = 0;
1024    char * out;
1025    char * start;
1026
1027    if (size == 0)
1028		return PyString_FromStringAndSize(NULL, 0);
1029
1030    v = PyString_FromStringAndSize(NULL, cbAllocated);
1031    if (v == NULL)
1032        return NULL;
1033
1034    start = out = PyString_AS_STRING(v);
1035    for (;i < size; ++i) {
1036        Py_UNICODE ch = s[i];
1037
1038        if (!inShift) {
1039            if (ch == '+') {
1040                *out++ = '+';
1041                *out++ = '-';
1042            } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1043                charsleft = ch;
1044                bitsleft = 16;
1045                *out++ = '+';
1046                /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1047                inShift = bitsleft > 0;
1048            } else {
1049                *out++ = (char) ch;
1050            }
1051        } else {
1052            if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1053                *out++ = B64(charsleft << (6-bitsleft));
1054                charsleft = 0;
1055                bitsleft = 0;
1056                /* Characters not in the BASE64 set implicitly unshift the sequence
1057                   so no '-' is required, except if the character is itself a '-' */
1058                if (B64CHAR(ch) || ch == '-') {
1059                    *out++ = '-';
1060                }
1061                inShift = 0;
1062                *out++ = (char) ch;
1063            } else {
1064                bitsleft += 16;
1065                charsleft = (charsleft << 16) | ch;
1066                /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1067
1068                /* If the next character is special then we dont' need to terminate
1069                   the shift sequence. If the next character is not a BASE64 character
1070                   or '-' then the shift sequence will be terminated implicitly and we
1071                   don't have to insert a '-'. */
1072
1073                if (bitsleft == 0) {
1074                    if (i + 1 < size) {
1075                        Py_UNICODE ch2 = s[i+1];
1076
1077                        if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
1078
1079                        } else if (B64CHAR(ch2) || ch2 == '-') {
1080                            *out++ = '-';
1081                            inShift = 0;
1082                        } else {
1083                            inShift = 0;
1084                        }
1085
1086                    }
1087                    else {
1088                        *out++ = '-';
1089                        inShift = 0;
1090                    }
1091                }
1092            }
1093        }
1094    }
1095    if (bitsleft) {
1096        *out++= B64(charsleft << (6-bitsleft) );
1097        *out++ = '-';
1098    }
1099
1100    _PyString_Resize(&v, out - start);
1101    return v;
1102}
1103
1104#undef SPECIAL
1105#undef B64
1106#undef B64CHAR
1107#undef UB64
1108#undef ENCODE
1109#undef DECODE
1110
1111/* --- UTF-8 Codec -------------------------------------------------------- */
1112
static
char utf8_code_length[256] = {
    /* Indexed by the first byte of a UTF-8 sequence; the value is the
       total length of that sequence in bytes.  A value of zero marks a
       byte that cannot start a sequence.  See RFC 2279 for details. */

    /* 0x00 - 0x7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,

    /* 0x80 - 0xBF: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,

    /* 0xC0 - 0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2,   2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2,   2, 2, 2, 2, 2, 2, 2, 2,

    /* 0xE0 - 0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3,   3, 3, 3, 3, 3, 3, 3, 3,

    /* 0xF0 - 0xF7: four bytes; 0xF8 - 0xFB: five; 0xFC - 0xFD: six;
       0xFE - 0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4,   5, 5, 5, 5, 6, 6, 0, 0
};
1134
1135PyObject *PyUnicode_DecodeUTF8(const char *s,
1136			       int size,
1137			       const char *errors)
1138{
1139    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1140}
1141
1142PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1143			                int size,
1144			                const char *errors,
1145			                int *consumed)
1146{
1147    const char *starts = s;
1148    int n;
1149    int startinpos;
1150    int endinpos;
1151    int outpos;
1152    const char *e;
1153    PyUnicodeObject *unicode;
1154    Py_UNICODE *p;
1155    const char *errmsg = "";
1156    PyObject *errorHandler = NULL;
1157    PyObject *exc = NULL;
1158
1159    /* Note: size will always be longer than the resulting Unicode
1160       character count */
1161    unicode = _PyUnicode_New(size);
1162    if (!unicode)
1163        return NULL;
1164    if (size == 0) {
1165        if (consumed)
1166            *consumed = 0;
1167        return (PyObject *)unicode;
1168    }
1169
1170    /* Unpack UTF-8 encoded data */
1171    p = unicode->str;
1172    e = s + size;
1173
1174    while (s < e) {
1175        Py_UCS4 ch = (unsigned char)*s;
1176
1177        if (ch < 0x80) {
1178            *p++ = (Py_UNICODE)ch;
1179            s++;
1180            continue;
1181        }
1182
1183        n = utf8_code_length[ch];
1184
1185        if (s + n > e) {
1186	    if (consumed)
1187		break;
1188	    else {
1189		errmsg = "unexpected end of data";
1190		startinpos = s-starts;
1191		endinpos = size;
1192		goto utf8Error;
1193	    }
1194	}
1195
1196        switch (n) {
1197
1198        case 0:
1199            errmsg = "unexpected code byte";
1200	    startinpos = s-starts;
1201	    endinpos = startinpos+1;
1202	    goto utf8Error;
1203
1204        case 1:
1205            errmsg = "internal error";
1206	    startinpos = s-starts;
1207	    endinpos = startinpos+1;
1208	    goto utf8Error;
1209
1210        case 2:
1211            if ((s[1] & 0xc0) != 0x80) {
1212                errmsg = "invalid data";
1213		startinpos = s-starts;
1214		endinpos = startinpos+2;
1215		goto utf8Error;
1216	    }
1217            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1218            if (ch < 0x80) {
1219		startinpos = s-starts;
1220		endinpos = startinpos+2;
1221                errmsg = "illegal encoding";
1222		goto utf8Error;
1223	    }
1224	    else
1225		*p++ = (Py_UNICODE)ch;
1226            break;
1227
1228        case 3:
1229            if ((s[1] & 0xc0) != 0x80 ||
1230                (s[2] & 0xc0) != 0x80) {
1231                errmsg = "invalid data";
1232		startinpos = s-starts;
1233		endinpos = startinpos+3;
1234		goto utf8Error;
1235	    }
1236            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
1237            if (ch < 0x0800) {
1238		/* Note: UTF-8 encodings of surrogates are considered
1239		   legal UTF-8 sequences;
1240
1241		   XXX For wide builds (UCS-4) we should probably try
1242		       to recombine the surrogates into a single code
1243		       unit.
1244		*/
1245                errmsg = "illegal encoding";
1246		startinpos = s-starts;
1247		endinpos = startinpos+3;
1248		goto utf8Error;
1249	    }
1250	    else
1251		*p++ = (Py_UNICODE)ch;
1252            break;
1253
1254        case 4:
1255            if ((s[1] & 0xc0) != 0x80 ||
1256                (s[2] & 0xc0) != 0x80 ||
1257                (s[3] & 0xc0) != 0x80) {
1258                errmsg = "invalid data";
1259		startinpos = s-starts;
1260		endinpos = startinpos+4;
1261		goto utf8Error;
1262	    }
1263            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1264                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1265            /* validate and convert to UTF-16 */
1266            if ((ch < 0x10000)        /* minimum value allowed for 4
1267					 byte encoding */
1268                || (ch > 0x10ffff))   /* maximum value allowed for
1269					 UTF-16 */
1270	    {
1271                errmsg = "illegal encoding";
1272		startinpos = s-starts;
1273		endinpos = startinpos+4;
1274		goto utf8Error;
1275	    }
1276#ifdef Py_UNICODE_WIDE
1277	    *p++ = (Py_UNICODE)ch;
1278#else
1279            /*  compute and append the two surrogates: */
1280
1281            /*  translate from 10000..10FFFF to 0..FFFF */
1282            ch -= 0x10000;
1283
1284            /*  high surrogate = top 10 bits added to D800 */
1285            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1286
1287            /*  low surrogate = bottom 10 bits added to DC00 */
1288            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
1289#endif
1290            break;
1291
1292        default:
1293            /* Other sizes are only needed for UCS-4 */
1294            errmsg = "unsupported Unicode code range";
1295	    startinpos = s-starts;
1296	    endinpos = startinpos+n;
1297	    goto utf8Error;
1298        }
1299        s += n;
1300	continue;
1301
1302    utf8Error:
1303    outpos = p-PyUnicode_AS_UNICODE(unicode);
1304    if (unicode_decode_call_errorhandler(
1305	     errors, &errorHandler,
1306	     "utf8", errmsg,
1307	     starts, size, &startinpos, &endinpos, &exc, &s,
1308	     (PyObject **)&unicode, &outpos, &p))
1309	goto onError;
1310    }
1311    if (consumed)
1312	*consumed = s-starts;
1313
1314    /* Adjust length */
1315    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
1316        goto onError;
1317
1318    Py_XDECREF(errorHandler);
1319    Py_XDECREF(exc);
1320    return (PyObject *)unicode;
1321
1322onError:
1323    Py_XDECREF(errorHandler);
1324    Py_XDECREF(exc);
1325    Py_DECREF(unicode);
1326    return NULL;
1327}
1328
1329/* Allocation strategy:  if the string is short, convert into a stack buffer
1330   and allocate exactly as much space needed at the end.  Else allocate the
1331   maximum possible needed (4 result bytes per Unicode character), and return
1332   the excess memory at the end.
1333*/
1334PyObject *
1335PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1336		     int size,
1337		     const char *errors)
1338{
1339#define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
1340
1341    int i;              /* index into s of next input byte */
1342    PyObject *v;        /* result string object */
1343    char *p;            /* next free byte in output buffer */
1344    int nallocated;     /* number of result bytes allocated */
1345    int nneeded;        /* number of result bytes needed */
1346    char stackbuf[MAX_SHORT_UNICHARS * 4];
1347
1348    assert(s != NULL);
1349    assert(size >= 0);
1350
1351    if (size <= MAX_SHORT_UNICHARS) {
1352        /* Write into the stack buffer; nallocated can't overflow.
1353         * At the end, we'll allocate exactly as much heap space as it
1354         * turns out we need.
1355         */
1356        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1357        v = NULL;   /* will allocate after we're done */
1358        p = stackbuf;
1359    }
1360    else {
1361        /* Overallocate on the heap, and give the excess back at the end. */
1362        nallocated = size * 4;
1363        if (nallocated / 4 != size)  /* overflow! */
1364            return PyErr_NoMemory();
1365        v = PyString_FromStringAndSize(NULL, nallocated);
1366        if (v == NULL)
1367            return NULL;
1368        p = PyString_AS_STRING(v);
1369    }
1370
1371    for (i = 0; i < size;) {
1372        Py_UCS4 ch = s[i++];
1373
1374        if (ch < 0x80)
1375            /* Encode ASCII */
1376            *p++ = (char) ch;
1377
1378        else if (ch < 0x0800) {
1379            /* Encode Latin-1 */
1380            *p++ = (char)(0xc0 | (ch >> 6));
1381            *p++ = (char)(0x80 | (ch & 0x3f));
1382        }
1383        else {
1384            /* Encode UCS2 Unicode ordinals */
1385            if (ch < 0x10000) {
1386                /* Special case: check for high surrogate */
1387                if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1388                    Py_UCS4 ch2 = s[i];
1389                    /* Check for low surrogate and combine the two to
1390                       form a UCS4 value */
1391                    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1392                        ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
1393                        i++;
1394                        goto encodeUCS4;
1395                    }
1396                    /* Fall through: handles isolated high surrogates */
1397                }
1398                *p++ = (char)(0xe0 | (ch >> 12));
1399                *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1400                *p++ = (char)(0x80 | (ch & 0x3f));
1401                continue;
1402    	    }
1403encodeUCS4:
1404            /* Encode UCS4 Unicode ordinals */
1405            *p++ = (char)(0xf0 | (ch >> 18));
1406            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1407            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1408            *p++ = (char)(0x80 | (ch & 0x3f));
1409        }
1410    }
1411
1412    if (v == NULL) {
1413        /* This was stack allocated. */
1414        nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1415        assert(nneeded <= nallocated);
1416        v = PyString_FromStringAndSize(stackbuf, nneeded);
1417    }
1418    else {
1419    	/* Cut back to size actually needed. */
1420        nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1421        assert(nneeded <= nallocated);
1422        _PyString_Resize(&v, nneeded);
1423    }
1424    return v;
1425
1426#undef MAX_SHORT_UNICHARS
1427}
1428
1429PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1430{
1431    if (!PyUnicode_Check(unicode)) {
1432        PyErr_BadArgument();
1433        return NULL;
1434    }
1435    return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1436				PyUnicode_GET_SIZE(unicode),
1437				NULL);
1438}
1439
1440/* --- UTF-16 Codec ------------------------------------------------------- */
1441
1442PyObject *
1443PyUnicode_DecodeUTF16(const char *s,
1444		      int size,
1445		      const char *errors,
1446		      int *byteorder)
1447{
1448    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1449}
1450
1451PyObject *
1452PyUnicode_DecodeUTF16Stateful(const char *s,
1453			      int size,
1454			      const char *errors,
1455			      int *byteorder,
1456			      int *consumed)
1457{
1458    const char *starts = s;
1459    int startinpos;
1460    int endinpos;
1461    int outpos;
1462    PyUnicodeObject *unicode;
1463    Py_UNICODE *p;
1464    const unsigned char *q, *e;
1465    int bo = 0;       /* assume native ordering by default */
1466    const char *errmsg = "";
1467    /* Offsets from q for retrieving byte pairs in the right order. */
1468#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1469    int ihi = 1, ilo = 0;
1470#else
1471    int ihi = 0, ilo = 1;
1472#endif
1473    PyObject *errorHandler = NULL;
1474    PyObject *exc = NULL;
1475
1476    /* Note: size will always be longer than the resulting Unicode
1477       character count */
1478    unicode = _PyUnicode_New(size);
1479    if (!unicode)
1480        return NULL;
1481    if (size == 0)
1482        return (PyObject *)unicode;
1483
1484    /* Unpack UTF-16 encoded data */
1485    p = unicode->str;
1486    q = (unsigned char *)s;
1487    e = q + size;
1488
1489    if (byteorder)
1490        bo = *byteorder;
1491
1492    /* Check for BOM marks (U+FEFF) in the input and adjust current
1493       byte order setting accordingly. In native mode, the leading BOM
1494       mark is skipped, in all other modes, it is copied to the output
1495       stream as-is (giving a ZWNBSP character). */
1496    if (bo == 0) {
1497        if (size >= 2) {
1498            const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
1499#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1500	    if (bom == 0xFEFF) {
1501		q += 2;
1502		bo = -1;
1503	    }
1504	    else if (bom == 0xFFFE) {
1505		q += 2;
1506		bo = 1;
1507	    }
1508#else
1509	    if (bom == 0xFEFF) {
1510		q += 2;
1511		bo = 1;
1512	    }
1513	    else if (bom == 0xFFFE) {
1514		q += 2;
1515		bo = -1;
1516	    }
1517#endif
1518	}
1519    }
1520
1521    if (bo == -1) {
1522        /* force LE */
1523        ihi = 1;
1524        ilo = 0;
1525    }
1526    else if (bo == 1) {
1527        /* force BE */
1528        ihi = 0;
1529        ilo = 1;
1530    }
1531
1532    while (q < e) {
1533	Py_UNICODE ch;
1534	/* remaining bytes at the end? (size should be even) */
1535	if (e-q<2) {
1536	    if (consumed)
1537		break;
1538	    errmsg = "truncated data";
1539	    startinpos = ((const char *)q)-starts;
1540	    endinpos = ((const char *)e)-starts;
1541	    goto utf16Error;
1542	    /* The remaining input chars are ignored if the callback
1543	       chooses to skip the input */
1544	}
1545	ch = (q[ihi] << 8) | q[ilo];
1546
1547	q += 2;
1548
1549	if (ch < 0xD800 || ch > 0xDFFF) {
1550	    *p++ = ch;
1551	    continue;
1552	}
1553
1554	/* UTF-16 code pair: */
1555	if (q >= e) {
1556	    errmsg = "unexpected end of data";
1557	    startinpos = (((const char *)q)-2)-starts;
1558	    endinpos = ((const char *)e)-starts;
1559	    goto utf16Error;
1560	}
1561	if (0xD800 <= ch && ch <= 0xDBFF) {
1562	    Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1563	    q += 2;
1564	    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1565#ifndef Py_UNICODE_WIDE
1566		*p++ = ch;
1567		*p++ = ch2;
1568#else
1569		*p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
1570#endif
1571		continue;
1572	    }
1573	    else {
1574                errmsg = "illegal UTF-16 surrogate";
1575		startinpos = (((const char *)q)-4)-starts;
1576		endinpos = startinpos+2;
1577		goto utf16Error;
1578	    }
1579
1580	}
1581	errmsg = "illegal encoding";
1582	startinpos = (((const char *)q)-2)-starts;
1583	endinpos = startinpos+2;
1584	/* Fall through to report the error */
1585
1586    utf16Error:
1587	outpos = p-PyUnicode_AS_UNICODE(unicode);
1588	if (unicode_decode_call_errorhandler(
1589	         errors, &errorHandler,
1590	         "utf16", errmsg,
1591	         starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1592	         (PyObject **)&unicode, &outpos, &p))
1593	    goto onError;
1594    }
1595
1596    if (byteorder)
1597        *byteorder = bo;
1598
1599    if (consumed)
1600	*consumed = (const char *)q-starts;
1601
1602    /* Adjust length */
1603    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
1604        goto onError;
1605
1606    Py_XDECREF(errorHandler);
1607    Py_XDECREF(exc);
1608    return (PyObject *)unicode;
1609
1610onError:
1611    Py_DECREF(unicode);
1612    Py_XDECREF(errorHandler);
1613    Py_XDECREF(exc);
1614    return NULL;
1615}
1616
1617PyObject *
1618PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1619		      int size,
1620		      const char *errors,
1621		      int byteorder)
1622{
1623    PyObject *v;
1624    unsigned char *p;
1625#ifdef Py_UNICODE_WIDE
1626    int i, pairs;
1627#else
1628    const int pairs = 0;
1629#endif
1630    /* Offsets from p for storing byte pairs in the right order. */
1631#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1632    int ihi = 1, ilo = 0;
1633#else
1634    int ihi = 0, ilo = 1;
1635#endif
1636
1637#define STORECHAR(CH)                   \
1638    do {                                \
1639        p[ihi] = ((CH) >> 8) & 0xff;    \
1640        p[ilo] = (CH) & 0xff;           \
1641        p += 2;                         \
1642    } while(0)
1643
1644#ifdef Py_UNICODE_WIDE
1645    for (i = pairs = 0; i < size; i++)
1646	if (s[i] >= 0x10000)
1647	    pairs++;
1648#endif
1649    v = PyString_FromStringAndSize(NULL,
1650		  2 * (size + pairs + (byteorder == 0)));
1651    if (v == NULL)
1652        return NULL;
1653
1654    p = (unsigned char *)PyString_AS_STRING(v);
1655    if (byteorder == 0)
1656	STORECHAR(0xFEFF);
1657    if (size == 0)
1658        return v;
1659
1660    if (byteorder == -1) {
1661        /* force LE */
1662        ihi = 1;
1663        ilo = 0;
1664    }
1665    else if (byteorder == 1) {
1666        /* force BE */
1667        ihi = 0;
1668        ilo = 1;
1669    }
1670
1671    while (size-- > 0) {
1672	Py_UNICODE ch = *s++;
1673	Py_UNICODE ch2 = 0;
1674#ifdef Py_UNICODE_WIDE
1675	if (ch >= 0x10000) {
1676	    ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1677	    ch  = 0xD800 | ((ch-0x10000) >> 10);
1678	}
1679#endif
1680        STORECHAR(ch);
1681        if (ch2)
1682            STORECHAR(ch2);
1683    }
1684    return v;
1685#undef STORECHAR
1686}
1687
1688PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1689{
1690    if (!PyUnicode_Check(unicode)) {
1691        PyErr_BadArgument();
1692        return NULL;
1693    }
1694    return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1695				 PyUnicode_GET_SIZE(unicode),
1696				 NULL,
1697				 0);
1698}
1699
1700/* --- Unicode Escape Codec ----------------------------------------------- */
1701
1702static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
1703
1704PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1705					int size,
1706					const char *errors)
1707{
1708    const char *starts = s;
1709    int startinpos;
1710    int endinpos;
1711    int outpos;
1712    int i;
1713    PyUnicodeObject *v;
1714    Py_UNICODE *p;
1715    const char *end;
1716    char* message;
1717    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1718    PyObject *errorHandler = NULL;
1719    PyObject *exc = NULL;
1720
1721    /* Escaped strings will always be longer than the resulting
1722       Unicode string, so we start with size here and then reduce the
1723       length after conversion to the true value.
1724       (but if the error callback returns a long replacement string
1725       we'll have to allocate more space) */
1726    v = _PyUnicode_New(size);
1727    if (v == NULL)
1728        goto onError;
1729    if (size == 0)
1730        return (PyObject *)v;
1731
1732    p = PyUnicode_AS_UNICODE(v);
1733    end = s + size;
1734
1735    while (s < end) {
1736        unsigned char c;
1737        Py_UNICODE x;
1738        int digits;
1739
1740        /* Non-escape characters are interpreted as Unicode ordinals */
1741        if (*s != '\\') {
1742            *p++ = (unsigned char) *s++;
1743            continue;
1744        }
1745
1746        startinpos = s-starts;
1747        /* \ - Escapes */
1748        s++;
1749        switch (*s++) {
1750
1751        /* \x escapes */
1752        case '\n': break;
1753        case '\\': *p++ = '\\'; break;
1754        case '\'': *p++ = '\''; break;
1755        case '\"': *p++ = '\"'; break;
1756        case 'b': *p++ = '\b'; break;
1757        case 'f': *p++ = '\014'; break; /* FF */
1758        case 't': *p++ = '\t'; break;
1759        case 'n': *p++ = '\n'; break;
1760        case 'r': *p++ = '\r'; break;
1761        case 'v': *p++ = '\013'; break; /* VT */
1762        case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1763
1764        /* \OOO (octal) escapes */
1765        case '0': case '1': case '2': case '3':
1766        case '4': case '5': case '6': case '7':
1767            x = s[-1] - '0';
1768            if ('0' <= *s && *s <= '7') {
1769                x = (x<<3) + *s++ - '0';
1770                if ('0' <= *s && *s <= '7')
1771                    x = (x<<3) + *s++ - '0';
1772            }
1773            *p++ = x;
1774            break;
1775
1776        /* hex escapes */
1777        /* \xXX */
1778        case 'x':
1779            digits = 2;
1780            message = "truncated \\xXX escape";
1781            goto hexescape;
1782
1783        /* \uXXXX */
1784        case 'u':
1785            digits = 4;
1786            message = "truncated \\uXXXX escape";
1787            goto hexescape;
1788
1789        /* \UXXXXXXXX */
1790        case 'U':
1791            digits = 8;
1792            message = "truncated \\UXXXXXXXX escape";
1793        hexescape:
1794            chr = 0;
1795            outpos = p-PyUnicode_AS_UNICODE(v);
1796            if (s+digits>end) {
1797                endinpos = size;
1798                if (unicode_decode_call_errorhandler(
1799                    errors, &errorHandler,
1800                    "unicodeescape", "end of string in escape sequence",
1801                    starts, size, &startinpos, &endinpos, &exc, &s,
1802                    (PyObject **)&v, &outpos, &p))
1803                    goto onError;
1804                goto nextByte;
1805            }
1806            for (i = 0; i < digits; ++i) {
1807                c = (unsigned char) s[i];
1808                if (!isxdigit(c)) {
1809                    endinpos = (s+i+1)-starts;
1810                    if (unicode_decode_call_errorhandler(
1811                        errors, &errorHandler,
1812                        "unicodeescape", message,
1813                        starts, size, &startinpos, &endinpos, &exc, &s,
1814                        (PyObject **)&v, &outpos, &p))
1815                        goto onError;
1816                    goto nextByte;
1817                }
1818                chr = (chr<<4) & ~0xF;
1819                if (c >= '0' && c <= '9')
1820                    chr += c - '0';
1821                else if (c >= 'a' && c <= 'f')
1822                    chr += 10 + c - 'a';
1823                else
1824                    chr += 10 + c - 'A';
1825            }
1826            s += i;
1827            if (chr == 0xffffffff && PyErr_Occurred())
1828                /* _decoding_error will have already written into the
1829                   target buffer. */
1830                break;
1831        store:
1832            /* when we get here, chr is a 32-bit unicode character */
1833            if (chr <= 0xffff)
1834                /* UCS-2 character */
1835                *p++ = (Py_UNICODE) chr;
1836            else if (chr <= 0x10ffff) {
1837                /* UCS-4 character. Either store directly, or as
1838                   surrogate pair. */
1839#ifdef Py_UNICODE_WIDE
1840                *p++ = chr;
1841#else
1842                chr -= 0x10000L;
1843                *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1844                *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
1845#endif
1846            } else {
1847                endinpos = s-starts;
1848                outpos = p-PyUnicode_AS_UNICODE(v);
1849                if (unicode_decode_call_errorhandler(
1850                    errors, &errorHandler,
1851                    "unicodeescape", "illegal Unicode character",
1852                    starts, size, &startinpos, &endinpos, &exc, &s,
1853                    (PyObject **)&v, &outpos, &p))
1854                    goto onError;
1855            }
1856            break;
1857
1858        /* \N{name} */
1859        case 'N':
1860            message = "malformed \\N character escape";
1861            if (ucnhash_CAPI == NULL) {
1862                /* load the unicode data module */
1863                PyObject *m, *v;
1864                m = PyImport_ImportModule("unicodedata");
1865                if (m == NULL)
1866                    goto ucnhashError;
1867                v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1868                Py_DECREF(m);
1869                if (v == NULL)
1870                    goto ucnhashError;
1871                ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1872                Py_DECREF(v);
1873                if (ucnhash_CAPI == NULL)
1874                    goto ucnhashError;
1875            }
1876            if (*s == '{') {
1877                const char *start = s+1;
1878                /* look for the closing brace */
1879                while (*s != '}' && s < end)
1880                    s++;
1881                if (s > start && s < end && *s == '}') {
1882                    /* found a name.  look it up in the unicode database */
1883                    message = "unknown Unicode character name";
1884                    s++;
1885                    if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1886                        goto store;
1887                }
1888            }
1889            endinpos = s-starts;
1890            outpos = p-PyUnicode_AS_UNICODE(v);
1891            if (unicode_decode_call_errorhandler(
1892                errors, &errorHandler,
1893                "unicodeescape", message,
1894                starts, size, &startinpos, &endinpos, &exc, &s,
1895                (PyObject **)&v, &outpos, &p))
1896                goto onError;
1897            break;
1898
1899        default:
1900            if (s > end) {
1901                message = "\\ at end of string";
1902                s--;
1903                endinpos = s-starts;
1904                outpos = p-PyUnicode_AS_UNICODE(v);
1905                if (unicode_decode_call_errorhandler(
1906                    errors, &errorHandler,
1907                    "unicodeescape", message,
1908                    starts, size, &startinpos, &endinpos, &exc, &s,
1909                    (PyObject **)&v, &outpos, &p))
1910                    goto onError;
1911            }
1912            else {
1913                *p++ = '\\';
1914                *p++ = (unsigned char)s[-1];
1915            }
1916            break;
1917        }
1918        nextByte:
1919        ;
1920    }
1921    if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
1922        goto onError;
1923    Py_XDECREF(errorHandler);
1924    Py_XDECREF(exc);
1925    return (PyObject *)v;
1926
1927ucnhashError:
1928    PyErr_SetString(
1929        PyExc_UnicodeError,
1930        "\\N escapes not supported (can't load unicodedata module)"
1931        );
1932    Py_XDECREF(errorHandler);
1933    Py_XDECREF(exc);
1934    return NULL;
1935
1936onError:
1937    Py_XDECREF(v);
1938    Py_XDECREF(errorHandler);
1939    Py_XDECREF(exc);
1940    return NULL;
1941}
1942
1943/* Return a Unicode-Escape string version of the Unicode object.
1944
1945   If quotes is true, the string is enclosed in u"" or u'' quotes as
1946   appropriate.
1947
1948*/
1949
1950static const Py_UNICODE *findchar(const Py_UNICODE *s,
1951				  int size,
1952				  Py_UNICODE ch);
1953
1954static
1955PyObject *unicodeescape_string(const Py_UNICODE *s,
1956                               int size,
1957                               int quotes)
1958{
1959    PyObject *repr;
1960    char *p;
1961
1962    static const char *hexdigit = "0123456789abcdef";
1963
1964    repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1965    if (repr == NULL)
1966        return NULL;
1967
1968    p = PyString_AS_STRING(repr);
1969
1970    if (quotes) {
1971        *p++ = 'u';
1972        *p++ = (findchar(s, size, '\'') &&
1973                !findchar(s, size, '"')) ? '"' : '\'';
1974    }
1975    while (size-- > 0) {
1976        Py_UNICODE ch = *s++;
1977
1978        /* Escape quotes */
1979        if (quotes &&
1980	    (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
1981            *p++ = '\\';
1982            *p++ = (char) ch;
1983	    continue;
1984        }
1985
1986#ifdef Py_UNICODE_WIDE
1987        /* Map 21-bit characters to '\U00xxxxxx' */
1988        else if (ch >= 0x10000) {
1989	    int offset = p - PyString_AS_STRING(repr);
1990
1991	    /* Resize the string if necessary */
1992	    if (offset + 12 > PyString_GET_SIZE(repr)) {
1993		if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
1994		    return NULL;
1995		p = PyString_AS_STRING(repr) + offset;
1996	    }
1997
1998            *p++ = '\\';
1999            *p++ = 'U';
2000            *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2001            *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2002            *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2003            *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2004            *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2005            *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2006            *p++ = hexdigit[(ch >> 4) & 0x0000000F];
2007            *p++ = hexdigit[ch & 0x0000000F];
2008	    continue;
2009        }
2010#endif
2011	/* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
2012	else if (ch >= 0xD800 && ch < 0xDC00) {
2013	    Py_UNICODE ch2;
2014	    Py_UCS4 ucs;
2015
2016	    ch2 = *s++;
2017	    size--;
2018	    if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2019		ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2020		*p++ = '\\';
2021		*p++ = 'U';
2022		*p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2023		*p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2024		*p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2025		*p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2026		*p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2027		*p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2028		*p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2029		*p++ = hexdigit[ucs & 0x0000000F];
2030		continue;
2031	    }
2032	    /* Fall through: isolated surrogates are copied as-is */
2033	    s--;
2034	    size++;
2035	}
2036
2037        /* Map 16-bit characters to '\uxxxx' */
2038        if (ch >= 256) {
2039            *p++ = '\\';
2040            *p++ = 'u';
2041            *p++ = hexdigit[(ch >> 12) & 0x000F];
2042            *p++ = hexdigit[(ch >> 8) & 0x000F];
2043            *p++ = hexdigit[(ch >> 4) & 0x000F];
2044            *p++ = hexdigit[ch & 0x000F];
2045        }
2046
2047        /* Map special whitespace to '\t', \n', '\r' */
2048        else if (ch == '\t') {
2049            *p++ = '\\';
2050            *p++ = 't';
2051        }
2052        else if (ch == '\n') {
2053            *p++ = '\\';
2054            *p++ = 'n';
2055        }
2056        else if (ch == '\r') {
2057            *p++ = '\\';
2058            *p++ = 'r';
2059        }
2060
2061        /* Map non-printable US ASCII to '\xhh' */
2062        else if (ch < ' ' || ch >= 0x7F) {
2063            *p++ = '\\';
2064            *p++ = 'x';
2065            *p++ = hexdigit[(ch >> 4) & 0x000F];
2066            *p++ = hexdigit[ch & 0x000F];
2067        }
2068
2069        /* Copy everything else as-is */
2070        else
2071            *p++ = (char) ch;
2072    }
2073    if (quotes)
2074        *p++ = PyString_AS_STRING(repr)[1];
2075
2076    *p = '\0';
2077    _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
2078    return repr;
2079}
2080
2081PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2082					int size)
2083{
2084    return unicodeescape_string(s, size, 0);
2085}
2086
2087PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2088{
2089    if (!PyUnicode_Check(unicode)) {
2090        PyErr_BadArgument();
2091        return NULL;
2092    }
2093    return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2094					 PyUnicode_GET_SIZE(unicode));
2095}
2096
2097/* --- Raw Unicode Escape Codec ------------------------------------------- */
2098
2099PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2100					   int size,
2101					   const char *errors)
2102{
2103    const char *starts = s;
2104    int startinpos;
2105    int endinpos;
2106    int outpos;
2107    PyUnicodeObject *v;
2108    Py_UNICODE *p;
2109    const char *end;
2110    const char *bs;
2111    PyObject *errorHandler = NULL;
2112    PyObject *exc = NULL;
2113
2114    /* Escaped strings will always be longer than the resulting
2115       Unicode string, so we start with size here and then reduce the
2116       length after conversion to the true value. (But decoding error
2117       handler might have to resize the string) */
2118    v = _PyUnicode_New(size);
2119    if (v == NULL)
2120	goto onError;
2121    if (size == 0)
2122	return (PyObject *)v;
2123    p = PyUnicode_AS_UNICODE(v);
2124    end = s + size;
2125    while (s < end) {
2126	unsigned char c;
2127	Py_UCS4 x;
2128	int i;
2129        int count;
2130
2131	/* Non-escape characters are interpreted as Unicode ordinals */
2132	if (*s != '\\') {
2133	    *p++ = (unsigned char)*s++;
2134	    continue;
2135	}
2136	startinpos = s-starts;
2137
2138	/* \u-escapes are only interpreted iff the number of leading
2139	   backslashes if odd */
2140	bs = s;
2141	for (;s < end;) {
2142	    if (*s != '\\')
2143		break;
2144	    *p++ = (unsigned char)*s++;
2145	}
2146	if (((s - bs) & 1) == 0 ||
2147	    s >= end ||
2148	    (*s != 'u' && *s != 'U')) {
2149	    continue;
2150	}
2151	p--;
2152        count = *s=='u' ? 4 : 8;
2153	s++;
2154
2155	/* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
2156	outpos = p-PyUnicode_AS_UNICODE(v);
2157	for (x = 0, i = 0; i < count; ++i, ++s) {
2158	    c = (unsigned char)*s;
2159	    if (!isxdigit(c)) {
2160		endinpos = s-starts;
2161		if (unicode_decode_call_errorhandler(
2162		    errors, &errorHandler,
2163		    "rawunicodeescape", "truncated \\uXXXX",
2164		    starts, size, &startinpos, &endinpos, &exc, &s,
2165		    (PyObject **)&v, &outpos, &p))
2166		    goto onError;
2167		goto nextByte;
2168	    }
2169	    x = (x<<4) & ~0xF;
2170	    if (c >= '0' && c <= '9')
2171		x += c - '0';
2172	    else if (c >= 'a' && c <= 'f')
2173		x += 10 + c - 'a';
2174	    else
2175		x += 10 + c - 'A';
2176	}
2177#ifndef Py_UNICODE_WIDE
2178        if (x > 0x10000) {
2179            if (unicode_decode_call_errorhandler(
2180                    errors, &errorHandler,
2181                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
2182		    starts, size, &startinpos, &endinpos, &exc, &s,
2183		    (PyObject **)&v, &outpos, &p))
2184		    goto onError;
2185        }
2186#endif
2187	*p++ = x;
2188	nextByte:
2189	;
2190    }
2191    if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
2192	goto onError;
2193    Py_XDECREF(errorHandler);
2194    Py_XDECREF(exc);
2195    return (PyObject *)v;
2196
2197 onError:
2198    Py_XDECREF(v);
2199    Py_XDECREF(errorHandler);
2200    Py_XDECREF(exc);
2201    return NULL;
2202}
2203
2204PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2205					   int size)
2206{
2207    PyObject *repr;
2208    char *p;
2209    char *q;
2210
2211    static const char *hexdigit = "0123456789abcdef";
2212
2213#ifdef Py_UNICODE_WIDE
2214    repr = PyString_FromStringAndSize(NULL, 10 * size);
2215#else
2216    repr = PyString_FromStringAndSize(NULL, 6 * size);
2217#endif
2218    if (repr == NULL)
2219        return NULL;
2220    if (size == 0)
2221	return repr;
2222
2223    p = q = PyString_AS_STRING(repr);
2224    while (size-- > 0) {
2225        Py_UNICODE ch = *s++;
2226#ifdef Py_UNICODE_WIDE
2227	/* Map 32-bit characters to '\Uxxxxxxxx' */
2228	if (ch >= 0x10000) {
2229            *p++ = '\\';
2230            *p++ = 'U';
2231            *p++ = hexdigit[(ch >> 28) & 0xf];
2232            *p++ = hexdigit[(ch >> 24) & 0xf];
2233            *p++ = hexdigit[(ch >> 20) & 0xf];
2234            *p++ = hexdigit[(ch >> 16) & 0xf];
2235            *p++ = hexdigit[(ch >> 12) & 0xf];
2236            *p++ = hexdigit[(ch >> 8) & 0xf];
2237            *p++ = hexdigit[(ch >> 4) & 0xf];
2238            *p++ = hexdigit[ch & 15];
2239        }
2240        else
2241#endif
2242	/* Map 16-bit characters to '\uxxxx' */
2243	if (ch >= 256) {
2244            *p++ = '\\';
2245            *p++ = 'u';
2246            *p++ = hexdigit[(ch >> 12) & 0xf];
2247            *p++ = hexdigit[(ch >> 8) & 0xf];
2248            *p++ = hexdigit[(ch >> 4) & 0xf];
2249            *p++ = hexdigit[ch & 15];
2250        }
2251	/* Copy everything else as-is */
2252	else
2253            *p++ = (char) ch;
2254    }
2255    *p = '\0';
2256    _PyString_Resize(&repr, p - q);
2257    return repr;
2258}
2259
2260PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2261{
2262    if (!PyUnicode_Check(unicode)) {
2263	PyErr_BadArgument();
2264	return NULL;
2265    }
2266    return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2267					    PyUnicode_GET_SIZE(unicode));
2268}
2269
2270/* --- Latin-1 Codec ------------------------------------------------------ */
2271
2272PyObject *PyUnicode_DecodeLatin1(const char *s,
2273				 int size,
2274				 const char *errors)
2275{
2276    PyUnicodeObject *v;
2277    Py_UNICODE *p;
2278
2279    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
2280    if (size == 1) {
2281	Py_UNICODE r = *(unsigned char*)s;
2282	return PyUnicode_FromUnicode(&r, 1);
2283    }
2284
2285    v = _PyUnicode_New(size);
2286    if (v == NULL)
2287	goto onError;
2288    if (size == 0)
2289	return (PyObject *)v;
2290    p = PyUnicode_AS_UNICODE(v);
2291    while (size-- > 0)
2292	*p++ = (unsigned char)*s++;
2293    return (PyObject *)v;
2294
2295 onError:
2296    Py_XDECREF(v);
2297    return NULL;
2298}
2299
2300/* create or adjust a UnicodeEncodeError */
2301static void make_encode_exception(PyObject **exceptionObject,
2302    const char *encoding,
2303    const Py_UNICODE *unicode, int size,
2304    int startpos, int endpos,
2305    const char *reason)
2306{
2307    if (*exceptionObject == NULL) {
2308	*exceptionObject = PyUnicodeEncodeError_Create(
2309	    encoding, unicode, size, startpos, endpos, reason);
2310    }
2311    else {
2312	if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2313	    goto onError;
2314	if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2315	    goto onError;
2316	if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2317	    goto onError;
2318	return;
2319	onError:
2320	Py_DECREF(*exceptionObject);
2321	*exceptionObject = NULL;
2322    }
2323}
2324
2325/* raises a UnicodeEncodeError */
2326static void raise_encode_exception(PyObject **exceptionObject,
2327    const char *encoding,
2328    const Py_UNICODE *unicode, int size,
2329    int startpos, int endpos,
2330    const char *reason)
2331{
2332    make_encode_exception(exceptionObject,
2333	encoding, unicode, size, startpos, endpos, reason);
2334    if (*exceptionObject != NULL)
2335	PyCodec_StrictErrors(*exceptionObject);
2336}
2337
2338/* error handling callback helper:
2339   build arguments, call the callback and check the arguments,
2340   put the result into newpos and return the replacement string, which
2341   has to be freed by the caller */
2342static PyObject *unicode_encode_call_errorhandler(const char *errors,
2343    PyObject **errorHandler,
2344    const char *encoding, const char *reason,
2345    const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
2346    int startpos, int endpos,
2347    int *newpos)
2348{
2349    static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple";
2350
2351    PyObject *restuple;
2352    PyObject *resunicode;
2353
2354    if (*errorHandler == NULL) {
2355	*errorHandler = PyCodec_LookupError(errors);
2356        if (*errorHandler == NULL)
2357	    return NULL;
2358    }
2359
2360    make_encode_exception(exceptionObject,
2361	encoding, unicode, size, startpos, endpos, reason);
2362    if (*exceptionObject == NULL)
2363	return NULL;
2364
2365    restuple = PyObject_CallFunctionObjArgs(
2366	*errorHandler, *exceptionObject, NULL);
2367    if (restuple == NULL)
2368	return NULL;
2369    if (!PyTuple_Check(restuple)) {
2370	PyErr_Format(PyExc_TypeError, &argparse[4]);
2371	Py_DECREF(restuple);
2372	return NULL;
2373    }
2374    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2375	&resunicode, newpos)) {
2376	Py_DECREF(restuple);
2377	return NULL;
2378    }
2379    if (*newpos<0)
2380	*newpos = size+*newpos;
2381    if (*newpos<0 || *newpos>size) {
2382	PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
2383	Py_DECREF(restuple);
2384	return NULL;
2385    }
2386    Py_INCREF(resunicode);
2387    Py_DECREF(restuple);
2388    return resunicode;
2389}
2390
2391static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2392				 int size,
2393				 const char *errors,
2394				 int limit)
2395{
2396    /* output object */
2397    PyObject *res;
2398    /* pointers to the beginning and end+1 of input */
2399    const Py_UNICODE *startp = p;
2400    const Py_UNICODE *endp = p + size;
2401    /* pointer to the beginning of the unencodable characters */
2402    /* const Py_UNICODE *badp = NULL; */
2403    /* pointer into the output */
2404    char *str;
2405    /* current output position */
2406    int respos = 0;
2407    int ressize;
2408    char *encoding = (limit == 256) ? "latin-1" : "ascii";
2409    char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2410    PyObject *errorHandler = NULL;
2411    PyObject *exc = NULL;
2412    /* the following variable is used for caching string comparisons
2413     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2414    int known_errorHandler = -1;
2415
2416    /* allocate enough for a simple encoding without
2417       replacements, if we need more, we'll resize */
2418    res = PyString_FromStringAndSize(NULL, size);
2419    if (res == NULL)
2420        goto onError;
2421    if (size == 0)
2422	return res;
2423    str = PyString_AS_STRING(res);
2424    ressize = size;
2425
2426    while (p<endp) {
2427	Py_UNICODE c = *p;
2428
2429	/* can we encode this? */
2430	if (c<limit) {
2431	    /* no overflow check, because we know that the space is enough */
2432	    *str++ = (char)c;
2433	    ++p;
2434	}
2435	else {
2436	    int unicodepos = p-startp;
2437	    int requiredsize;
2438	    PyObject *repunicode;
2439	    int repsize;
2440	    int newpos;
2441	    int respos;
2442	    Py_UNICODE *uni2;
2443	    /* startpos for collecting unencodable chars */
2444	    const Py_UNICODE *collstart = p;
2445	    const Py_UNICODE *collend = p;
2446	    /* find all unecodable characters */
2447	    while ((collend < endp) && ((*collend)>=limit))
2448		++collend;
2449	    /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2450	    if (known_errorHandler==-1) {
2451		if ((errors==NULL) || (!strcmp(errors, "strict")))
2452		    known_errorHandler = 1;
2453		else if (!strcmp(errors, "replace"))
2454		    known_errorHandler = 2;
2455		else if (!strcmp(errors, "ignore"))
2456		    known_errorHandler = 3;
2457		else if (!strcmp(errors, "xmlcharrefreplace"))
2458		    known_errorHandler = 4;
2459		else
2460		    known_errorHandler = 0;
2461	    }
2462	    switch (known_errorHandler) {
2463		case 1: /* strict */
2464		    raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2465		    goto onError;
2466		case 2: /* replace */
2467		    while (collstart++<collend)
2468			*str++ = '?'; /* fall through */
2469		case 3: /* ignore */
2470		    p = collend;
2471		    break;
2472		case 4: /* xmlcharrefreplace */
2473		    respos = str-PyString_AS_STRING(res);
2474		    /* determine replacement size (temporarily (mis)uses p) */
2475		    for (p = collstart, repsize = 0; p < collend; ++p) {
2476			if (*p<10)
2477			    repsize += 2+1+1;
2478			else if (*p<100)
2479			    repsize += 2+2+1;
2480			else if (*p<1000)
2481			    repsize += 2+3+1;
2482			else if (*p<10000)
2483			    repsize += 2+4+1;
2484#ifndef Py_UNICODE_WIDE
2485			else
2486			    repsize += 2+5+1;
2487#else
2488			else if (*p<100000)
2489			    repsize += 2+5+1;
2490			else if (*p<1000000)
2491			    repsize += 2+6+1;
2492			else
2493			    repsize += 2+7+1;
2494#endif
2495		    }
2496		    requiredsize = respos+repsize+(endp-collend);
2497		    if (requiredsize > ressize) {
2498			if (requiredsize<2*ressize)
2499			    requiredsize = 2*ressize;
2500			if (_PyString_Resize(&res, requiredsize))
2501			    goto onError;
2502			str = PyString_AS_STRING(res) + respos;
2503			ressize = requiredsize;
2504		    }
2505		    /* generate replacement (temporarily (mis)uses p) */
2506		    for (p = collstart; p < collend; ++p) {
2507			str += sprintf(str, "&#%d;", (int)*p);
2508		    }
2509		    p = collend;
2510		    break;
2511		default:
2512		    repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2513			encoding, reason, startp, size, &exc,
2514			collstart-startp, collend-startp, &newpos);
2515		    if (repunicode == NULL)
2516			goto onError;
2517		    /* need more space? (at least enough for what we
2518		       have+the replacement+the rest of the string, so
2519		       we won't have to check space for encodable characters) */
2520		    respos = str-PyString_AS_STRING(res);
2521		    repsize = PyUnicode_GET_SIZE(repunicode);
2522		    requiredsize = respos+repsize+(endp-collend);
2523		    if (requiredsize > ressize) {
2524			if (requiredsize<2*ressize)
2525			    requiredsize = 2*ressize;
2526			if (_PyString_Resize(&res, requiredsize)) {
2527			    Py_DECREF(repunicode);
2528			    goto onError;
2529			}
2530			str = PyString_AS_STRING(res) + respos;
2531			ressize = requiredsize;
2532		    }
2533		    /* check if there is anything unencodable in the replacement
2534		       and copy it to the output */
2535		    for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2536			c = *uni2;
2537			if (c >= limit) {
2538			    raise_encode_exception(&exc, encoding, startp, size,
2539				unicodepos, unicodepos+1, reason);
2540			    Py_DECREF(repunicode);
2541			    goto onError;
2542			}
2543			*str = (char)c;
2544		    }
2545		    p = startp + newpos;
2546		    Py_DECREF(repunicode);
2547	    }
2548	}
2549    }
2550    /* Resize if we allocated to much */
2551    respos = str-PyString_AS_STRING(res);
2552    if (respos<ressize)
2553       /* If this falls res will be NULL */
2554	_PyString_Resize(&res, respos);
2555    Py_XDECREF(errorHandler);
2556    Py_XDECREF(exc);
2557    return res;
2558
2559    onError:
2560    Py_XDECREF(res);
2561    Py_XDECREF(errorHandler);
2562    Py_XDECREF(exc);
2563    return NULL;
2564}
2565
2566PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2567				 int size,
2568				 const char *errors)
2569{
2570    return unicode_encode_ucs1(p, size, errors, 256);
2571}
2572
2573PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2574{
2575    if (!PyUnicode_Check(unicode)) {
2576	PyErr_BadArgument();
2577	return NULL;
2578    }
2579    return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2580				  PyUnicode_GET_SIZE(unicode),
2581				  NULL);
2582}
2583
2584/* --- 7-bit ASCII Codec -------------------------------------------------- */
2585
2586PyObject *PyUnicode_DecodeASCII(const char *s,
2587				int size,
2588				const char *errors)
2589{
2590    const char *starts = s;
2591    PyUnicodeObject *v;
2592    Py_UNICODE *p;
2593    int startinpos;
2594    int endinpos;
2595    int outpos;
2596    const char *e;
2597    PyObject *errorHandler = NULL;
2598    PyObject *exc = NULL;
2599
2600    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
2601    if (size == 1 && *(unsigned char*)s < 128) {
2602	Py_UNICODE r = *(unsigned char*)s;
2603	return PyUnicode_FromUnicode(&r, 1);
2604    }
2605
2606    v = _PyUnicode_New(size);
2607    if (v == NULL)
2608	goto onError;
2609    if (size == 0)
2610	return (PyObject *)v;
2611    p = PyUnicode_AS_UNICODE(v);
2612    e = s + size;
2613    while (s < e) {
2614	register unsigned char c = (unsigned char)*s;
2615	if (c < 128) {
2616	    *p++ = c;
2617	    ++s;
2618	}
2619	else {
2620	    startinpos = s-starts;
2621	    endinpos = startinpos + 1;
2622	    outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
2623	    if (unicode_decode_call_errorhandler(
2624		 errors, &errorHandler,
2625		 "ascii", "ordinal not in range(128)",
2626		 starts, size, &startinpos, &endinpos, &exc, &s,
2627		 (PyObject **)&v, &outpos, &p))
2628		goto onError;
2629	}
2630    }
2631    if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
2632	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
2633	    goto onError;
2634    Py_XDECREF(errorHandler);
2635    Py_XDECREF(exc);
2636    return (PyObject *)v;
2637
2638 onError:
2639    Py_XDECREF(v);
2640    Py_XDECREF(errorHandler);
2641    Py_XDECREF(exc);
2642    return NULL;
2643}
2644
2645PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2646				int size,
2647				const char *errors)
2648{
2649    return unicode_encode_ucs1(p, size, errors, 128);
2650}
2651
2652PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2653{
2654    if (!PyUnicode_Check(unicode)) {
2655	PyErr_BadArgument();
2656	return NULL;
2657    }
2658    return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2659				 PyUnicode_GET_SIZE(unicode),
2660				 NULL);
2661}
2662
2663#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
2664
2665/* --- MBCS codecs for Windows -------------------------------------------- */
2666
2667PyObject *PyUnicode_DecodeMBCS(const char *s,
2668				int size,
2669				const char *errors)
2670{
2671    PyUnicodeObject *v;
2672    Py_UNICODE *p;
2673
2674    /* First get the size of the result */
2675    DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2676    if (size > 0 && usize==0)
2677        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2678
2679    v = _PyUnicode_New(usize);
2680    if (v == NULL)
2681        return NULL;
2682    if (usize == 0)
2683	return (PyObject *)v;
2684    p = PyUnicode_AS_UNICODE(v);
2685    if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2686        Py_DECREF(v);
2687        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2688    }
2689
2690    return (PyObject *)v;
2691}
2692
2693PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2694				int size,
2695				const char *errors)
2696{
2697    PyObject *repr;
2698    char *s;
2699    DWORD mbcssize;
2700
2701    /* If there are no characters, bail now! */
2702    if (size==0)
2703	    return PyString_FromString("");
2704
2705    /* First get the size of the result */
2706    mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
2707    if (mbcssize==0)
2708        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2709
2710    repr = PyString_FromStringAndSize(NULL, mbcssize);
2711    if (repr == NULL)
2712        return NULL;
2713    if (mbcssize == 0)
2714        return repr;
2715
2716    /* Do the conversion */
2717    s = PyString_AS_STRING(repr);
2718    if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2719        Py_DECREF(repr);
2720        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2721    }
2722    return repr;
2723}
2724
2725PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
2726{
2727    if (!PyUnicode_Check(unicode)) {
2728        PyErr_BadArgument();
2729        return NULL;
2730    }
2731    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2732				PyUnicode_GET_SIZE(unicode),
2733				NULL);
2734}
2735
2736#endif /* MS_WINDOWS */
2737
2738/* --- Character Mapping Codec -------------------------------------------- */
2739
2740PyObject *PyUnicode_DecodeCharmap(const char *s,
2741				  int size,
2742				  PyObject *mapping,
2743				  const char *errors)
2744{
2745    const char *starts = s;
2746    int startinpos;
2747    int endinpos;
2748    int outpos;
2749    const char *e;
2750    PyUnicodeObject *v;
2751    Py_UNICODE *p;
2752    int extrachars = 0;
2753    PyObject *errorHandler = NULL;
2754    PyObject *exc = NULL;
2755
2756    /* Default to Latin-1 */
2757    if (mapping == NULL)
2758	return PyUnicode_DecodeLatin1(s, size, errors);
2759
2760    v = _PyUnicode_New(size);
2761    if (v == NULL)
2762	goto onError;
2763    if (size == 0)
2764	return (PyObject *)v;
2765    p = PyUnicode_AS_UNICODE(v);
2766    e = s + size;
2767    while (s < e) {
2768	unsigned char ch = *s;
2769	PyObject *w, *x;
2770
2771	/* Get mapping (char ordinal -> integer, Unicode char or None) */
2772	w = PyInt_FromLong((long)ch);
2773	if (w == NULL)
2774	    goto onError;
2775	x = PyObject_GetItem(mapping, w);
2776	Py_DECREF(w);
2777	if (x == NULL) {
2778	    if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2779		/* No mapping found means: mapping is undefined. */
2780		PyErr_Clear();
2781		x = Py_None;
2782		Py_INCREF(x);
2783	    } else
2784		goto onError;
2785	}
2786
2787	/* Apply mapping */
2788	if (PyInt_Check(x)) {
2789	    long value = PyInt_AS_LONG(x);
2790	    if (value < 0 || value > 65535) {
2791		PyErr_SetString(PyExc_TypeError,
2792				"character mapping must be in range(65536)");
2793		Py_DECREF(x);
2794		goto onError;
2795	    }
2796	    *p++ = (Py_UNICODE)value;
2797	}
2798	else if (x == Py_None) {
2799	    /* undefined mapping */
2800	    outpos = p-PyUnicode_AS_UNICODE(v);
2801	    startinpos = s-starts;
2802	    endinpos = startinpos+1;
2803	    if (unicode_decode_call_errorhandler(
2804		 errors, &errorHandler,
2805		 "charmap", "character maps to <undefined>",
2806		 starts, size, &startinpos, &endinpos, &exc, &s,
2807		 (PyObject **)&v, &outpos, &p)) {
2808		Py_DECREF(x);
2809		goto onError;
2810	    }
2811	    continue;
2812	}
2813	else if (PyUnicode_Check(x)) {
2814	    int targetsize = PyUnicode_GET_SIZE(x);
2815
2816	    if (targetsize == 1)
2817		/* 1-1 mapping */
2818		*p++ = *PyUnicode_AS_UNICODE(x);
2819
2820	    else if (targetsize > 1) {
2821		/* 1-n mapping */
2822		if (targetsize > extrachars) {
2823		    /* resize first */
2824		    int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2825		    int needed = (targetsize - extrachars) + \
2826			         (targetsize << 2);
2827		    extrachars += needed;
2828		    if (_PyUnicode_Resize(&v,
2829					 PyUnicode_GET_SIZE(v) + needed) < 0) {
2830			Py_DECREF(x);
2831			goto onError;
2832		    }
2833		    p = PyUnicode_AS_UNICODE(v) + oldpos;
2834		}
2835		Py_UNICODE_COPY(p,
2836				PyUnicode_AS_UNICODE(x),
2837				targetsize);
2838		p += targetsize;
2839		extrachars -= targetsize;
2840	    }
2841	    /* 1-0 mapping: skip the character */
2842	}
2843	else {
2844	    /* wrong return value */
2845	    PyErr_SetString(PyExc_TypeError,
2846		  "character mapping must return integer, None or unicode");
2847	    Py_DECREF(x);
2848	    goto onError;
2849	}
2850	Py_DECREF(x);
2851	++s;
2852    }
2853    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2854	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
2855	    goto onError;
2856    Py_XDECREF(errorHandler);
2857    Py_XDECREF(exc);
2858    return (PyObject *)v;
2859
2860 onError:
2861    Py_XDECREF(errorHandler);
2862    Py_XDECREF(exc);
2863    Py_XDECREF(v);
2864    return NULL;
2865}
2866
2867/* Lookup the character ch in the mapping. If the character
2868   can't be found, Py_None is returned (or NULL, if another
2869   error occured). */
2870static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
2871{
2872    PyObject *w = PyInt_FromLong((long)c);
2873    PyObject *x;
2874
2875    if (w == NULL)
2876	 return NULL;
2877    x = PyObject_GetItem(mapping, w);
2878    Py_DECREF(w);
2879    if (x == NULL) {
2880	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2881	    /* No mapping found means: mapping is undefined. */
2882	    PyErr_Clear();
2883	    x = Py_None;
2884	    Py_INCREF(x);
2885	    return x;
2886	} else
2887	    return NULL;
2888    }
2889    else if (x == Py_None)
2890	return x;
2891    else if (PyInt_Check(x)) {
2892	long value = PyInt_AS_LONG(x);
2893	if (value < 0 || value > 255) {
2894	    PyErr_SetString(PyExc_TypeError,
2895			     "character mapping must be in range(256)");
2896	    Py_DECREF(x);
2897	    return NULL;
2898	}
2899	return x;
2900    }
2901    else if (PyString_Check(x))
2902	return x;
2903    else {
2904	/* wrong return value */
2905	PyErr_SetString(PyExc_TypeError,
2906	      "character mapping must return integer, None or str");
2907	Py_DECREF(x);
2908	return NULL;
2909    }
2910}
2911
2912/* lookup the character, put the result in the output string and adjust
2913   various state variables. Reallocate the output string if not enough
2914   space is available. Return a new reference to the object that
2915   was put in the output buffer, or Py_None, if the mapping was undefined
2916   (in which case no character was written) or NULL, if a
2917   reallocation error ocurred. The called must decref the result */
2918static
2919PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
2920    PyObject **outobj, int *outpos)
2921{
2922    PyObject *rep = charmapencode_lookup(c, mapping);
2923
2924    if (rep==NULL)
2925	return NULL;
2926    else if (rep==Py_None)
2927	return rep;
2928    else {
2929	char *outstart = PyString_AS_STRING(*outobj);
2930	int outsize = PyString_GET_SIZE(*outobj);
2931	if (PyInt_Check(rep)) {
2932	    int requiredsize = *outpos+1;
2933	    if (outsize<requiredsize) {
2934		/* exponentially overallocate to minimize reallocations */
2935		if (requiredsize < 2*outsize)
2936		    requiredsize = 2*outsize;
2937		if (_PyString_Resize(outobj, requiredsize)) {
2938		    Py_DECREF(rep);
2939		    return NULL;
2940		}
2941		outstart = PyString_AS_STRING(*outobj);
2942	    }
2943	    outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
2944	}
2945	else {
2946	    const char *repchars = PyString_AS_STRING(rep);
2947	    int repsize = PyString_GET_SIZE(rep);
2948	    int requiredsize = *outpos+repsize;
2949	    if (outsize<requiredsize) {
2950		/* exponentially overallocate to minimize reallocations */
2951		if (requiredsize < 2*outsize)
2952		    requiredsize = 2*outsize;
2953		if (_PyString_Resize(outobj, requiredsize)) {
2954		    Py_DECREF(rep);
2955		    return NULL;
2956		}
2957		outstart = PyString_AS_STRING(*outobj);
2958	    }
2959	    memcpy(outstart + *outpos, repchars, repsize);
2960	    *outpos += repsize;
2961	}
2962    }
2963    return rep;
2964}
2965
2966/* handle an error in PyUnicode_EncodeCharmap
2967   Return 0 on success, -1 on error */
2968static
2969int charmap_encoding_error(
2970    const Py_UNICODE *p, int size, int *inpos, PyObject *mapping,
2971    PyObject **exceptionObject,
2972    int *known_errorHandler, PyObject **errorHandler, const char *errors,
2973    PyObject **res, int *respos)
2974{
2975    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
2976    int repsize;
2977    int newpos;
2978    Py_UNICODE *uni2;
2979    /* startpos for collecting unencodable chars */
2980    int collstartpos = *inpos;
2981    int collendpos = *inpos+1;
2982    int collpos;
2983    char *encoding = "charmap";
2984    char *reason = "character maps to <undefined>";
2985
2986    PyObject *x;
2987    /* find all unencodable characters */
2988    while (collendpos < size) {
2989	x = charmapencode_lookup(p[collendpos], mapping);
2990	if (x==NULL)
2991	    return -1;
2992	else if (x!=Py_None) {
2993	    Py_DECREF(x);
2994	    break;
2995	}
2996	Py_DECREF(x);
2997	++collendpos;
2998    }
2999    /* cache callback name lookup
3000     * (if not done yet, i.e. it's the first error) */
3001    if (*known_errorHandler==-1) {
3002	if ((errors==NULL) || (!strcmp(errors, "strict")))
3003	    *known_errorHandler = 1;
3004	else if (!strcmp(errors, "replace"))
3005	    *known_errorHandler = 2;
3006	else if (!strcmp(errors, "ignore"))
3007	    *known_errorHandler = 3;
3008	else if (!strcmp(errors, "xmlcharrefreplace"))
3009	    *known_errorHandler = 4;
3010	else
3011	    *known_errorHandler = 0;
3012    }
3013    switch (*known_errorHandler) {
3014	case 1: /* strict */
3015	    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3016	    return -1;
3017	case 2: /* replace */
3018	    for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3019		x = charmapencode_output('?', mapping, res, respos);
3020		if (x==NULL) {
3021		    return -1;
3022		}
3023		else if (x==Py_None) {
3024		    Py_DECREF(x);
3025		    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3026		    return -1;
3027		}
3028		Py_DECREF(x);
3029	    }
3030	    /* fall through */
3031	case 3: /* ignore */
3032	    *inpos = collendpos;
3033	    break;
3034	case 4: /* xmlcharrefreplace */
3035	    /* generate replacement (temporarily (mis)uses p) */
3036	    for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3037		char buffer[2+29+1+1];
3038		char *cp;
3039		sprintf(buffer, "&#%d;", (int)p[collpos]);
3040		for (cp = buffer; *cp; ++cp) {
3041		    x = charmapencode_output(*cp, mapping, res, respos);
3042		    if (x==NULL)
3043			return -1;
3044		    else if (x==Py_None) {
3045			Py_DECREF(x);
3046			raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3047			return -1;
3048		    }
3049		    Py_DECREF(x);
3050		}
3051	    }
3052	    *inpos = collendpos;
3053	    break;
3054	default:
3055	    repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
3056		encoding, reason, p, size, exceptionObject,
3057		collstartpos, collendpos, &newpos);
3058	    if (repunicode == NULL)
3059		return -1;
3060	    /* generate replacement  */
3061	    repsize = PyUnicode_GET_SIZE(repunicode);
3062	    for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3063		x = charmapencode_output(*uni2, mapping, res, respos);
3064		if (x==NULL) {
3065		    Py_DECREF(repunicode);
3066		    return -1;
3067		}
3068		else if (x==Py_None) {
3069		    Py_DECREF(repunicode);
3070		    Py_DECREF(x);
3071		    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3072		    return -1;
3073		}
3074		Py_DECREF(x);
3075	    }
3076	    *inpos = newpos;
3077	    Py_DECREF(repunicode);
3078    }
3079    return 0;
3080}
3081
3082PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
3083				  int size,
3084				  PyObject *mapping,
3085				  const char *errors)
3086{
3087    /* output object */
3088    PyObject *res = NULL;
3089    /* current input position */
3090    int inpos = 0;
3091    /* current output position */
3092    int respos = 0;
3093    PyObject *errorHandler = NULL;
3094    PyObject *exc = NULL;
3095    /* the following variable is used for caching string comparisons
3096     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3097     * 3=ignore, 4=xmlcharrefreplace */
3098    int known_errorHandler = -1;
3099
3100    /* Default to Latin-1 */
3101    if (mapping == NULL)
3102	return PyUnicode_EncodeLatin1(p, size, errors);
3103
3104    /* allocate enough for a simple encoding without
3105       replacements, if we need more, we'll resize */
3106    res = PyString_FromStringAndSize(NULL, size);
3107    if (res == NULL)
3108        goto onError;
3109    if (size == 0)
3110	return res;
3111
3112    while (inpos<size) {
3113	/* try to encode it */
3114	PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
3115	if (x==NULL) /* error */
3116	    goto onError;
3117	if (x==Py_None) { /* unencodable character */
3118	    if (charmap_encoding_error(p, size, &inpos, mapping,
3119		&exc,
3120		&known_errorHandler, &errorHandler, errors,
3121		&res, &respos)) {
3122		Py_DECREF(x);
3123		goto onError;
3124	    }
3125	}
3126	else
3127	    /* done with this character => adjust input position */
3128	    ++inpos;
3129	Py_DECREF(x);
3130    }
3131
3132    /* Resize if we allocated to much */
3133    if (respos<PyString_GET_SIZE(res)) {
3134	if (_PyString_Resize(&res, respos))
3135	    goto onError;
3136    }
3137    Py_XDECREF(exc);
3138    Py_XDECREF(errorHandler);
3139    return res;
3140
3141    onError:
3142    Py_XDECREF(res);
3143    Py_XDECREF(exc);
3144    Py_XDECREF(errorHandler);
3145    return NULL;
3146}
3147
3148PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3149				    PyObject *mapping)
3150{
3151    if (!PyUnicode_Check(unicode) || mapping == NULL) {
3152	PyErr_BadArgument();
3153	return NULL;
3154    }
3155    return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3156				   PyUnicode_GET_SIZE(unicode),
3157				   mapping,
3158				   NULL);
3159}
3160
3161/* create or adjust a UnicodeTranslateError */
3162static void make_translate_exception(PyObject **exceptionObject,
3163    const Py_UNICODE *unicode, int size,
3164    int startpos, int endpos,
3165    const char *reason)
3166{
3167    if (*exceptionObject == NULL) {
3168    	*exceptionObject = PyUnicodeTranslateError_Create(
3169	    unicode, size, startpos, endpos, reason);
3170    }
3171    else {
3172	if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3173	    goto onError;
3174	if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3175	    goto onError;
3176	if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3177	    goto onError;
3178	return;
3179	onError:
3180	Py_DECREF(*exceptionObject);
3181	*exceptionObject = NULL;
3182    }
3183}
3184
3185/* raises a UnicodeTranslateError */
3186static void raise_translate_exception(PyObject **exceptionObject,
3187    const Py_UNICODE *unicode, int size,
3188    int startpos, int endpos,
3189    const char *reason)
3190{
3191    make_translate_exception(exceptionObject,
3192	unicode, size, startpos, endpos, reason);
3193    if (*exceptionObject != NULL)
3194	PyCodec_StrictErrors(*exceptionObject);
3195}
3196
3197/* error handling callback helper:
3198   build arguments, call the callback and check the arguments,
3199   put the result into newpos and return the replacement string, which
3200   has to be freed by the caller */
3201static PyObject *unicode_translate_call_errorhandler(const char *errors,
3202    PyObject **errorHandler,
3203    const char *reason,
3204    const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
3205    int startpos, int endpos,
3206    int *newpos)
3207{
3208    static char *argparse = "O!i;translating error handler must return (unicode, int) tuple";
3209
3210    PyObject *restuple;
3211    PyObject *resunicode;
3212
3213    if (*errorHandler == NULL) {
3214	*errorHandler = PyCodec_LookupError(errors);
3215        if (*errorHandler == NULL)
3216	    return NULL;
3217    }
3218
3219    make_translate_exception(exceptionObject,
3220	unicode, size, startpos, endpos, reason);
3221    if (*exceptionObject == NULL)
3222	return NULL;
3223
3224    restuple = PyObject_CallFunctionObjArgs(
3225	*errorHandler, *exceptionObject, NULL);
3226    if (restuple == NULL)
3227	return NULL;
3228    if (!PyTuple_Check(restuple)) {
3229	PyErr_Format(PyExc_TypeError, &argparse[4]);
3230	Py_DECREF(restuple);
3231	return NULL;
3232    }
3233    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3234	&resunicode, newpos)) {
3235	Py_DECREF(restuple);
3236	return NULL;
3237    }
3238    if (*newpos<0)
3239	*newpos = size+*newpos;
3240    if (*newpos<0 || *newpos>size) {
3241	PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
3242	Py_DECREF(restuple);
3243	return NULL;
3244    }
3245    Py_INCREF(resunicode);
3246    Py_DECREF(restuple);
3247    return resunicode;
3248}
3249
3250/* Lookup the character ch in the mapping and put the result in result,
3251   which must be decrefed by the caller.
3252   Return 0 on success, -1 on error */
3253static
3254int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3255{
3256    PyObject *w = PyInt_FromLong((long)c);
3257    PyObject *x;
3258
3259    if (w == NULL)
3260	 return -1;
3261    x = PyObject_GetItem(mapping, w);
3262    Py_DECREF(w);
3263    if (x == NULL) {
3264	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3265	    /* No mapping found means: use 1:1 mapping. */
3266	    PyErr_Clear();
3267	    *result = NULL;
3268	    return 0;
3269	} else
3270	    return -1;
3271    }
3272    else if (x == Py_None) {
3273	*result = x;
3274	return 0;
3275    }
3276    else if (PyInt_Check(x)) {
3277	long value = PyInt_AS_LONG(x);
3278	long max = PyUnicode_GetMax();
3279	if (value < 0 || value > max) {
3280	    PyErr_Format(PyExc_TypeError,
3281			     "character mapping must be in range(0x%lx)", max+1);
3282	    Py_DECREF(x);
3283	    return -1;
3284	}
3285	*result = x;
3286	return 0;
3287    }
3288    else if (PyUnicode_Check(x)) {
3289	*result = x;
3290	return 0;
3291    }
3292    else {
3293	/* wrong return value */
3294	PyErr_SetString(PyExc_TypeError,
3295	      "character mapping must return integer, None or unicode");
3296	Py_DECREF(x);
3297	return -1;
3298    }
3299}
3300/* ensure that *outobj is at least requiredsize characters long,
3301if not reallocate and adjust various state variables.
3302Return 0 on success, -1 on error */
3303static
3304int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
3305    int requiredsize)
3306{
3307    int oldsize = PyUnicode_GET_SIZE(*outobj);
3308    if (requiredsize > oldsize) {
3309	/* remember old output position */
3310	int outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
3311	/* exponentially overallocate to minimize reallocations */
3312	if (requiredsize < 2 * oldsize)
3313	    requiredsize = 2 * oldsize;
3314	if (_PyUnicode_Resize(outobj, requiredsize) < 0)
3315	    return -1;
3316	*outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
3317    }
3318    return 0;
3319}
3320/* lookup the character, put the result in the output string and adjust
3321   various state variables. Return a new reference to the object that
3322   was put in the output buffer in *result, or Py_None, if the mapping was
3323   undefined (in which case no character was written).
3324   The called must decref result.
3325   Return 0 on success, -1 on error. */
3326static
3327int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
3328    int insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
3329    PyObject **res)
3330{
3331    if (charmaptranslate_lookup(*curinp, mapping, res))
3332	return -1;
3333    if (*res==NULL) {
3334	/* not found => default to 1:1 mapping */
3335	*(*outp)++ = *curinp;
3336    }
3337    else if (*res==Py_None)
3338	;
3339    else if (PyInt_Check(*res)) {
3340	/* no overflow check, because we know that the space is enough */
3341	*(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3342    }
3343    else if (PyUnicode_Check(*res)) {
3344	int repsize = PyUnicode_GET_SIZE(*res);
3345	if (repsize==1) {
3346	    /* no overflow check, because we know that the space is enough */
3347	    *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3348	}
3349	else if (repsize!=0) {
3350	    /* more than one character */
3351	    int requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
3352		(insize - (curinp-startinp)) +
3353		repsize - 1;
3354	    if (charmaptranslate_makespace(outobj, outp, requiredsize))
3355		return -1;
3356	    memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3357	    *outp += repsize;
3358	}
3359    }
3360    else
3361	return -1;
3362    return 0;
3363}
3364
3365PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
3366				     int size,
3367				     PyObject *mapping,
3368				     const char *errors)
3369{
3370    /* output object */
3371    PyObject *res = NULL;
3372    /* pointers to the beginning and end+1 of input */
3373    const Py_UNICODE *startp = p;
3374    const Py_UNICODE *endp = p + size;
3375    /* pointer into the output */
3376    Py_UNICODE *str;
3377    /* current output position */
3378    int respos = 0;
3379    char *reason = "character maps to <undefined>";
3380    PyObject *errorHandler = NULL;
3381    PyObject *exc = NULL;
3382    /* the following variable is used for caching string comparisons
3383     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3384     * 3=ignore, 4=xmlcharrefreplace */
3385    int known_errorHandler = -1;
3386
3387    if (mapping == NULL) {
3388	PyErr_BadArgument();
3389	return NULL;
3390    }
3391
3392    /* allocate enough for a simple 1:1 translation without
3393       replacements, if we need more, we'll resize */
3394    res = PyUnicode_FromUnicode(NULL, size);
3395    if (res == NULL)
3396	goto onError;
3397    if (size == 0)
3398	return res;
3399    str = PyUnicode_AS_UNICODE(res);
3400
3401    while (p<endp) {
3402	/* try to encode it */
3403	PyObject *x = NULL;
3404	if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
3405	    Py_XDECREF(x);
3406	    goto onError;
3407	}
3408	Py_XDECREF(x);
3409	if (x!=Py_None) /* it worked => adjust input pointer */
3410	    ++p;
3411	else { /* untranslatable character */
3412	    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3413	    int repsize;
3414	    int newpos;
3415	    Py_UNICODE *uni2;
3416	    /* startpos for collecting untranslatable chars */
3417	    const Py_UNICODE *collstart = p;
3418	    const Py_UNICODE *collend = p+1;
3419	    const Py_UNICODE *coll;
3420
3421	    /* find all untranslatable characters */
3422	    while (collend < endp) {
3423		if (charmaptranslate_lookup(*collend, mapping, &x))
3424		    goto onError;
3425		Py_XDECREF(x);
3426		if (x!=Py_None)
3427		    break;
3428		++collend;
3429	    }
3430	    /* cache callback name lookup
3431	     * (if not done yet, i.e. it's the first error) */
3432	    if (known_errorHandler==-1) {
3433		if ((errors==NULL) || (!strcmp(errors, "strict")))
3434		    known_errorHandler = 1;
3435		else if (!strcmp(errors, "replace"))
3436		    known_errorHandler = 2;
3437		else if (!strcmp(errors, "ignore"))
3438		    known_errorHandler = 3;
3439		else if (!strcmp(errors, "xmlcharrefreplace"))
3440		    known_errorHandler = 4;
3441		else
3442		    known_errorHandler = 0;
3443	    }
3444	    switch (known_errorHandler) {
3445		case 1: /* strict */
3446		    raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3447		    goto onError;
3448		case 2: /* replace */
3449		    /* No need to check for space, this is a 1:1 replacement */
3450		    for (coll = collstart; coll<collend; ++coll)
3451			*str++ = '?';
3452		    /* fall through */
3453		case 3: /* ignore */
3454		    p = collend;
3455		    break;
3456		case 4: /* xmlcharrefreplace */
3457		    /* generate replacement (temporarily (mis)uses p) */
3458		    for (p = collstart; p < collend; ++p) {
3459			char buffer[2+29+1+1];
3460			char *cp;
3461			sprintf(buffer, "&#%d;", (int)*p);
3462			if (charmaptranslate_makespace(&res, &str,
3463			    (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3464			    goto onError;
3465			for (cp = buffer; *cp; ++cp)
3466			    *str++ = *cp;
3467		    }
3468		    p = collend;
3469		    break;
3470		default:
3471		    repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3472			reason, startp, size, &exc,
3473			collstart-startp, collend-startp, &newpos);
3474		    if (repunicode == NULL)
3475			goto onError;
3476		    /* generate replacement  */
3477		    repsize = PyUnicode_GET_SIZE(repunicode);
3478		    if (charmaptranslate_makespace(&res, &str,
3479			(str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3480			Py_DECREF(repunicode);
3481			goto onError;
3482		    }
3483		    for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3484			*str++ = *uni2;
3485		    p = startp + newpos;
3486		    Py_DECREF(repunicode);
3487	    }
3488	}
3489    }
3490    /* Resize if we allocated to much */
3491    respos = str-PyUnicode_AS_UNICODE(res);
3492    if (respos<PyUnicode_GET_SIZE(res)) {
3493	if (_PyUnicode_Resize(&res, respos) < 0)
3494	    goto onError;
3495    }
3496    Py_XDECREF(exc);
3497    Py_XDECREF(errorHandler);
3498    return res;
3499
3500    onError:
3501    Py_XDECREF(res);
3502    Py_XDECREF(exc);
3503    Py_XDECREF(errorHandler);
3504    return NULL;
3505}
3506
3507PyObject *PyUnicode_Translate(PyObject *str,
3508			      PyObject *mapping,
3509			      const char *errors)
3510{
3511    PyObject *result;
3512
3513    str = PyUnicode_FromObject(str);
3514    if (str == NULL)
3515	goto onError;
3516    result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3517					PyUnicode_GET_SIZE(str),
3518					mapping,
3519					errors);
3520    Py_DECREF(str);
3521    return result;
3522
3523 onError:
3524    Py_XDECREF(str);
3525    return NULL;
3526}
3527
3528/* --- Decimal Encoder ---------------------------------------------------- */
3529
3530int PyUnicode_EncodeDecimal(Py_UNICODE *s,
3531			    int length,
3532			    char *output,
3533			    const char *errors)
3534{
3535    Py_UNICODE *p, *end;
3536    PyObject *errorHandler = NULL;
3537    PyObject *exc = NULL;
3538    const char *encoding = "decimal";
3539    const char *reason = "invalid decimal Unicode string";
3540    /* the following variable is used for caching string comparisons
3541     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3542    int known_errorHandler = -1;
3543
3544    if (output == NULL) {
3545	PyErr_BadArgument();
3546	return -1;
3547    }
3548
3549    p = s;
3550    end = s + length;
3551    while (p < end) {
3552	register Py_UNICODE ch = *p;
3553	int decimal;
3554	PyObject *repunicode;
3555	int repsize;
3556	int newpos;
3557	Py_UNICODE *uni2;
3558	Py_UNICODE *collstart;
3559	Py_UNICODE *collend;
3560
3561	if (Py_UNICODE_ISSPACE(ch)) {
3562	    *output++ = ' ';
3563	    ++p;
3564	    continue;
3565	}
3566	decimal = Py_UNICODE_TODECIMAL(ch);
3567	if (decimal >= 0) {
3568	    *output++ = '0' + decimal;
3569	    ++p;
3570	    continue;
3571	}
3572	if (0 < ch && ch < 256) {
3573	    *output++ = (char)ch;
3574	    ++p;
3575	    continue;
3576	}
3577	/* All other characters are considered unencodable */
3578	collstart = p;
3579	collend = p+1;
3580	while (collend < end) {
3581	    if ((0 < *collend && *collend < 256) ||
3582	        !Py_UNICODE_ISSPACE(*collend) ||
3583	        Py_UNICODE_TODECIMAL(*collend))
3584		break;
3585	}
3586	/* cache callback name lookup
3587	 * (if not done yet, i.e. it's the first error) */
3588	if (known_errorHandler==-1) {
3589	    if ((errors==NULL) || (!strcmp(errors, "strict")))
3590		known_errorHandler = 1;
3591	    else if (!strcmp(errors, "replace"))
3592		known_errorHandler = 2;
3593	    else if (!strcmp(errors, "ignore"))
3594		known_errorHandler = 3;
3595	    else if (!strcmp(errors, "xmlcharrefreplace"))
3596		known_errorHandler = 4;
3597	    else
3598		known_errorHandler = 0;
3599	}
3600	switch (known_errorHandler) {
3601	    case 1: /* strict */
3602		raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3603		goto onError;
3604	    case 2: /* replace */
3605		for (p = collstart; p < collend; ++p)
3606		    *output++ = '?';
3607		/* fall through */
3608	    case 3: /* ignore */
3609		p = collend;
3610		break;
3611	    case 4: /* xmlcharrefreplace */
3612		/* generate replacement (temporarily (mis)uses p) */
3613		for (p = collstart; p < collend; ++p)
3614		    output += sprintf(output, "&#%d;", (int)*p);
3615		p = collend;
3616		break;
3617	    default:
3618		repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3619		    encoding, reason, s, length, &exc,
3620		    collstart-s, collend-s, &newpos);
3621		if (repunicode == NULL)
3622		    goto onError;
3623		/* generate replacement  */
3624		repsize = PyUnicode_GET_SIZE(repunicode);
3625		for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3626		    Py_UNICODE ch = *uni2;
3627		    if (Py_UNICODE_ISSPACE(ch))
3628			*output++ = ' ';
3629		    else {
3630			decimal = Py_UNICODE_TODECIMAL(ch);
3631			if (decimal >= 0)
3632			    *output++ = '0' + decimal;
3633			else if (0 < ch && ch < 256)
3634			    *output++ = (char)ch;
3635			else {
3636			    Py_DECREF(repunicode);
3637			    raise_encode_exception(&exc, encoding,
3638				s, length, collstart-s, collend-s, reason);
3639			    goto onError;
3640			}
3641		    }
3642		}
3643		p = s + newpos;
3644		Py_DECREF(repunicode);
3645	}
3646    }
3647    /* 0-terminate the output string */
3648    *output++ = '\0';
3649    Py_XDECREF(exc);
3650    Py_XDECREF(errorHandler);
3651    return 0;
3652
3653 onError:
3654    Py_XDECREF(exc);
3655    Py_XDECREF(errorHandler);
3656    return -1;
3657}
3658
3659/* --- Helpers ------------------------------------------------------------ */
3660
3661static
3662int count(PyUnicodeObject *self,
3663	  int start,
3664	  int end,
3665	  PyUnicodeObject *substring)
3666{
3667    int count = 0;
3668
3669    if (start < 0)
3670        start += self->length;
3671    if (start < 0)
3672        start = 0;
3673    if (end > self->length)
3674        end = self->length;
3675    if (end < 0)
3676        end += self->length;
3677    if (end < 0)
3678        end = 0;
3679
3680    if (substring->length == 0)
3681	return (end - start + 1);
3682
3683    end -= substring->length;
3684
3685    while (start <= end)
3686        if (Py_UNICODE_MATCH(self, start, substring)) {
3687            count++;
3688            start += substring->length;
3689        } else
3690            start++;
3691
3692    return count;
3693}
3694
3695int PyUnicode_Count(PyObject *str,
3696		    PyObject *substr,
3697		    int start,
3698		    int end)
3699{
3700    int result;
3701
3702    str = PyUnicode_FromObject(str);
3703    if (str == NULL)
3704	return -1;
3705    substr = PyUnicode_FromObject(substr);
3706    if (substr == NULL) {
3707	Py_DECREF(str);
3708	return -1;
3709    }
3710
3711    result = count((PyUnicodeObject *)str,
3712		   start, end,
3713		   (PyUnicodeObject *)substr);
3714
3715    Py_DECREF(str);
3716    Py_DECREF(substr);
3717    return result;
3718}
3719
3720static
3721int findstring(PyUnicodeObject *self,
3722	       PyUnicodeObject *substring,
3723	       int start,
3724	       int end,
3725	       int direction)
3726{
3727    if (start < 0)
3728        start += self->length;
3729    if (start < 0)
3730        start = 0;
3731
3732    if (end > self->length)
3733        end = self->length;
3734    if (end < 0)
3735        end += self->length;
3736    if (end < 0)
3737        end = 0;
3738
3739    if (substring->length == 0)
3740	return (direction > 0) ? start : end;
3741
3742    end -= substring->length;
3743
3744    if (direction < 0) {
3745        for (; end >= start; end--)
3746            if (Py_UNICODE_MATCH(self, end, substring))
3747                return end;
3748    } else {
3749        for (; start <= end; start++)
3750            if (Py_UNICODE_MATCH(self, start, substring))
3751                return start;
3752    }
3753
3754    return -1;
3755}
3756
3757int PyUnicode_Find(PyObject *str,
3758		   PyObject *substr,
3759		   int start,
3760		   int end,
3761		   int direction)
3762{
3763    int result;
3764
3765    str = PyUnicode_FromObject(str);
3766    if (str == NULL)
3767	return -2;
3768    substr = PyUnicode_FromObject(substr);
3769    if (substr == NULL) {
3770	Py_DECREF(str);
3771	return -2;
3772    }
3773
3774    result = findstring((PyUnicodeObject *)str,
3775			(PyUnicodeObject *)substr,
3776			start, end, direction);
3777    Py_DECREF(str);
3778    Py_DECREF(substr);
3779    return result;
3780}
3781
3782static
3783int tailmatch(PyUnicodeObject *self,
3784	      PyUnicodeObject *substring,
3785	      int start,
3786	      int end,
3787	      int direction)
3788{
3789    if (start < 0)
3790        start += self->length;
3791    if (start < 0)
3792        start = 0;
3793
3794    if (substring->length == 0)
3795        return 1;
3796
3797    if (end > self->length)
3798        end = self->length;
3799    if (end < 0)
3800        end += self->length;
3801    if (end < 0)
3802        end = 0;
3803
3804    end -= substring->length;
3805    if (end < start)
3806	return 0;
3807
3808    if (direction > 0) {
3809	if (Py_UNICODE_MATCH(self, end, substring))
3810	    return 1;
3811    } else {
3812        if (Py_UNICODE_MATCH(self, start, substring))
3813	    return 1;
3814    }
3815
3816    return 0;
3817}
3818
3819int PyUnicode_Tailmatch(PyObject *str,
3820			PyObject *substr,
3821			int start,
3822			int end,
3823			int direction)
3824{
3825    int result;
3826
3827    str = PyUnicode_FromObject(str);
3828    if (str == NULL)
3829	return -1;
3830    substr = PyUnicode_FromObject(substr);
3831    if (substr == NULL) {
3832	Py_DECREF(substr);
3833	return -1;
3834    }
3835
3836    result = tailmatch((PyUnicodeObject *)str,
3837		       (PyUnicodeObject *)substr,
3838		       start, end, direction);
3839    Py_DECREF(str);
3840    Py_DECREF(substr);
3841    return result;
3842}
3843
3844static
3845const Py_UNICODE *findchar(const Py_UNICODE *s,
3846		     int size,
3847		     Py_UNICODE ch)
3848{
3849    /* like wcschr, but doesn't stop at NULL characters */
3850
3851    while (size-- > 0) {
3852        if (*s == ch)
3853            return s;
3854        s++;
3855    }
3856
3857    return NULL;
3858}
3859
3860/* Apply fixfct filter to the Unicode object self and return a
3861   reference to the modified object */
3862
3863static
3864PyObject *fixup(PyUnicodeObject *self,
3865		int (*fixfct)(PyUnicodeObject *s))
3866{
3867
3868    PyUnicodeObject *u;
3869
3870    u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
3871    if (u == NULL)
3872	return NULL;
3873
3874    Py_UNICODE_COPY(u->str, self->str, self->length);
3875
3876    if (!fixfct(u) && PyUnicode_CheckExact(self)) {
3877	/* fixfct should return TRUE if it modified the buffer. If
3878	   FALSE, return a reference to the original buffer instead
3879	   (to save space, not time) */
3880	Py_INCREF(self);
3881	Py_DECREF(u);
3882	return (PyObject*) self;
3883    }
3884    return (PyObject*) u;
3885}
3886
3887static
3888int fixupper(PyUnicodeObject *self)
3889{
3890    int len = self->length;
3891    Py_UNICODE *s = self->str;
3892    int status = 0;
3893
3894    while (len-- > 0) {
3895	register Py_UNICODE ch;
3896
3897	ch = Py_UNICODE_TOUPPER(*s);
3898	if (ch != *s) {
3899            status = 1;
3900	    *s = ch;
3901	}
3902        s++;
3903    }
3904
3905    return status;
3906}
3907
3908static
3909int fixlower(PyUnicodeObject *self)
3910{
3911    int len = self->length;
3912    Py_UNICODE *s = self->str;
3913    int status = 0;
3914
3915    while (len-- > 0) {
3916	register Py_UNICODE ch;
3917
3918	ch = Py_UNICODE_TOLOWER(*s);
3919	if (ch != *s) {
3920            status = 1;
3921	    *s = ch;
3922	}
3923        s++;
3924    }
3925
3926    return status;
3927}
3928
3929static
3930int fixswapcase(PyUnicodeObject *self)
3931{
3932    int len = self->length;
3933    Py_UNICODE *s = self->str;
3934    int status = 0;
3935
3936    while (len-- > 0) {
3937        if (Py_UNICODE_ISUPPER(*s)) {
3938            *s = Py_UNICODE_TOLOWER(*s);
3939            status = 1;
3940        } else if (Py_UNICODE_ISLOWER(*s)) {
3941            *s = Py_UNICODE_TOUPPER(*s);
3942            status = 1;
3943        }
3944        s++;
3945    }
3946
3947    return status;
3948}
3949
3950static
3951int fixcapitalize(PyUnicodeObject *self)
3952{
3953    int len = self->length;
3954    Py_UNICODE *s = self->str;
3955    int status = 0;
3956
3957    if (len == 0)
3958	return 0;
3959    if (Py_UNICODE_ISLOWER(*s)) {
3960	*s = Py_UNICODE_TOUPPER(*s);
3961	status = 1;
3962    }
3963    s++;
3964    while (--len > 0) {
3965        if (Py_UNICODE_ISUPPER(*s)) {
3966            *s = Py_UNICODE_TOLOWER(*s);
3967            status = 1;
3968        }
3969        s++;
3970    }
3971    return status;
3972}
3973
3974static
3975int fixtitle(PyUnicodeObject *self)
3976{
3977    register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3978    register Py_UNICODE *e;
3979    int previous_is_cased;
3980
3981    /* Shortcut for single character strings */
3982    if (PyUnicode_GET_SIZE(self) == 1) {
3983	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3984	if (*p != ch) {
3985	    *p = ch;
3986	    return 1;
3987	}
3988	else
3989	    return 0;
3990    }
3991
3992    e = p + PyUnicode_GET_SIZE(self);
3993    previous_is_cased = 0;
3994    for (; p < e; p++) {
3995	register const Py_UNICODE ch = *p;
3996
3997	if (previous_is_cased)
3998	    *p = Py_UNICODE_TOLOWER(ch);
3999	else
4000	    *p = Py_UNICODE_TOTITLE(ch);
4001
4002	if (Py_UNICODE_ISLOWER(ch) ||
4003	    Py_UNICODE_ISUPPER(ch) ||
4004	    Py_UNICODE_ISTITLE(ch))
4005	    previous_is_cased = 1;
4006	else
4007	    previous_is_cased = 0;
4008    }
4009    return 1;
4010}
4011
4012PyObject *
4013PyUnicode_Join(PyObject *separator, PyObject *seq)
4014{
4015    PyObject *internal_separator = NULL;
4016    const Py_UNICODE *sep;
4017    size_t seplen;
4018    PyUnicodeObject *res = NULL; /* the result */
4019    size_t res_alloc = 100;  /* # allocated bytes for string in res */
4020    size_t res_used;         /* # used bytes */
4021    Py_UNICODE *res_p;       /* pointer to free byte in res's string area */
4022    PyObject *fseq;          /* PySequence_Fast(seq) */
4023    int seqlen;              /* len(fseq) -- number of items in sequence */
4024    const Py_UNICODE blank = ' ';
4025    PyObject *item;
4026    int i;
4027
4028    fseq = PySequence_Fast(seq, "");
4029    if (fseq == NULL) {
4030	if (PyErr_ExceptionMatches(PyExc_TypeError))
4031	    PyErr_Format(PyExc_TypeError,
4032			 "sequence expected, %.80s found",
4033			 seq->ob_type->tp_name);
4034    	return NULL;
4035    }
4036
4037    /* Grrrr.  A codec may be invoked to convert str objects to
4038     * Unicode, and so it's possible to call back into Python code
4039     * during PyUnicode_FromObject(), and so it's possible for a sick
4040     * codec to change the size of fseq (if seq is a list).  Therefore
4041     * we have to keep refetching the size -- can't assume seqlen
4042     * is invariant.
4043     */
4044    seqlen = PySequence_Fast_GET_SIZE(fseq);
4045    /* If empty sequence, return u"". */
4046    if (seqlen == 0) {
4047    	res = _PyUnicode_New(0);  /* empty sequence; return u"" */
4048    	goto Done;
4049    }
4050    /* If singleton sequence with an exact Unicode, return that. */
4051    if (seqlen == 1) {
4052	item = PySequence_Fast_GET_ITEM(fseq, 0);
4053	if (PyUnicode_CheckExact(item)) {
4054	    Py_INCREF(item);
4055	    res = (PyUnicodeObject *)item;
4056	    goto Done;
4057	}
4058    }
4059
4060    /* At least two items to join, or one that isn't exact Unicode. */
4061    if (seqlen > 1) {
4062        /* Set up sep and seplen -- they're needed. */
4063    	if (separator == NULL) {
4064	    sep = &blank;
4065	    seplen = 1;
4066        }
4067    	else {
4068	    internal_separator = PyUnicode_FromObject(separator);
4069	    if (internal_separator == NULL)
4070	        goto onError;
4071	    sep = PyUnicode_AS_UNICODE(internal_separator);
4072	    seplen = PyUnicode_GET_SIZE(internal_separator);
4073	    /* In case PyUnicode_FromObject() mutated seq. */
4074	    seqlen = PySequence_Fast_GET_SIZE(fseq);
4075        }
4076    }
4077
4078    /* Get space. */
4079    res = _PyUnicode_New((int)res_alloc);
4080    if (res == NULL)
4081        goto onError;
4082    res_p = PyUnicode_AS_UNICODE(res);
4083    res_used = 0;
4084
4085    for (i = 0; i < seqlen; ++i) {
4086	size_t itemlen;
4087	size_t new_res_used;
4088
4089	item = PySequence_Fast_GET_ITEM(fseq, i);
4090	/* Convert item to Unicode. */
4091	if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4092	    PyErr_Format(PyExc_TypeError,
4093			 "sequence item %i: expected string or Unicode,"
4094			 " %.80s found",
4095			 i, item->ob_type->tp_name);
4096	    goto onError;
4097	}
4098	item = PyUnicode_FromObject(item);
4099	if (item == NULL)
4100	    goto onError;
4101	/* We own a reference to item from here on. */
4102
4103	/* In case PyUnicode_FromObject() mutated seq. */
4104	seqlen = PySequence_Fast_GET_SIZE(fseq);
4105
4106        /* Make sure we have enough space for the separator and the item. */
4107	itemlen = PyUnicode_GET_SIZE(item);
4108	new_res_used = res_used + itemlen;
4109	if (new_res_used < res_used ||  new_res_used > INT_MAX)
4110	    goto Overflow;
4111	if (i < seqlen - 1) {
4112	    new_res_used += seplen;
4113	    if (new_res_used < res_used ||  new_res_used > INT_MAX)
4114		goto Overflow;
4115	}
4116	if (new_res_used > res_alloc) {
4117	    /* double allocated size until it's big enough */
4118	    do {
4119	        size_t oldsize = res_alloc;
4120	        res_alloc += res_alloc;
4121	        if (res_alloc < oldsize || res_alloc > INT_MAX)
4122	            goto Overflow;
4123	    } while (new_res_used > res_alloc);
4124	    if (_PyUnicode_Resize(&res, (int)res_alloc) < 0) {
4125		Py_DECREF(item);
4126		goto onError;
4127	    }
4128            res_p = PyUnicode_AS_UNICODE(res) + res_used;
4129	}
4130
4131	/* Copy item, and maybe the separator. */
4132	Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), (int)itemlen);
4133	res_p += itemlen;
4134	if (i < seqlen - 1) {
4135	    Py_UNICODE_COPY(res_p, sep, (int)seplen);
4136	    res_p += seplen;
4137	}
4138	Py_DECREF(item);
4139	res_used = new_res_used;
4140    }
4141
4142    /* Shrink res to match the used area; this probably can't fail,
4143     * but it's cheap to check.
4144     */
4145    if (_PyUnicode_Resize(&res, (int)res_used) < 0)
4146	goto onError;
4147
4148 Done:
4149    Py_XDECREF(internal_separator);
4150    Py_DECREF(fseq);
4151    return (PyObject *)res;
4152
4153 Overflow:
4154    PyErr_SetString(PyExc_OverflowError,
4155                    "join() is too long for a Python string");
4156    Py_DECREF(item);
4157    /* fall through */
4158
4159 onError:
4160    Py_XDECREF(internal_separator);
4161    Py_DECREF(fseq);
4162    Py_XDECREF(res);
4163    return NULL;
4164}
4165
4166static
4167PyUnicodeObject *pad(PyUnicodeObject *self,
4168		     int left,
4169		     int right,
4170		     Py_UNICODE fill)
4171{
4172    PyUnicodeObject *u;
4173
4174    if (left < 0)
4175        left = 0;
4176    if (right < 0)
4177        right = 0;
4178
4179    if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
4180        Py_INCREF(self);
4181        return self;
4182    }
4183
4184    u = _PyUnicode_New(left + self->length + right);
4185    if (u) {
4186        if (left)
4187            Py_UNICODE_FILL(u->str, fill, left);
4188        Py_UNICODE_COPY(u->str + left, self->str, self->length);
4189        if (right)
4190            Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4191    }
4192
4193    return u;
4194}
4195
/* Append the slice data[left:right] to `list` as a new Unicode object.
   The expanding scope must provide a `PyObject *str` scratch variable,
   the `list` being filled and an `onError` label; control jumps to
   onError if the slice cannot be created or appended.  The body is
   wrapped in do { } while (0) so that the expansion is one statement
   and stays safe inside an unbraced if/else. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4206
/* Insert the slice data[left:right] at the front of `list` as a new
   Unicode object.  The expanding scope must provide a `PyObject *str`
   scratch variable, the `list` being filled and an `onError` label;
   control jumps to onError if the slice cannot be created or inserted.
   The body is wrapped in do { } while (0) so that the expansion is one
   statement and stays safe inside an unbraced if/else. */
#define SPLIT_INSERT(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Insert(list, 0, str)) {                              \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4217
4218static
4219PyObject *split_whitespace(PyUnicodeObject *self,
4220			   PyObject *list,
4221			   int maxcount)
4222{
4223    register int i;
4224    register int j;
4225    int len = self->length;
4226    PyObject *str;
4227
4228    for (i = j = 0; i < len; ) {
4229	/* find a token */
4230	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4231	    i++;
4232	j = i;
4233	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4234	    i++;
4235	if (j < i) {
4236	    if (maxcount-- <= 0)
4237		break;
4238	    SPLIT_APPEND(self->str, j, i);
4239	    while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4240		i++;
4241	    j = i;
4242	}
4243    }
4244    if (j < len) {
4245	SPLIT_APPEND(self->str, j, len);
4246    }
4247    return list;
4248
4249 onError:
4250    Py_DECREF(list);
4251    return NULL;
4252}
4253
4254PyObject *PyUnicode_Splitlines(PyObject *string,
4255			       int keepends)
4256{
4257    register int i;
4258    register int j;
4259    int len;
4260    PyObject *list;
4261    PyObject *str;
4262    Py_UNICODE *data;
4263
4264    string = PyUnicode_FromObject(string);
4265    if (string == NULL)
4266	return NULL;
4267    data = PyUnicode_AS_UNICODE(string);
4268    len = PyUnicode_GET_SIZE(string);
4269
4270    list = PyList_New(0);
4271    if (!list)
4272        goto onError;
4273
4274    for (i = j = 0; i < len; ) {
4275	int eol;
4276
4277	/* Find a line and append it */
4278	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
4279	    i++;
4280
4281	/* Skip the line break reading CRLF as one line break */
4282	eol = i;
4283	if (i < len) {
4284	    if (data[i] == '\r' && i + 1 < len &&
4285		data[i+1] == '\n')
4286		i += 2;
4287	    else
4288		i++;
4289	    if (keepends)
4290		eol = i;
4291	}
4292	SPLIT_APPEND(data, j, eol);
4293	j = i;
4294    }
4295    if (j < len) {
4296	SPLIT_APPEND(data, j, len);
4297    }
4298
4299    Py_DECREF(string);
4300    return list;
4301
4302 onError:
4303    Py_DECREF(list);
4304    Py_DECREF(string);
4305    return NULL;
4306}
4307
4308static
4309PyObject *split_char(PyUnicodeObject *self,
4310		     PyObject *list,
4311		     Py_UNICODE ch,
4312		     int maxcount)
4313{
4314    register int i;
4315    register int j;
4316    int len = self->length;
4317    PyObject *str;
4318
4319    for (i = j = 0; i < len; ) {
4320	if (self->str[i] == ch) {
4321	    if (maxcount-- <= 0)
4322		break;
4323	    SPLIT_APPEND(self->str, j, i);
4324	    i = j = i + 1;
4325	} else
4326	    i++;
4327    }
4328    if (j <= len) {
4329	SPLIT_APPEND(self->str, j, len);
4330    }
4331    return list;
4332
4333 onError:
4334    Py_DECREF(list);
4335    return NULL;
4336}
4337
4338static
4339PyObject *split_substring(PyUnicodeObject *self,
4340			  PyObject *list,
4341			  PyUnicodeObject *substring,
4342			  int maxcount)
4343{
4344    register int i;
4345    register int j;
4346    int len = self->length;
4347    int sublen = substring->length;
4348    PyObject *str;
4349
4350    for (i = j = 0; i <= len - sublen; ) {
4351	if (Py_UNICODE_MATCH(self, i, substring)) {
4352	    if (maxcount-- <= 0)
4353		break;
4354	    SPLIT_APPEND(self->str, j, i);
4355	    i = j = i + sublen;
4356	} else
4357	    i++;
4358    }
4359    if (j <= len) {
4360	SPLIT_APPEND(self->str, j, len);
4361    }
4362    return list;
4363
4364 onError:
4365    Py_DECREF(list);
4366    return NULL;
4367}
4368
4369static
4370PyObject *rsplit_whitespace(PyUnicodeObject *self,
4371			    PyObject *list,
4372			    int maxcount)
4373{
4374    register int i;
4375    register int j;
4376    int len = self->length;
4377    PyObject *str;
4378
4379    for (i = j = len - 1; i >= 0; ) {
4380	/* find a token */
4381	while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4382	    i--;
4383	j = i;
4384	while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4385	    i--;
4386	if (j > i) {
4387	    if (maxcount-- <= 0)
4388		break;
4389	    SPLIT_INSERT(self->str, i + 1, j + 1);
4390	    while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4391		i--;
4392	    j = i;
4393	}
4394    }
4395    if (j >= 0) {
4396	SPLIT_INSERT(self->str, 0, j + 1);
4397    }
4398    return list;
4399
4400 onError:
4401    Py_DECREF(list);
4402    return NULL;
4403}
4404
4405static
4406PyObject *rsplit_char(PyUnicodeObject *self,
4407		      PyObject *list,
4408		      Py_UNICODE ch,
4409		      int maxcount)
4410{
4411    register int i;
4412    register int j;
4413    int len = self->length;
4414    PyObject *str;
4415
4416    for (i = j = len - 1; i >= 0; ) {
4417	if (self->str[i] == ch) {
4418	    if (maxcount-- <= 0)
4419		break;
4420	    SPLIT_INSERT(self->str, i + 1, j + 1);
4421	    j = i = i - 1;
4422	} else
4423	    i--;
4424    }
4425    if (j >= -1) {
4426	SPLIT_INSERT(self->str, 0, j + 1);
4427    }
4428    return list;
4429
4430 onError:
4431    Py_DECREF(list);
4432    return NULL;
4433}
4434
4435static
4436PyObject *rsplit_substring(PyUnicodeObject *self,
4437			   PyObject *list,
4438			   PyUnicodeObject *substring,
4439			   int maxcount)
4440{
4441    register int i;
4442    register int j;
4443    int len = self->length;
4444    int sublen = substring->length;
4445    PyObject *str;
4446
4447    for (i = len - sublen, j = len; i >= 0; ) {
4448	if (Py_UNICODE_MATCH(self, i, substring)) {
4449	    if (maxcount-- <= 0)
4450		break;
4451	    SPLIT_INSERT(self->str, i + sublen, j);
4452	    j = i;
4453	    i -= sublen;
4454	} else
4455	    i--;
4456    }
4457    if (j >= 0) {
4458	SPLIT_INSERT(self->str, 0, j);
4459    }
4460    return list;
4461
4462 onError:
4463    Py_DECREF(list);
4464    return NULL;
4465}
4466
4467#undef SPLIT_APPEND
4468#undef SPLIT_INSERT
4469
4470static
4471PyObject *split(PyUnicodeObject *self,
4472		PyUnicodeObject *substring,
4473		int maxcount)
4474{
4475    PyObject *list;
4476
4477    if (maxcount < 0)
4478        maxcount = INT_MAX;
4479
4480    list = PyList_New(0);
4481    if (!list)
4482        return NULL;
4483
4484    if (substring == NULL)
4485	return split_whitespace(self,list,maxcount);
4486
4487    else if (substring->length == 1)
4488	return split_char(self,list,substring->str[0],maxcount);
4489
4490    else if (substring->length == 0) {
4491	Py_DECREF(list);
4492	PyErr_SetString(PyExc_ValueError, "empty separator");
4493	return NULL;
4494    }
4495    else
4496	return split_substring(self,list,substring,maxcount);
4497}
4498
4499static
4500PyObject *rsplit(PyUnicodeObject *self,
4501		 PyUnicodeObject *substring,
4502		 int maxcount)
4503{
4504    PyObject *list;
4505
4506    if (maxcount < 0)
4507        maxcount = INT_MAX;
4508
4509    list = PyList_New(0);
4510    if (!list)
4511        return NULL;
4512
4513    if (substring == NULL)
4514	return rsplit_whitespace(self,list,maxcount);
4515
4516    else if (substring->length == 1)
4517	return rsplit_char(self,list,substring->str[0],maxcount);
4518
4519    else if (substring->length == 0) {
4520	Py_DECREF(list);
4521	PyErr_SetString(PyExc_ValueError, "empty separator");
4522	return NULL;
4523    }
4524    else
4525	return rsplit_substring(self,list,substring,maxcount);
4526}
4527
4528static
4529PyObject *replace(PyUnicodeObject *self,
4530		  PyUnicodeObject *str1,
4531		  PyUnicodeObject *str2,
4532		  int maxcount)
4533{
4534    PyUnicodeObject *u;
4535
4536    if (maxcount < 0)
4537	maxcount = INT_MAX;
4538
4539    if (str1->length == 1 && str2->length == 1) {
4540        int i;
4541
4542        /* replace characters */
4543        if (!findchar(self->str, self->length, str1->str[0]) &&
4544            PyUnicode_CheckExact(self)) {
4545            /* nothing to replace, return original string */
4546            Py_INCREF(self);
4547            u = self;
4548        } else {
4549	    Py_UNICODE u1 = str1->str[0];
4550	    Py_UNICODE u2 = str2->str[0];
4551
4552            u = (PyUnicodeObject*) PyUnicode_FromUnicode(
4553                NULL,
4554                self->length
4555                );
4556            if (u != NULL) {
4557		Py_UNICODE_COPY(u->str, self->str,
4558				self->length);
4559                for (i = 0; i < u->length; i++)
4560                    if (u->str[i] == u1) {
4561                        if (--maxcount < 0)
4562                            break;
4563                        u->str[i] = u2;
4564                    }
4565        }
4566        }
4567
4568    } else {
4569        int n, i;
4570        Py_UNICODE *p;
4571
4572        /* replace strings */
4573        n = count(self, 0, self->length, str1);
4574        if (n > maxcount)
4575            n = maxcount;
4576        if (n == 0) {
4577            /* nothing to replace, return original string */
4578            if (PyUnicode_CheckExact(self)) {
4579                Py_INCREF(self);
4580                u = self;
4581            }
4582            else {
4583                u = (PyUnicodeObject *)
4584                    PyUnicode_FromUnicode(self->str, self->length);
4585	    }
4586        } else {
4587            u = _PyUnicode_New(
4588                self->length + n * (str2->length - str1->length));
4589            if (u) {
4590                i = 0;
4591                p = u->str;
4592                if (str1->length > 0) {
4593                    while (i <= self->length - str1->length)
4594                        if (Py_UNICODE_MATCH(self, i, str1)) {
4595                            /* replace string segment */
4596                            Py_UNICODE_COPY(p, str2->str, str2->length);
4597                            p += str2->length;
4598                            i += str1->length;
4599                            if (--n <= 0) {
4600                                /* copy remaining part */
4601                                Py_UNICODE_COPY(p, self->str+i, self->length-i);
4602                                break;
4603                            }
4604                        } else
4605                            *p++ = self->str[i++];
4606                } else {
4607                    while (n > 0) {
4608                        Py_UNICODE_COPY(p, str2->str, str2->length);
4609                        p += str2->length;
4610                        if (--n <= 0)
4611                            break;
4612                        *p++ = self->str[i++];
4613                    }
4614                    Py_UNICODE_COPY(p, self->str+i, self->length-i);
4615                }
4616            }
4617        }
4618    }
4619
4620    return (PyObject *) u;
4621}
4622
4623/* --- Unicode Object Methods --------------------------------------------- */
4624
4625PyDoc_STRVAR(title__doc__,
4626"S.title() -> unicode\n\
4627\n\
4628Return a titlecased version of S, i.e. words start with title case\n\
4629characters, all remaining cased characters have lower case.");
4630
4631static PyObject*
4632unicode_title(PyUnicodeObject *self)
4633{
4634    return fixup(self, fixtitle);
4635}
4636
4637PyDoc_STRVAR(capitalize__doc__,
4638"S.capitalize() -> unicode\n\
4639\n\
4640Return a capitalized version of S, i.e. make the first character\n\
4641have upper case.");
4642
4643static PyObject*
4644unicode_capitalize(PyUnicodeObject *self)
4645{
4646    return fixup(self, fixcapitalize);
4647}
4648
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled method: splits S on whitespace, replaces every word in the
   list by fixup(word, fixcapitalize) and joins the words again with
   PyUnicode_Join(NULL, ...).  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined = NULL;
    int idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old_word = PyList_GET_ITEM(words, idx);
        PyObject *new_word = fixup((PyUnicodeObject *)old_word,
                                   fixcapitalize);
        if (new_word == NULL)
            goto done;
        Py_DECREF(old_word);
        PyList_SET_ITEM(words, idx, new_word);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return joined;
}
#endif
4686
4687/* Argument converter.  Coerces to a single unicode character */
4688
4689static int
4690convert_uc(PyObject *obj, void *addr)
4691{
4692	Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
4693	PyObject *uniobj;
4694	Py_UNICODE *unistr;
4695
4696	uniobj = PyUnicode_FromObject(obj);
4697	if (uniobj == NULL) {
4698		PyErr_SetString(PyExc_TypeError,
4699			"The fill character cannot be converted to Unicode");
4700		return 0;
4701	}
4702	if (PyUnicode_GET_SIZE(uniobj) != 1) {
4703		PyErr_SetString(PyExc_TypeError,
4704			"The fill character must be exactly one character long");
4705		Py_DECREF(uniobj);
4706		return 0;
4707	}
4708	unistr = PyUnicode_AS_UNICODE(uniobj);
4709	*fillcharloc = unistr[0];
4710	Py_DECREF(uniobj);
4711	return 1;
4712}
4713
4714PyDoc_STRVAR(center__doc__,
4715"S.center(width[, fillchar]) -> unicode\n\
4716\n\
4717Return S centered in a Unicode string of length width. Padding is\n\
4718done using the specified fill character (default is a space)");
4719
4720static PyObject *
4721unicode_center(PyUnicodeObject *self, PyObject *args)
4722{
4723    int marg, left;
4724    int width;
4725    Py_UNICODE fillchar = ' ';
4726
4727    if (!PyArg_ParseTuple(args, "i|O&:center", &width, convert_uc, &fillchar))
4728        return NULL;
4729
4730    if (self->length >= width && PyUnicode_CheckExact(self)) {
4731        Py_INCREF(self);
4732        return (PyObject*) self;
4733    }
4734
4735    marg = width - self->length;
4736    left = marg / 2 + (marg & width & 1);
4737
4738    return (PyObject*) pad(self, left, marg - left, fillchar);
4739}
4740
4741#if 0
4742
4743/* This code should go into some future Unicode collation support
4744   module. The basic comparison should compare ordinals on a naive
4745   basis (this is what Java does and thus JPython too). */
4746
4747/* speedy UTF-16 code point order comparison */
4748/* gleaned from: */
4749/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4750
4751static short utf16Fixup[32] =
4752{
4753    0, 0, 0, 0, 0, 0, 0, 0,
4754    0, 0, 0, 0, 0, 0, 0, 0,
4755    0, 0, 0, 0, 0, 0, 0, 0,
4756    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
4757};
4758
4759static int
4760unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4761{
4762    int len1, len2;
4763
4764    Py_UNICODE *s1 = str1->str;
4765    Py_UNICODE *s2 = str2->str;
4766
4767    len1 = str1->length;
4768    len2 = str2->length;
4769
4770    while (len1 > 0 && len2 > 0) {
4771        Py_UNICODE c1, c2;
4772
4773        c1 = *s1++;
4774        c2 = *s2++;
4775
4776	if (c1 > (1<<11) * 26)
4777	    c1 += utf16Fixup[c1>>11];
4778	if (c2 > (1<<11) * 26)
4779            c2 += utf16Fixup[c2>>11];
4780        /* now c1 and c2 are in UTF-32-compatible order */
4781
4782        if (c1 != c2)
4783            return (c1 < c2) ? -1 : 1;
4784
4785        len1--; len2--;
4786    }
4787
4788    return (len1 < len2) ? -1 : (len1 != len2);
4789}
4790
4791#else
4792
4793static int
4794unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4795{
4796    register int len1, len2;
4797
4798    Py_UNICODE *s1 = str1->str;
4799    Py_UNICODE *s2 = str2->str;
4800
4801    len1 = str1->length;
4802    len2 = str2->length;
4803
4804    while (len1 > 0 && len2 > 0) {
4805        Py_UNICODE c1, c2;
4806
4807        c1 = *s1++;
4808        c2 = *s2++;
4809
4810        if (c1 != c2)
4811            return (c1 < c2) ? -1 : 1;
4812
4813        len1--; len2--;
4814    }
4815
4816    return (len1 < len2) ? -1 : (len1 != len2);
4817}
4818
4819#endif
4820
4821int PyUnicode_Compare(PyObject *left,
4822		      PyObject *right)
4823{
4824    PyUnicodeObject *u = NULL, *v = NULL;
4825    int result;
4826
4827    /* Coerce the two arguments */
4828    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4829    if (u == NULL)
4830	goto onError;
4831    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4832    if (v == NULL)
4833	goto onError;
4834
4835    /* Shortcut for empty or interned objects */
4836    if (v == u) {
4837	Py_DECREF(u);
4838	Py_DECREF(v);
4839	return 0;
4840    }
4841
4842    result = unicode_compare(u, v);
4843
4844    Py_DECREF(u);
4845    Py_DECREF(v);
4846    return result;
4847
4848onError:
4849    Py_XDECREF(u);
4850    Py_XDECREF(v);
4851    return -1;
4852}
4853
4854int PyUnicode_Contains(PyObject *container,
4855		       PyObject *element)
4856{
4857    PyUnicodeObject *u = NULL, *v = NULL;
4858    int result, size;
4859    register const Py_UNICODE *lhs, *end, *rhs;
4860
4861    /* Coerce the two arguments */
4862    v = (PyUnicodeObject *)PyUnicode_FromObject(element);
4863    if (v == NULL) {
4864	PyErr_SetString(PyExc_TypeError,
4865	    "'in <string>' requires string as left operand");
4866	goto onError;
4867    }
4868    u = (PyUnicodeObject *)PyUnicode_FromObject(container);
4869    if (u == NULL)
4870	goto onError;
4871
4872    size = PyUnicode_GET_SIZE(v);
4873    rhs = PyUnicode_AS_UNICODE(v);
4874    lhs = PyUnicode_AS_UNICODE(u);
4875
4876    result = 0;
4877    if (size == 1) {
4878	end = lhs + PyUnicode_GET_SIZE(u);
4879	while (lhs < end) {
4880	    if (*lhs++ == *rhs) {
4881		result = 1;
4882		break;
4883	    }
4884	}
4885    }
4886    else {
4887	end = lhs + (PyUnicode_GET_SIZE(u) - size);
4888	while (lhs <= end) {
4889	    if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
4890		result = 1;
4891		break;
4892	    }
4893	}
4894    }
4895
4896    Py_DECREF(u);
4897    Py_DECREF(v);
4898    return result;
4899
4900onError:
4901    Py_XDECREF(u);
4902    Py_XDECREF(v);
4903    return -1;
4904}
4905
4906/* Concat to string or Unicode object giving a new Unicode object. */
4907
4908PyObject *PyUnicode_Concat(PyObject *left,
4909			   PyObject *right)
4910{
4911    PyUnicodeObject *u = NULL, *v = NULL, *w;
4912
4913    /* Coerce the two arguments */
4914    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4915    if (u == NULL)
4916	goto onError;
4917    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4918    if (v == NULL)
4919	goto onError;
4920
4921    /* Shortcuts */
4922    if (v == unicode_empty) {
4923	Py_DECREF(v);
4924	return (PyObject *)u;
4925    }
4926    if (u == unicode_empty) {
4927	Py_DECREF(u);
4928	return (PyObject *)v;
4929    }
4930
4931    /* Concat the two Unicode strings */
4932    w = _PyUnicode_New(u->length + v->length);
4933    if (w == NULL)
4934	goto onError;
4935    Py_UNICODE_COPY(w->str, u->str, u->length);
4936    Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
4937
4938    Py_DECREF(u);
4939    Py_DECREF(v);
4940    return (PyObject *)w;
4941
4942onError:
4943    Py_XDECREF(u);
4944    Py_XDECREF(v);
4945    return NULL;
4946}
4947
4948PyDoc_STRVAR(count__doc__,
4949"S.count(sub[, start[, end]]) -> int\n\
4950\n\
4951Return the number of occurrences of substring sub in Unicode string\n\
4952S[start:end].  Optional arguments start and end are\n\
4953interpreted as in slice notation.");
4954
4955static PyObject *
4956unicode_count(PyUnicodeObject *self, PyObject *args)
4957{
4958    PyUnicodeObject *substring;
4959    int start = 0;
4960    int end = INT_MAX;
4961    PyObject *result;
4962
4963    if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
4964		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4965        return NULL;
4966
4967    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4968						(PyObject *)substring);
4969    if (substring == NULL)
4970	return NULL;
4971
4972    if (start < 0)
4973        start += self->length;
4974    if (start < 0)
4975        start = 0;
4976    if (end > self->length)
4977        end = self->length;
4978    if (end < 0)
4979        end += self->length;
4980    if (end < 0)
4981        end = 0;
4982
4983    result = PyInt_FromLong((long) count(self, start, end, substring));
4984
4985    Py_DECREF(substring);
4986    return result;
4987}
4988
4989PyDoc_STRVAR(encode__doc__,
4990"S.encode([encoding[,errors]]) -> string or unicode\n\
4991\n\
4992Encodes S using the codec registered for encoding. encoding defaults\n\
4993to the default encoding. errors may be given to set a different error\n\
4994handling scheme. Default is 'strict' meaning that encoding errors raise\n\
4995a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
4996'xmlcharrefreplace' as well as any other name registered with\n\
4997codecs.register_error that can handle UnicodeEncodeErrors.");
4998
4999static PyObject *
5000unicode_encode(PyUnicodeObject *self, PyObject *args)
5001{
5002    char *encoding = NULL;
5003    char *errors = NULL;
5004    PyObject *v;
5005
5006    if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5007        return NULL;
5008    v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
5009    if (v == NULL)
5010        goto onError;
5011    if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5012        PyErr_Format(PyExc_TypeError,
5013                     "encoder did not return a string/unicode object "
5014                     "(type=%.400s)",
5015                     v->ob_type->tp_name);
5016        Py_DECREF(v);
5017        return NULL;
5018    }
5019    return v;
5020
5021 onError:
5022    return NULL;
5023}
5024
5025PyDoc_STRVAR(decode__doc__,
5026"S.decode([encoding[,errors]]) -> string or unicode\n\
5027\n\
5028Decodes S using the codec registered for encoding. encoding defaults\n\
5029to the default encoding. errors may be given to set a different error\n\
5030handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5031a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5032as well as any other name registerd with codecs.register_error that is\n\
5033able to handle UnicodeDecodeErrors.");
5034
5035static PyObject *
5036unicode_decode(PyUnicodeObject *self, PyObject *args)
5037{
5038    char *encoding = NULL;
5039    char *errors = NULL;
5040    PyObject *v;
5041
5042    if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5043        return NULL;
5044    v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
5045    if (v == NULL)
5046        goto onError;
5047    if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5048        PyErr_Format(PyExc_TypeError,
5049                     "decoder did not return a string/unicode object "
5050                     "(type=%.400s)",
5051                     v->ob_type->tp_name);
5052        Py_DECREF(v);
5053        return NULL;
5054    }
5055    return v;
5056
5057 onError:
5058    return NULL;
5059}
5060
5061PyDoc_STRVAR(expandtabs__doc__,
5062"S.expandtabs([tabsize]) -> unicode\n\
5063\n\
5064Return a copy of S where all tab characters are expanded using spaces.\n\
5065If tabsize is not given, a tab size of 8 characters is assumed.");
5066
5067static PyObject*
5068unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5069{
5070    Py_UNICODE *e;
5071    Py_UNICODE *p;
5072    Py_UNICODE *q;
5073    int i, j;
5074    PyUnicodeObject *u;
5075    int tabsize = 8;
5076
5077    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5078	return NULL;
5079
5080    /* First pass: determine size of output string */
5081    i = j = 0;
5082    e = self->str + self->length;
5083    for (p = self->str; p < e; p++)
5084        if (*p == '\t') {
5085	    if (tabsize > 0)
5086		j += tabsize - (j % tabsize);
5087	}
5088        else {
5089            j++;
5090            if (*p == '\n' || *p == '\r') {
5091                i += j;
5092                j = 0;
5093            }
5094        }
5095
5096    /* Second pass: create output string and fill it */
5097    u = _PyUnicode_New(i + j);
5098    if (!u)
5099        return NULL;
5100
5101    j = 0;
5102    q = u->str;
5103
5104    for (p = self->str; p < e; p++)
5105        if (*p == '\t') {
5106	    if (tabsize > 0) {
5107		i = tabsize - (j % tabsize);
5108		j += i;
5109		while (i--)
5110		    *q++ = ' ';
5111	    }
5112	}
5113	else {
5114            j++;
5115	    *q++ = *p;
5116            if (*p == '\n' || *p == '\r')
5117                j = 0;
5118        }
5119
5120    return (PyObject*) u;
5121}
5122
5123PyDoc_STRVAR(find__doc__,
5124"S.find(sub [,start [,end]]) -> int\n\
5125\n\
5126Return the lowest index in S where substring sub is found,\n\
5127such that sub is contained within s[start,end].  Optional\n\
5128arguments start and end are interpreted as in slice notation.\n\
5129\n\
5130Return -1 on failure.");
5131
5132static PyObject *
5133unicode_find(PyUnicodeObject *self, PyObject *args)
5134{
5135    PyUnicodeObject *substring;
5136    int start = 0;
5137    int end = INT_MAX;
5138    PyObject *result;
5139
5140    if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5141		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5142        return NULL;
5143    substring = (PyUnicodeObject *)PyUnicode_FromObject(
5144						(PyObject *)substring);
5145    if (substring == NULL)
5146	return NULL;
5147
5148    result = PyInt_FromLong(findstring(self, substring, start, end, 1));
5149
5150    Py_DECREF(substring);
5151    return result;
5152}
5153
5154static PyObject *
5155unicode_getitem(PyUnicodeObject *self, int index)
5156{
5157    if (index < 0 || index >= self->length) {
5158        PyErr_SetString(PyExc_IndexError, "string index out of range");
5159        return NULL;
5160    }
5161
5162    return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5163}
5164
5165static long
5166unicode_hash(PyUnicodeObject *self)
5167{
5168    /* Since Unicode objects compare equal to their ASCII string
5169       counterparts, they should use the individual character values
5170       as basis for their hash value.  This is needed to assure that
5171       strings and Unicode objects behave in the same way as
5172       dictionary keys. */
5173
5174    register int len;
5175    register Py_UNICODE *p;
5176    register long x;
5177
5178    if (self->hash != -1)
5179	return self->hash;
5180    len = PyUnicode_GET_SIZE(self);
5181    p = PyUnicode_AS_UNICODE(self);
5182    x = *p << 7;
5183    while (--len >= 0)
5184	x = (1000003*x) ^ *p++;
5185    x ^= PyUnicode_GET_SIZE(self);
5186    if (x == -1)
5187	x = -2;
5188    self->hash = x;
5189    return x;
5190}
5191
5192PyDoc_STRVAR(index__doc__,
5193"S.index(sub [,start [,end]]) -> int\n\
5194\n\
5195Like S.find() but raise ValueError when the substring is not found.");
5196
5197static PyObject *
5198unicode_index(PyUnicodeObject *self, PyObject *args)
5199{
5200    int result;
5201    PyUnicodeObject *substring;
5202    int start = 0;
5203    int end = INT_MAX;
5204
5205    if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5206		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5207        return NULL;
5208
5209    substring = (PyUnicodeObject *)PyUnicode_FromObject(
5210						(PyObject *)substring);
5211    if (substring == NULL)
5212	return NULL;
5213
5214    result = findstring(self, substring, start, end, 1);
5215
5216    Py_DECREF(substring);
5217    if (result < 0) {
5218        PyErr_SetString(PyExc_ValueError, "substring not found");
5219        return NULL;
5220    }
5221    return PyInt_FromLong(result);
5222}
5223
5224PyDoc_STRVAR(islower__doc__,
5225"S.islower() -> bool\n\
5226\n\
5227Return True if all cased characters in S are lowercase and there is\n\
5228at least one cased character in S, False otherwise.");
5229
5230static PyObject*
5231unicode_islower(PyUnicodeObject *self)
5232{
5233    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5234    register const Py_UNICODE *e;
5235    int cased;
5236
5237    /* Shortcut for single character strings */
5238    if (PyUnicode_GET_SIZE(self) == 1)
5239	return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
5240
5241    /* Special case for empty strings */
5242    if (PyString_GET_SIZE(self) == 0)
5243	return PyBool_FromLong(0);
5244
5245    e = p + PyUnicode_GET_SIZE(self);
5246    cased = 0;
5247    for (; p < e; p++) {
5248	register const Py_UNICODE ch = *p;
5249
5250	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
5251	    return PyBool_FromLong(0);
5252	else if (!cased && Py_UNICODE_ISLOWER(ch))
5253	    cased = 1;
5254    }
5255    return PyBool_FromLong(cased);
5256}
5257
5258PyDoc_STRVAR(isupper__doc__,
5259"S.isupper() -> bool\n\
5260\n\
5261Return True if all cased characters in S are uppercase and there is\n\
5262at least one cased character in S, False otherwise.");
5263
5264static PyObject*
5265unicode_isupper(PyUnicodeObject *self)
5266{
5267    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5268    register const Py_UNICODE *e;
5269    int cased;
5270
5271    /* Shortcut for single character strings */
5272    if (PyUnicode_GET_SIZE(self) == 1)
5273	return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
5274
5275    /* Special case for empty strings */
5276    if (PyString_GET_SIZE(self) == 0)
5277	return PyBool_FromLong(0);
5278
5279    e = p + PyUnicode_GET_SIZE(self);
5280    cased = 0;
5281    for (; p < e; p++) {
5282	register const Py_UNICODE ch = *p;
5283
5284	if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
5285	    return PyBool_FromLong(0);
5286	else if (!cased && Py_UNICODE_ISUPPER(ch))
5287	    cased = 1;
5288    }
5289    return PyBool_FromLong(cased);
5290}
5291
5292PyDoc_STRVAR(istitle__doc__,
5293"S.istitle() -> bool\n\
5294\n\
5295Return True if S is a titlecased string and there is at least one\n\
5296character in S, i.e. upper- and titlecase characters may only\n\
5297follow uncased characters and lowercase characters only cased ones.\n\
5298Return False otherwise.");
5299
5300static PyObject*
5301unicode_istitle(PyUnicodeObject *self)
5302{
5303    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5304    register const Py_UNICODE *e;
5305    int cased, previous_is_cased;
5306
5307    /* Shortcut for single character strings */
5308    if (PyUnicode_GET_SIZE(self) == 1)
5309	return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5310			       (Py_UNICODE_ISUPPER(*p) != 0));
5311
5312    /* Special case for empty strings */
5313    if (PyString_GET_SIZE(self) == 0)
5314	return PyBool_FromLong(0);
5315
5316    e = p + PyUnicode_GET_SIZE(self);
5317    cased = 0;
5318    previous_is_cased = 0;
5319    for (; p < e; p++) {
5320	register const Py_UNICODE ch = *p;
5321
5322	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5323	    if (previous_is_cased)
5324		return PyBool_FromLong(0);
5325	    previous_is_cased = 1;
5326	    cased = 1;
5327	}
5328	else if (Py_UNICODE_ISLOWER(ch)) {
5329	    if (!previous_is_cased)
5330		return PyBool_FromLong(0);
5331	    previous_is_cased = 1;
5332	    cased = 1;
5333	}
5334	else
5335	    previous_is_cased = 0;
5336    }
5337    return PyBool_FromLong(cased);
5338}
5339
5340PyDoc_STRVAR(isspace__doc__,
5341"S.isspace() -> bool\n\
5342\n\
5343Return True if all characters in S are whitespace\n\
5344and there is at least one character in S, False otherwise.");
5345
5346static PyObject*
5347unicode_isspace(PyUnicodeObject *self)
5348{
5349    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5350    register const Py_UNICODE *e;
5351
5352    /* Shortcut for single character strings */
5353    if (PyUnicode_GET_SIZE(self) == 1 &&
5354	Py_UNICODE_ISSPACE(*p))
5355	return PyBool_FromLong(1);
5356
5357    /* Special case for empty strings */
5358    if (PyString_GET_SIZE(self) == 0)
5359	return PyBool_FromLong(0);
5360
5361    e = p + PyUnicode_GET_SIZE(self);
5362    for (; p < e; p++) {
5363	if (!Py_UNICODE_ISSPACE(*p))
5364	    return PyBool_FromLong(0);
5365    }
5366    return PyBool_FromLong(1);
5367}
5368
5369PyDoc_STRVAR(isalpha__doc__,
5370"S.isalpha() -> bool\n\
5371\n\
5372Return True if all characters in S are alphabetic\n\
5373and there is at least one character in S, False otherwise.");
5374
5375static PyObject*
5376unicode_isalpha(PyUnicodeObject *self)
5377{
5378    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5379    register const Py_UNICODE *e;
5380
5381    /* Shortcut for single character strings */
5382    if (PyUnicode_GET_SIZE(self) == 1 &&
5383	Py_UNICODE_ISALPHA(*p))
5384	return PyBool_FromLong(1);
5385
5386    /* Special case for empty strings */
5387    if (PyString_GET_SIZE(self) == 0)
5388	return PyBool_FromLong(0);
5389
5390    e = p + PyUnicode_GET_SIZE(self);
5391    for (; p < e; p++) {
5392	if (!Py_UNICODE_ISALPHA(*p))
5393	    return PyBool_FromLong(0);
5394    }
5395    return PyBool_FromLong(1);
5396}
5397
5398PyDoc_STRVAR(isalnum__doc__,
5399"S.isalnum() -> bool\n\
5400\n\
5401Return True if all characters in S are alphanumeric\n\
5402and there is at least one character in S, False otherwise.");
5403
5404static PyObject*
5405unicode_isalnum(PyUnicodeObject *self)
5406{
5407    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5408    register const Py_UNICODE *e;
5409
5410    /* Shortcut for single character strings */
5411    if (PyUnicode_GET_SIZE(self) == 1 &&
5412	Py_UNICODE_ISALNUM(*p))
5413	return PyBool_FromLong(1);
5414
5415    /* Special case for empty strings */
5416    if (PyString_GET_SIZE(self) == 0)
5417	return PyBool_FromLong(0);
5418
5419    e = p + PyUnicode_GET_SIZE(self);
5420    for (; p < e; p++) {
5421	if (!Py_UNICODE_ISALNUM(*p))
5422	    return PyBool_FromLong(0);
5423    }
5424    return PyBool_FromLong(1);
5425}
5426
5427PyDoc_STRVAR(isdecimal__doc__,
5428"S.isdecimal() -> bool\n\
5429\n\
5430Return True if there are only decimal characters in S,\n\
5431False otherwise.");
5432
5433static PyObject*
5434unicode_isdecimal(PyUnicodeObject *self)
5435{
5436    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5437    register const Py_UNICODE *e;
5438
5439    /* Shortcut for single character strings */
5440    if (PyUnicode_GET_SIZE(self) == 1 &&
5441	Py_UNICODE_ISDECIMAL(*p))
5442	return PyBool_FromLong(1);
5443
5444    /* Special case for empty strings */
5445    if (PyString_GET_SIZE(self) == 0)
5446	return PyBool_FromLong(0);
5447
5448    e = p + PyUnicode_GET_SIZE(self);
5449    for (; p < e; p++) {
5450	if (!Py_UNICODE_ISDECIMAL(*p))
5451	    return PyBool_FromLong(0);
5452    }
5453    return PyBool_FromLong(1);
5454}
5455
5456PyDoc_STRVAR(isdigit__doc__,
5457"S.isdigit() -> bool\n\
5458\n\
5459Return True if all characters in S are digits\n\
5460and there is at least one character in S, False otherwise.");
5461
5462static PyObject*
5463unicode_isdigit(PyUnicodeObject *self)
5464{
5465    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5466    register const Py_UNICODE *e;
5467
5468    /* Shortcut for single character strings */
5469    if (PyUnicode_GET_SIZE(self) == 1 &&
5470	Py_UNICODE_ISDIGIT(*p))
5471	return PyBool_FromLong(1);
5472
5473    /* Special case for empty strings */
5474    if (PyString_GET_SIZE(self) == 0)
5475	return PyBool_FromLong(0);
5476
5477    e = p + PyUnicode_GET_SIZE(self);
5478    for (; p < e; p++) {
5479	if (!Py_UNICODE_ISDIGIT(*p))
5480	    return PyBool_FromLong(0);
5481    }
5482    return PyBool_FromLong(1);
5483}
5484
5485PyDoc_STRVAR(isnumeric__doc__,
5486"S.isnumeric() -> bool\n\
5487\n\
5488Return True if there are only numeric characters in S,\n\
5489False otherwise.");
5490
5491static PyObject*
5492unicode_isnumeric(PyUnicodeObject *self)
5493{
5494    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5495    register const Py_UNICODE *e;
5496
5497    /* Shortcut for single character strings */
5498    if (PyUnicode_GET_SIZE(self) == 1 &&
5499	Py_UNICODE_ISNUMERIC(*p))
5500	return PyBool_FromLong(1);
5501
5502    /* Special case for empty strings */
5503    if (PyString_GET_SIZE(self) == 0)
5504	return PyBool_FromLong(0);
5505
5506    e = p + PyUnicode_GET_SIZE(self);
5507    for (; p < e; p++) {
5508	if (!Py_UNICODE_ISNUMERIC(*p))
5509	    return PyBool_FromLong(0);
5510    }
5511    return PyBool_FromLong(1);
5512}
5513
5514PyDoc_STRVAR(join__doc__,
5515"S.join(sequence) -> unicode\n\
5516\n\
5517Return a string which is the concatenation of the strings in the\n\
5518sequence.  The separator between elements is S.");
5519
5520static PyObject*
5521unicode_join(PyObject *self, PyObject *data)
5522{
5523    return PyUnicode_Join(self, data);
5524}
5525
5526static int
5527unicode_length(PyUnicodeObject *self)
5528{
5529    return self->length;
5530}
5531
5532PyDoc_STRVAR(ljust__doc__,
5533"S.ljust(width[, fillchar]) -> int\n\
5534\n\
5535Return S left justified in a Unicode string of length width. Padding is\n\
5536done using the specified fill character (default is a space).");
5537
5538static PyObject *
5539unicode_ljust(PyUnicodeObject *self, PyObject *args)
5540{
5541    int width;
5542    Py_UNICODE fillchar = ' ';
5543
5544    if (!PyArg_ParseTuple(args, "i|O&:ljust", &width, convert_uc, &fillchar))
5545        return NULL;
5546
5547    if (self->length >= width && PyUnicode_CheckExact(self)) {
5548        Py_INCREF(self);
5549        return (PyObject*) self;
5550    }
5551
5552    return (PyObject*) pad(self, 0, width - self->length, fillchar);
5553}
5554
5555PyDoc_STRVAR(lower__doc__,
5556"S.lower() -> unicode\n\
5557\n\
5558Return a copy of the string S converted to lowercase.");
5559
5560static PyObject*
5561unicode_lower(PyUnicodeObject *self)
5562{
5563    return fixup(self, fixlower);
5564}
5565
/* Strip modes used by the strip helpers below.  The values double as
   indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
/* Argument format strings handed to PyArg_ParseTuple, one per strip mode. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: the format string with its leading
   three-character "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
5574
5575static const Py_UNICODE *
5576unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
5577{
5578	size_t i;
5579	for (i = 0; i < n; ++i)
5580		if (s[i] == c)
5581			return s+i;
5582	return NULL;
5583}
5584
5585/* externally visible for str.strip(unicode) */
5586PyObject *
5587_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5588{
5589	Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5590	int len = PyUnicode_GET_SIZE(self);
5591	Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
5592	int seplen = PyUnicode_GET_SIZE(sepobj);
5593	int i, j;
5594
5595	i = 0;
5596	if (striptype != RIGHTSTRIP) {
5597		while (i < len && unicode_memchr(sep, s[i], seplen)) {
5598			i++;
5599		}
5600	}
5601
5602	j = len;
5603	if (striptype != LEFTSTRIP) {
5604		do {
5605			j--;
5606		} while (j >= i && unicode_memchr(sep, s[j], seplen));
5607		j++;
5608	}
5609
5610	if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5611		Py_INCREF(self);
5612		return (PyObject*)self;
5613	}
5614	else
5615		return PyUnicode_FromUnicode(s+i, j-i);
5616}
5617
5618
5619static PyObject *
5620do_strip(PyUnicodeObject *self, int striptype)
5621{
5622	Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5623	int len = PyUnicode_GET_SIZE(self), i, j;
5624
5625	i = 0;
5626	if (striptype != RIGHTSTRIP) {
5627		while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5628			i++;
5629		}
5630	}
5631
5632	j = len;
5633	if (striptype != LEFTSTRIP) {
5634		do {
5635			j--;
5636		} while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5637		j++;
5638	}
5639
5640	if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5641		Py_INCREF(self);
5642		return (PyObject*)self;
5643	}
5644	else
5645		return PyUnicode_FromUnicode(s+i, j-i);
5646}
5647
5648
5649static PyObject *
5650do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5651{
5652	PyObject *sep = NULL;
5653
5654	if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5655		return NULL;
5656
5657	if (sep != NULL && sep != Py_None) {
5658		if (PyUnicode_Check(sep))
5659			return _PyUnicode_XStrip(self, striptype, sep);
5660		else if (PyString_Check(sep)) {
5661			PyObject *res;
5662			sep = PyUnicode_FromObject(sep);
5663			if (sep==NULL)
5664				return NULL;
5665			res = _PyUnicode_XStrip(self, striptype, sep);
5666			Py_DECREF(sep);
5667			return res;
5668		}
5669		else {
5670			PyErr_Format(PyExc_TypeError,
5671				     "%s arg must be None, unicode or str",
5672				     STRIPNAME(striptype));
5673			return NULL;
5674		}
5675	}
5676
5677	return do_strip(self, striptype);
5678}
5679
5680
5681PyDoc_STRVAR(strip__doc__,
5682"S.strip([chars]) -> unicode\n\
5683\n\
5684Return a copy of the string S with leading and trailing\n\
5685whitespace removed.\n\
5686If chars is given and not None, remove characters in chars instead.\n\
5687If chars is a str, it will be converted to unicode before stripping");
5688
5689static PyObject *
5690unicode_strip(PyUnicodeObject *self, PyObject *args)
5691{
5692	if (PyTuple_GET_SIZE(args) == 0)
5693		return do_strip(self, BOTHSTRIP); /* Common case */
5694	else
5695		return do_argstrip(self, BOTHSTRIP, args);
5696}
5697
5698
5699PyDoc_STRVAR(lstrip__doc__,
5700"S.lstrip([chars]) -> unicode\n\
5701\n\
5702Return a copy of the string S with leading whitespace removed.\n\
5703If chars is given and not None, remove characters in chars instead.\n\
5704If chars is a str, it will be converted to unicode before stripping");
5705
5706static PyObject *
5707unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5708{
5709	if (PyTuple_GET_SIZE(args) == 0)
5710		return do_strip(self, LEFTSTRIP); /* Common case */
5711	else
5712		return do_argstrip(self, LEFTSTRIP, args);
5713}
5714
5715
5716PyDoc_STRVAR(rstrip__doc__,
5717"S.rstrip([chars]) -> unicode\n\
5718\n\
5719Return a copy of the string S with trailing whitespace removed.\n\
5720If chars is given and not None, remove characters in chars instead.\n\
5721If chars is a str, it will be converted to unicode before stripping");
5722
5723static PyObject *
5724unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5725{
5726	if (PyTuple_GET_SIZE(args) == 0)
5727		return do_strip(self, RIGHTSTRIP); /* Common case */
5728	else
5729		return do_argstrip(self, RIGHTSTRIP, args);
5730}
5731
5732
5733static PyObject*
5734unicode_repeat(PyUnicodeObject *str, int len)
5735{
5736    PyUnicodeObject *u;
5737    Py_UNICODE *p;
5738    int nchars;
5739    size_t nbytes;
5740
5741    if (len < 0)
5742        len = 0;
5743
5744    if (len == 1 && PyUnicode_CheckExact(str)) {
5745        /* no repeat, return original string */
5746        Py_INCREF(str);
5747        return (PyObject*) str;
5748    }
5749
5750    /* ensure # of chars needed doesn't overflow int and # of bytes
5751     * needed doesn't overflow size_t
5752     */
5753    nchars = len * str->length;
5754    if (len && nchars / len != str->length) {
5755        PyErr_SetString(PyExc_OverflowError,
5756                        "repeated string is too long");
5757        return NULL;
5758    }
5759    nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5760    if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5761        PyErr_SetString(PyExc_OverflowError,
5762                        "repeated string is too long");
5763        return NULL;
5764    }
5765    u = _PyUnicode_New(nchars);
5766    if (!u)
5767        return NULL;
5768
5769    p = u->str;
5770
5771    while (len-- > 0) {
5772        Py_UNICODE_COPY(p, str->str, str->length);
5773        p += str->length;
5774    }
5775
5776    return (PyObject*) u;
5777}
5778
5779PyObject *PyUnicode_Replace(PyObject *obj,
5780			    PyObject *subobj,
5781			    PyObject *replobj,
5782			    int maxcount)
5783{
5784    PyObject *self;
5785    PyObject *str1;
5786    PyObject *str2;
5787    PyObject *result;
5788
5789    self = PyUnicode_FromObject(obj);
5790    if (self == NULL)
5791	return NULL;
5792    str1 = PyUnicode_FromObject(subobj);
5793    if (str1 == NULL) {
5794	Py_DECREF(self);
5795	return NULL;
5796    }
5797    str2 = PyUnicode_FromObject(replobj);
5798    if (str2 == NULL) {
5799	Py_DECREF(self);
5800	Py_DECREF(str1);
5801	return NULL;
5802    }
5803    result = replace((PyUnicodeObject *)self,
5804		     (PyUnicodeObject *)str1,
5805		     (PyUnicodeObject *)str2,
5806		     maxcount);
5807    Py_DECREF(self);
5808    Py_DECREF(str1);
5809    Py_DECREF(str2);
5810    return result;
5811}
5812
5813PyDoc_STRVAR(replace__doc__,
5814"S.replace (old, new[, maxsplit]) -> unicode\n\
5815\n\
5816Return a copy of S with all occurrences of substring\n\
5817old replaced by new.  If the optional argument maxsplit is\n\
5818given, only the first maxsplit occurrences are replaced.");
5819
5820static PyObject*
5821unicode_replace(PyUnicodeObject *self, PyObject *args)
5822{
5823    PyUnicodeObject *str1;
5824    PyUnicodeObject *str2;
5825    int maxcount = -1;
5826    PyObject *result;
5827
5828    if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
5829        return NULL;
5830    str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5831    if (str1 == NULL)
5832	return NULL;
5833    str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
5834    if (str2 == NULL) {
5835	Py_DECREF(str1);
5836	return NULL;
5837    }
5838
5839    result = replace(self, str1, str2, maxcount);
5840
5841    Py_DECREF(str1);
5842    Py_DECREF(str2);
5843    return result;
5844}
5845
5846static
5847PyObject *unicode_repr(PyObject *unicode)
5848{
5849    return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5850				PyUnicode_GET_SIZE(unicode),
5851				1);
5852}
5853
5854PyDoc_STRVAR(rfind__doc__,
5855"S.rfind(sub [,start [,end]]) -> int\n\
5856\n\
5857Return the highest index in S where substring sub is found,\n\
5858such that sub is contained within s[start,end].  Optional\n\
5859arguments start and end are interpreted as in slice notation.\n\
5860\n\
5861Return -1 on failure.");
5862
5863static PyObject *
5864unicode_rfind(PyUnicodeObject *self, PyObject *args)
5865{
5866    PyUnicodeObject *substring;
5867    int start = 0;
5868    int end = INT_MAX;
5869    PyObject *result;
5870
5871    if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
5872		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5873        return NULL;
5874    substring = (PyUnicodeObject *)PyUnicode_FromObject(
5875						(PyObject *)substring);
5876    if (substring == NULL)
5877	return NULL;
5878
5879    result = PyInt_FromLong(findstring(self, substring, start, end, -1));
5880
5881    Py_DECREF(substring);
5882    return result;
5883}
5884
5885PyDoc_STRVAR(rindex__doc__,
5886"S.rindex(sub [,start [,end]]) -> int\n\
5887\n\
5888Like S.rfind() but raise ValueError when the substring is not found.");
5889
5890static PyObject *
5891unicode_rindex(PyUnicodeObject *self, PyObject *args)
5892{
5893    int result;
5894    PyUnicodeObject *substring;
5895    int start = 0;
5896    int end = INT_MAX;
5897
5898    if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
5899		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5900        return NULL;
5901    substring = (PyUnicodeObject *)PyUnicode_FromObject(
5902						(PyObject *)substring);
5903    if (substring == NULL)
5904	return NULL;
5905
5906    result = findstring(self, substring, start, end, -1);
5907
5908    Py_DECREF(substring);
5909    if (result < 0) {
5910        PyErr_SetString(PyExc_ValueError, "substring not found");
5911        return NULL;
5912    }
5913    return PyInt_FromLong(result);
5914}
5915
5916PyDoc_STRVAR(rjust__doc__,
5917"S.rjust(width[, fillchar]) -> unicode\n\
5918\n\
5919Return S right justified in a Unicode string of length width. Padding is\n\
5920done using the specified fill character (default is a space).");
5921
5922static PyObject *
5923unicode_rjust(PyUnicodeObject *self, PyObject *args)
5924{
5925    int width;
5926    Py_UNICODE fillchar = ' ';
5927
5928    if (!PyArg_ParseTuple(args, "i|O&:rjust", &width, convert_uc, &fillchar))
5929        return NULL;
5930
5931    if (self->length >= width && PyUnicode_CheckExact(self)) {
5932        Py_INCREF(self);
5933        return (PyObject*) self;
5934    }
5935
5936    return (PyObject*) pad(self, width - self->length, 0, fillchar);
5937}
5938
5939static PyObject*
5940unicode_slice(PyUnicodeObject *self, int start, int end)
5941{
5942    /* standard clamping */
5943    if (start < 0)
5944        start = 0;
5945    if (end < 0)
5946        end = 0;
5947    if (end > self->length)
5948        end = self->length;
5949    if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
5950        /* full slice, return original string */
5951        Py_INCREF(self);
5952        return (PyObject*) self;
5953    }
5954    if (start > end)
5955        start = end;
5956    /* copy slice */
5957    return (PyObject*) PyUnicode_FromUnicode(self->str + start,
5958					     end - start);
5959}
5960
5961PyObject *PyUnicode_Split(PyObject *s,
5962			  PyObject *sep,
5963			  int maxsplit)
5964{
5965    PyObject *result;
5966
5967    s = PyUnicode_FromObject(s);
5968    if (s == NULL)
5969	return NULL;
5970    if (sep != NULL) {
5971	sep = PyUnicode_FromObject(sep);
5972	if (sep == NULL) {
5973	    Py_DECREF(s);
5974	    return NULL;
5975	}
5976    }
5977
5978    result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
5979
5980    Py_DECREF(s);
5981    Py_XDECREF(sep);
5982    return result;
5983}
5984
5985PyDoc_STRVAR(split__doc__,
5986"S.split([sep [,maxsplit]]) -> list of strings\n\
5987\n\
5988Return a list of the words in S, using sep as the\n\
5989delimiter string.  If maxsplit is given, at most maxsplit\n\
5990splits are done. If sep is not specified or None,
5991any whitespace string is a separator.");
5992
5993static PyObject*
5994unicode_split(PyUnicodeObject *self, PyObject *args)
5995{
5996    PyObject *substring = Py_None;
5997    int maxcount = -1;
5998
5999    if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
6000        return NULL;
6001
6002    if (substring == Py_None)
6003	return split(self, NULL, maxcount);
6004    else if (PyUnicode_Check(substring))
6005	return split(self, (PyUnicodeObject *)substring, maxcount);
6006    else
6007	return PyUnicode_Split((PyObject *)self, substring, maxcount);
6008}
6009
6010PyObject *PyUnicode_RSplit(PyObject *s,
6011			   PyObject *sep,
6012			   int maxsplit)
6013{
6014    PyObject *result;
6015
6016    s = PyUnicode_FromObject(s);
6017    if (s == NULL)
6018	return NULL;
6019    if (sep != NULL) {
6020	sep = PyUnicode_FromObject(sep);
6021	if (sep == NULL) {
6022	    Py_DECREF(s);
6023	    return NULL;
6024	}
6025    }
6026
6027    result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6028
6029    Py_DECREF(s);
6030    Py_XDECREF(sep);
6031    return result;
6032}
6033
6034PyDoc_STRVAR(rsplit__doc__,
6035"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6036\n\
6037Return a list of the words in S, using sep as the\n\
6038delimiter string, starting at the end of the string and\n\
6039working to the front.  If maxsplit is given, at most maxsplit\n\
6040splits are done. If sep is not specified, any whitespace string\n\
6041is a separator.");
6042
6043static PyObject*
6044unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6045{
6046    PyObject *substring = Py_None;
6047    int maxcount = -1;
6048
6049    if (!PyArg_ParseTuple(args, "|Oi:rsplit", &substring, &maxcount))
6050        return NULL;
6051
6052    if (substring == Py_None)
6053	return rsplit(self, NULL, maxcount);
6054    else if (PyUnicode_Check(substring))
6055	return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6056    else
6057	return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6058}
6059
6060PyDoc_STRVAR(splitlines__doc__,
6061"S.splitlines([keepends]]) -> list of strings\n\
6062\n\
6063Return a list of the lines in S, breaking at line boundaries.\n\
6064Line breaks are not included in the resulting list unless keepends\n\
6065is given and true.");
6066
6067static PyObject*
6068unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6069{
6070    int keepends = 0;
6071
6072    if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
6073        return NULL;
6074
6075    return PyUnicode_Splitlines((PyObject *)self, keepends);
6076}
6077
6078static
6079PyObject *unicode_str(PyUnicodeObject *self)
6080{
6081    return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
6082}
6083
6084PyDoc_STRVAR(swapcase__doc__,
6085"S.swapcase() -> unicode\n\
6086\n\
6087Return a copy of S with uppercase characters converted to lowercase\n\
6088and vice versa.");
6089
6090static PyObject*
6091unicode_swapcase(PyUnicodeObject *self)
6092{
6093    return fixup(self, fixswapcase);
6094}
6095
6096PyDoc_STRVAR(translate__doc__,
6097"S.translate(table) -> unicode\n\
6098\n\
6099Return a copy of the string S, where all characters have been mapped\n\
6100through the given translation table, which must be a mapping of\n\
6101Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6102Unmapped characters are left untouched. Characters mapped to None\n\
6103are deleted.");
6104
6105static PyObject*
6106unicode_translate(PyUnicodeObject *self, PyObject *table)
6107{
6108    return PyUnicode_TranslateCharmap(self->str,
6109				      self->length,
6110				      table,
6111				      "ignore");
6112}
6113
6114PyDoc_STRVAR(upper__doc__,
6115"S.upper() -> unicode\n\
6116\n\
6117Return a copy of S converted to uppercase.");
6118
6119static PyObject*
6120unicode_upper(PyUnicodeObject *self)
6121{
6122    return fixup(self, fixupper);
6123}
6124
6125PyDoc_STRVAR(zfill__doc__,
6126"S.zfill(width) -> unicode\n\
6127\n\
6128Pad a numeric string x with zeros on the left, to fill a field\n\
6129of the specified width. The string x is never truncated.");
6130
6131static PyObject *
6132unicode_zfill(PyUnicodeObject *self, PyObject *args)
6133{
6134    int fill;
6135    PyUnicodeObject *u;
6136
6137    int width;
6138    if (!PyArg_ParseTuple(args, "i:zfill", &width))
6139        return NULL;
6140
6141    if (self->length >= width) {
6142        if (PyUnicode_CheckExact(self)) {
6143            Py_INCREF(self);
6144            return (PyObject*) self;
6145        }
6146        else
6147            return PyUnicode_FromUnicode(
6148                PyUnicode_AS_UNICODE(self),
6149                PyUnicode_GET_SIZE(self)
6150            );
6151    }
6152
6153    fill = width - self->length;
6154
6155    u = pad(self, fill, 0, '0');
6156
6157    if (u == NULL)
6158        return NULL;
6159
6160    if (u->str[fill] == '+' || u->str[fill] == '-') {
6161        /* move sign to beginning of string */
6162        u->str[0] = u->str[fill];
6163        u->str[fill] = '0';
6164    }
6165
6166    return (PyObject*) u;
6167}
6168
#if 0
/* Compiled out.  Would return unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyInt_FromLong(unicode_freelist_size);
}
#endif
6176
6177PyDoc_STRVAR(startswith__doc__,
6178"S.startswith(prefix[, start[, end]]) -> bool\n\
6179\n\
6180Return True if S starts with the specified prefix, False otherwise.\n\
6181With optional start, test S beginning at that position.\n\
6182With optional end, stop comparing S at that position.");
6183
6184static PyObject *
6185unicode_startswith(PyUnicodeObject *self,
6186		   PyObject *args)
6187{
6188    PyUnicodeObject *substring;
6189    int start = 0;
6190    int end = INT_MAX;
6191    PyObject *result;
6192
6193    if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
6194		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6195	return NULL;
6196    substring = (PyUnicodeObject *)PyUnicode_FromObject(
6197						(PyObject *)substring);
6198    if (substring == NULL)
6199	return NULL;
6200
6201    result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
6202
6203    Py_DECREF(substring);
6204    return result;
6205}
6206
6207
6208PyDoc_STRVAR(endswith__doc__,
6209"S.endswith(suffix[, start[, end]]) -> bool\n\
6210\n\
6211Return True if S ends with the specified suffix, False otherwise.\n\
6212With optional start, test S beginning at that position.\n\
6213With optional end, stop comparing S at that position.");
6214
6215static PyObject *
6216unicode_endswith(PyUnicodeObject *self,
6217		 PyObject *args)
6218{
6219    PyUnicodeObject *substring;
6220    int start = 0;
6221    int end = INT_MAX;
6222    PyObject *result;
6223
6224    if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
6225		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6226	return NULL;
6227    substring = (PyUnicodeObject *)PyUnicode_FromObject(
6228						(PyObject *)substring);
6229    if (substring == NULL)
6230	return NULL;
6231
6232    result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
6233
6234    Py_DECREF(substring);
6235    return result;
6236}
6237
6238
6239
6240static PyObject *
6241unicode_getnewargs(PyUnicodeObject *v)
6242{
6243	return Py_BuildValue("(u#)", v->str, v->length);
6244}
6245
6246
/* Method table for the unicode type.  Each entry gives the Python-level
   name, the C implementation, its calling-convention flag and (where
   present) its docstring.  The table ends with a {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
#endif

    /* Registered without a docstring. */
    {"__getnewargs__",	(PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
6302
6303static PyObject *
6304unicode_mod(PyObject *v, PyObject *w)
6305{
6306       if (!PyUnicode_Check(v)) {
6307               Py_INCREF(Py_NotImplemented);
6308               return Py_NotImplemented;
6309       }
6310       return PyUnicode_Format(v, w);
6311}
6312
/* Number protocol slots.  Only nb_remainder is filled in (with
   unicode_mod); the slots listed before it are 0 and the members after
   it are not listed in this initializer. */
static PyNumberMethods unicode_as_number = {
	0,				/*nb_add*/
	0,				/*nb_subtract*/
	0,				/*nb_multiply*/
	0,				/*nb_divide*/
	unicode_mod,			/*nb_remainder*/
};
6320
/* Sequence protocol slots.  The item/slice assignment slots are 0; the
   other slots are cast to the slot function types expected by
   PySequenceMethods. */
static PySequenceMethods unicode_as_sequence = {
    (inquiry) unicode_length, 		/* sq_length */
    (binaryfunc) PyUnicode_Concat, 	/* sq_concat */
    (intargfunc) unicode_repeat, 	/* sq_repeat */
    (intargfunc) unicode_getitem, 	/* sq_item */
    (intintargfunc) unicode_slice, 	/* sq_slice */
    0, 					/* sq_ass_item */
    0, 					/* sq_ass_slice */
    (objobjproc)PyUnicode_Contains, 	/*sq_contains*/
};
6331
6332static PyObject*
6333unicode_subscript(PyUnicodeObject* self, PyObject* item)
6334{
6335    if (PyInt_Check(item)) {
6336        long i = PyInt_AS_LONG(item);
6337        if (i < 0)
6338            i += PyString_GET_SIZE(self);
6339        return unicode_getitem(self, i);
6340    } else if (PyLong_Check(item)) {
6341        long i = PyLong_AsLong(item);
6342        if (i == -1 && PyErr_Occurred())
6343            return NULL;
6344        if (i < 0)
6345            i += PyString_GET_SIZE(self);
6346        return unicode_getitem(self, i);
6347    } else if (PySlice_Check(item)) {
6348        int start, stop, step, slicelength, cur, i;
6349        Py_UNICODE* source_buf;
6350        Py_UNICODE* result_buf;
6351        PyObject* result;
6352
6353        if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self),
6354				 &start, &stop, &step, &slicelength) < 0) {
6355            return NULL;
6356        }
6357
6358        if (slicelength <= 0) {
6359            return PyUnicode_FromUnicode(NULL, 0);
6360        } else {
6361            source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
6362            result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
6363
6364            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
6365                result_buf[i] = source_buf[cur];
6366            }
6367
6368            result = PyUnicode_FromUnicode(result_buf, slicelength);
6369            PyMem_FREE(result_buf);
6370            return result;
6371        }
6372    } else {
6373        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
6374        return NULL;
6375    }
6376}
6377
6378static PyMappingMethods unicode_as_mapping = {
6379    (inquiry)unicode_length,		/* mp_length */
6380    (binaryfunc)unicode_subscript,	/* mp_subscript */
6381    (objobjargproc)0,			/* mp_ass_subscript */
6382};
6383
6384static int
6385unicode_buffer_getreadbuf(PyUnicodeObject *self,
6386			  int index,
6387			  const void **ptr)
6388{
6389    if (index != 0) {
6390        PyErr_SetString(PyExc_SystemError,
6391			"accessing non-existent unicode segment");
6392        return -1;
6393    }
6394    *ptr = (void *) self->str;
6395    return PyUnicode_GET_DATA_SIZE(self);
6396}
6397
6398static int
6399unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
6400			   const void **ptr)
6401{
6402    PyErr_SetString(PyExc_TypeError,
6403		    "cannot use unicode as modifiable buffer");
6404    return -1;
6405}
6406
6407static int
6408unicode_buffer_getsegcount(PyUnicodeObject *self,
6409			   int *lenp)
6410{
6411    if (lenp)
6412        *lenp = PyUnicode_GET_DATA_SIZE(self);
6413    return 1;
6414}
6415
6416static int
6417unicode_buffer_getcharbuf(PyUnicodeObject *self,
6418			  int index,
6419			  const void **ptr)
6420{
6421    PyObject *str;
6422
6423    if (index != 0) {
6424        PyErr_SetString(PyExc_SystemError,
6425			"accessing non-existent unicode segment");
6426        return -1;
6427    }
6428    str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
6429    if (str == NULL)
6430	return -1;
6431    *ptr = (void *) PyString_AS_STRING(str);
6432    return PyString_GET_SIZE(str);
6433}
6434
6435/* Helpers for PyUnicode_Format() */
6436
6437static PyObject *
6438getnextarg(PyObject *args, int arglen, int *p_argidx)
6439{
6440    int argidx = *p_argidx;
6441    if (argidx < arglen) {
6442	(*p_argidx)++;
6443	if (arglen < 0)
6444	    return args;
6445	else
6446	    return PyTuple_GetItem(args, argidx);
6447    }
6448    PyErr_SetString(PyExc_TypeError,
6449		    "not enough arguments for format string");
6450    return NULL;
6451}
6452
/* Conversion flag bits, one per printf-style flag character. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
6458
6459static
6460int usprintf(register Py_UNICODE *buffer, char *format, ...)
6461{
6462    register int i;
6463    int len;
6464    va_list va;
6465    char *charbuffer;
6466    va_start(va, format);
6467
6468    /* First, format the string as char array, then expand to Py_UNICODE
6469       array. */
6470    charbuffer = (char *)buffer;
6471    len = vsprintf(charbuffer, format, va);
6472    for (i = len - 1; i >= 0; i--)
6473	buffer[i] = (Py_UNICODE) charbuffer[i];
6474
6475    va_end(va);
6476    return len;
6477}
6478
6479/* XXX To save some code duplication, formatfloat/long/int could have been
6480   shared with stringobject.c, converting from 8-bit to Unicode after the
6481   formatting is done. */
6482
6483static int
6484formatfloat(Py_UNICODE *buf,
6485	    size_t buflen,
6486	    int flags,
6487	    int prec,
6488	    int type,
6489	    PyObject *v)
6490{
6491    /* fmt = '%#.' + `prec` + `type`
6492       worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
6493    char fmt[20];
6494    double x;
6495
6496    x = PyFloat_AsDouble(v);
6497    if (x == -1.0 && PyErr_Occurred())
6498	return -1;
6499    if (prec < 0)
6500	prec = 6;
6501    if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6502	type = 'g';
6503    /* Worst case length calc to ensure no buffer overrun:
6504
6505       'g' formats:
6506	 fmt = %#.<prec>g
6507	 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6508	    for any double rep.)
6509	 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6510
6511       'f' formats:
6512	 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6513	 len = 1 + 50 + 1 + prec = 52 + prec
6514
6515       If prec=0 the effective precision is 1 (the leading digit is
6516       always given), therefore increase the length by one.
6517
6518    */
6519    if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6520	(type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
6521	PyErr_SetString(PyExc_OverflowError,
6522			"formatted float is too long (precision too large?)");
6523	return -1;
6524    }
6525    PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6526		  (flags&F_ALT) ? "#" : "",
6527		  prec, type);
6528    return usprintf(buf, fmt, x);
6529}
6530
6531static PyObject*
6532formatlong(PyObject *val, int flags, int prec, int type)
6533{
6534	char *buf;
6535	int i, len;
6536	PyObject *str; /* temporary string object. */
6537	PyUnicodeObject *result;
6538
6539	str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6540	if (!str)
6541		return NULL;
6542	result = _PyUnicode_New(len);
6543	for (i = 0; i < len; i++)
6544		result->str[i] = buf[i];
6545	result->str[len] = 0;
6546	Py_DECREF(str);
6547	return (PyObject*)result;
6548}
6549
6550static int
6551formatint(Py_UNICODE *buf,
6552	  size_t buflen,
6553	  int flags,
6554	  int prec,
6555	  int type,
6556	  PyObject *v)
6557{
6558    /* fmt = '%#.' + `prec` + 'l' + `type`
6559     * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6560     *                     + 1 + 1
6561     *                   = 24
6562     */
6563    char fmt[64]; /* plenty big enough! */
6564    char *sign;
6565    long x;
6566
6567    x = PyInt_AsLong(v);
6568    if (x == -1 && PyErr_Occurred())
6569        return -1;
6570    if (x < 0 && type == 'u') {
6571        type = 'd';
6572    }
6573    if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
6574        sign = "-";
6575    else
6576        sign = "";
6577    if (prec < 0)
6578        prec = 1;
6579
6580    /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
6581     * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
6582     */
6583    if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
6584        PyErr_SetString(PyExc_OverflowError,
6585    	        "formatted integer is too long (precision too large?)");
6586        return -1;
6587    }
6588
6589    if ((flags & F_ALT) &&
6590        (type == 'x' || type == 'X')) {
6591        /* When converting under %#x or %#X, there are a number
6592         * of issues that cause pain:
6593         * - when 0 is being converted, the C standard leaves off
6594         *   the '0x' or '0X', which is inconsistent with other
6595         *   %#x/%#X conversions and inconsistent with Python's
6596         *   hex() function
6597         * - there are platforms that violate the standard and
6598         *   convert 0 with the '0x' or '0X'
6599         *   (Metrowerks, Compaq Tru64)
6600         * - there are platforms that give '0x' when converting
6601         *   under %#X, but convert 0 in accordance with the
6602         *   standard (OS/2 EMX)
6603         *
6604         * We can achieve the desired consistency by inserting our
6605         * own '0x' or '0X' prefix, and substituting %x/%X in place
6606         * of %#x/%#X.
6607         *
6608         * Note that this is the same approach as used in
6609         * formatint() in stringobject.c
6610         */
6611        PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
6612                      sign, type, prec, type);
6613    }
6614    else {
6615        PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
6616                      sign, (flags&F_ALT) ? "#" : "",
6617                      prec, type);
6618    }
6619    if (sign[0])
6620        return usprintf(buf, fmt, -x);
6621    else
6622        return usprintf(buf, fmt, x);
6623}
6624
6625static int
6626formatchar(Py_UNICODE *buf,
6627           size_t buflen,
6628           PyObject *v)
6629{
6630    /* presume that the buffer is at least 2 characters long */
6631    if (PyUnicode_Check(v)) {
6632	if (PyUnicode_GET_SIZE(v) != 1)
6633	    goto onError;
6634	buf[0] = PyUnicode_AS_UNICODE(v)[0];
6635    }
6636
6637    else if (PyString_Check(v)) {
6638	if (PyString_GET_SIZE(v) != 1)
6639	    goto onError;
6640	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6641    }
6642
6643    else {
6644	/* Integer input truncated to a character */
6645        long x;
6646	x = PyInt_AsLong(v);
6647	if (x == -1 && PyErr_Occurred())
6648	    goto onError;
6649#ifdef Py_UNICODE_WIDE
6650	if (x < 0 || x > 0x10ffff) {
6651	    PyErr_SetString(PyExc_OverflowError,
6652			    "%c arg not in range(0x110000) "
6653			    "(wide Python build)");
6654	    return -1;
6655	}
6656#else
6657	if (x < 0 || x > 0xffff) {
6658	    PyErr_SetString(PyExc_OverflowError,
6659			    "%c arg not in range(0x10000) "
6660			    "(narrow Python build)");
6661	    return -1;
6662	}
6663#endif
6664	buf[0] = (Py_UNICODE) x;
6665    }
6666    buf[1] = '\0';
6667    return 1;
6668
6669 onError:
6670    PyErr_SetString(PyExc_TypeError,
6671		    "%c requires int or char");
6672    return -1;
6673}
6674
6675/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
6676
6677   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
6678   chars are formatted. XXX This is a magic number. Each formatting
6679   routine does bounds checking to ensure no overflow, but a better
6680   solution may be to malloc a buffer of appropriate size for each
6681   format. For now, the current solution is sufficient.
6682*/
6683#define FORMATBUFLEN (size_t)120
6684
6685PyObject *PyUnicode_Format(PyObject *format,
6686			   PyObject *args)
6687{
6688    Py_UNICODE *fmt, *res;
6689    int fmtcnt, rescnt, reslen, arglen, argidx;
6690    int args_owned = 0;
6691    PyUnicodeObject *result = NULL;
6692    PyObject *dict = NULL;
6693    PyObject *uformat;
6694
6695    if (format == NULL || args == NULL) {
6696	PyErr_BadInternalCall();
6697	return NULL;
6698    }
6699    uformat = PyUnicode_FromObject(format);
6700    if (uformat == NULL)
6701	return NULL;
6702    fmt = PyUnicode_AS_UNICODE(uformat);
6703    fmtcnt = PyUnicode_GET_SIZE(uformat);
6704
6705    reslen = rescnt = fmtcnt + 100;
6706    result = _PyUnicode_New(reslen);
6707    if (result == NULL)
6708	goto onError;
6709    res = PyUnicode_AS_UNICODE(result);
6710
6711    if (PyTuple_Check(args)) {
6712	arglen = PyTuple_Size(args);
6713	argidx = 0;
6714    }
6715    else {
6716	arglen = -1;
6717	argidx = -2;
6718    }
6719    if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6720        !PyObject_TypeCheck(args, &PyBaseString_Type))
6721	dict = args;
6722
6723    while (--fmtcnt >= 0) {
6724	if (*fmt != '%') {
6725	    if (--rescnt < 0) {
6726		rescnt = fmtcnt + 100;
6727		reslen += rescnt;
6728		if (_PyUnicode_Resize(&result, reslen) < 0)
6729		    return NULL;
6730		res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6731		--rescnt;
6732	    }
6733	    *res++ = *fmt++;
6734	}
6735	else {
6736	    /* Got a format specifier */
6737	    int flags = 0;
6738	    int width = -1;
6739	    int prec = -1;
6740	    Py_UNICODE c = '\0';
6741	    Py_UNICODE fill;
6742	    PyObject *v = NULL;
6743	    PyObject *temp = NULL;
6744	    Py_UNICODE *pbuf;
6745	    Py_UNICODE sign;
6746	    int len;
6747	    Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
6748
6749	    fmt++;
6750	    if (*fmt == '(') {
6751		Py_UNICODE *keystart;
6752		int keylen;
6753		PyObject *key;
6754		int pcount = 1;
6755
6756		if (dict == NULL) {
6757		    PyErr_SetString(PyExc_TypeError,
6758				    "format requires a mapping");
6759		    goto onError;
6760		}
6761		++fmt;
6762		--fmtcnt;
6763		keystart = fmt;
6764		/* Skip over balanced parentheses */
6765		while (pcount > 0 && --fmtcnt >= 0) {
6766		    if (*fmt == ')')
6767			--pcount;
6768		    else if (*fmt == '(')
6769			++pcount;
6770		    fmt++;
6771		}
6772		keylen = fmt - keystart - 1;
6773		if (fmtcnt < 0 || pcount > 0) {
6774		    PyErr_SetString(PyExc_ValueError,
6775				    "incomplete format key");
6776		    goto onError;
6777		}
6778#if 0
6779		/* keys are converted to strings using UTF-8 and
6780		   then looked up since Python uses strings to hold
6781		   variables names etc. in its namespaces and we
6782		   wouldn't want to break common idioms. */
6783		key = PyUnicode_EncodeUTF8(keystart,
6784					   keylen,
6785					   NULL);
6786#else
6787		key = PyUnicode_FromUnicode(keystart, keylen);
6788#endif
6789		if (key == NULL)
6790		    goto onError;
6791		if (args_owned) {
6792		    Py_DECREF(args);
6793		    args_owned = 0;
6794		}
6795		args = PyObject_GetItem(dict, key);
6796		Py_DECREF(key);
6797		if (args == NULL) {
6798		    goto onError;
6799		}
6800		args_owned = 1;
6801		arglen = -1;
6802		argidx = -2;
6803	    }
6804	    while (--fmtcnt >= 0) {
6805		switch (c = *fmt++) {
6806		case '-': flags |= F_LJUST; continue;
6807		case '+': flags |= F_SIGN; continue;
6808		case ' ': flags |= F_BLANK; continue;
6809		case '#': flags |= F_ALT; continue;
6810		case '0': flags |= F_ZERO; continue;
6811		}
6812		break;
6813	    }
6814	    if (c == '*') {
6815		v = getnextarg(args, arglen, &argidx);
6816		if (v == NULL)
6817		    goto onError;
6818		if (!PyInt_Check(v)) {
6819		    PyErr_SetString(PyExc_TypeError,
6820				    "* wants int");
6821		    goto onError;
6822		}
6823		width = PyInt_AsLong(v);
6824		if (width < 0) {
6825		    flags |= F_LJUST;
6826		    width = -width;
6827		}
6828		if (--fmtcnt >= 0)
6829		    c = *fmt++;
6830	    }
6831	    else if (c >= '0' && c <= '9') {
6832		width = c - '0';
6833		while (--fmtcnt >= 0) {
6834		    c = *fmt++;
6835		    if (c < '0' || c > '9')
6836			break;
6837		    if ((width*10) / 10 != width) {
6838			PyErr_SetString(PyExc_ValueError,
6839					"width too big");
6840			goto onError;
6841		    }
6842		    width = width*10 + (c - '0');
6843		}
6844	    }
6845	    if (c == '.') {
6846		prec = 0;
6847		if (--fmtcnt >= 0)
6848		    c = *fmt++;
6849		if (c == '*') {
6850		    v = getnextarg(args, arglen, &argidx);
6851		    if (v == NULL)
6852			goto onError;
6853		    if (!PyInt_Check(v)) {
6854			PyErr_SetString(PyExc_TypeError,
6855					"* wants int");
6856			goto onError;
6857		    }
6858		    prec = PyInt_AsLong(v);
6859		    if (prec < 0)
6860			prec = 0;
6861		    if (--fmtcnt >= 0)
6862			c = *fmt++;
6863		}
6864		else if (c >= '0' && c <= '9') {
6865		    prec = c - '0';
6866		    while (--fmtcnt >= 0) {
6867			c = Py_CHARMASK(*fmt++);
6868			if (c < '0' || c > '9')
6869			    break;
6870			if ((prec*10) / 10 != prec) {
6871			    PyErr_SetString(PyExc_ValueError,
6872					    "prec too big");
6873			    goto onError;
6874			}
6875			prec = prec*10 + (c - '0');
6876		    }
6877		}
6878	    } /* prec */
6879	    if (fmtcnt >= 0) {
6880		if (c == 'h' || c == 'l' || c == 'L') {
6881		    if (--fmtcnt >= 0)
6882			c = *fmt++;
6883		}
6884	    }
6885	    if (fmtcnt < 0) {
6886		PyErr_SetString(PyExc_ValueError,
6887				"incomplete format");
6888		goto onError;
6889	    }
6890	    if (c != '%') {
6891		v = getnextarg(args, arglen, &argidx);
6892		if (v == NULL)
6893		    goto onError;
6894	    }
6895	    sign = 0;
6896	    fill = ' ';
6897	    switch (c) {
6898
6899	    case '%':
6900		pbuf = formatbuf;
6901		/* presume that buffer length is at least 1 */
6902		pbuf[0] = '%';
6903		len = 1;
6904		break;
6905
6906	    case 's':
6907	    case 'r':
6908		if (PyUnicode_Check(v) && c == 's') {
6909		    temp = v;
6910		    Py_INCREF(temp);
6911		}
6912		else {
6913		    PyObject *unicode;
6914		    if (c == 's')
6915			temp = PyObject_Unicode(v);
6916		    else
6917			temp = PyObject_Repr(v);
6918		    if (temp == NULL)
6919			goto onError;
6920                    if (PyUnicode_Check(temp))
6921                        /* nothing to do */;
6922                    else if (PyString_Check(temp)) {
6923                        /* convert to string to Unicode */
6924		    unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
6925						   PyString_GET_SIZE(temp),
6926					       NULL,
6927						   "strict");
6928		    Py_DECREF(temp);
6929		    temp = unicode;
6930		    if (temp == NULL)
6931			goto onError;
6932		}
6933		    else {
6934			Py_DECREF(temp);
6935			PyErr_SetString(PyExc_TypeError,
6936					"%s argument has non-string str()");
6937			goto onError;
6938		    }
6939		}
6940		pbuf = PyUnicode_AS_UNICODE(temp);
6941		len = PyUnicode_GET_SIZE(temp);
6942		if (prec >= 0 && len > prec)
6943		    len = prec;
6944		break;
6945
6946	    case 'i':
6947	    case 'd':
6948	    case 'u':
6949	    case 'o':
6950	    case 'x':
6951	    case 'X':
6952		if (c == 'i')
6953		    c = 'd';
6954		if (PyLong_Check(v)) {
6955		    temp = formatlong(v, flags, prec, c);
6956		    if (!temp)
6957			goto onError;
6958		    pbuf = PyUnicode_AS_UNICODE(temp);
6959		    len = PyUnicode_GET_SIZE(temp);
6960		    sign = 1;
6961		}
6962		else {
6963		    pbuf = formatbuf;
6964		    len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6965				    flags, prec, c, v);
6966		    if (len < 0)
6967			goto onError;
6968		    sign = 1;
6969		}
6970		if (flags & F_ZERO)
6971		    fill = '0';
6972		break;
6973
6974	    case 'e':
6975	    case 'E':
6976	    case 'f':
6977	    case 'F':
6978	    case 'g':
6979	    case 'G':
6980		if (c == 'F')
6981			c = 'f';
6982		pbuf = formatbuf;
6983		len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6984			flags, prec, c, v);
6985		if (len < 0)
6986		    goto onError;
6987		sign = 1;
6988		if (flags & F_ZERO)
6989		    fill = '0';
6990		break;
6991
6992	    case 'c':
6993		pbuf = formatbuf;
6994		len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
6995		if (len < 0)
6996		    goto onError;
6997		break;
6998
6999	    default:
7000		PyErr_Format(PyExc_ValueError,
7001			     "unsupported format character '%c' (0x%x) "
7002			     "at index %i",
7003			     (31<=c && c<=126) ? (char)c : '?',
7004                             (int)c,
7005			     (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
7006		goto onError;
7007	    }
7008	    if (sign) {
7009		if (*pbuf == '-' || *pbuf == '+') {
7010		    sign = *pbuf++;
7011		    len--;
7012		}
7013		else if (flags & F_SIGN)
7014		    sign = '+';
7015		else if (flags & F_BLANK)
7016		    sign = ' ';
7017		else
7018		    sign = 0;
7019	    }
7020	    if (width < len)
7021		width = len;
7022	    if (rescnt - (sign != 0) < width) {
7023		reslen -= rescnt;
7024		rescnt = width + fmtcnt + 100;
7025		reslen += rescnt;
7026		if (reslen < 0) {
7027		    Py_DECREF(result);
7028		    return PyErr_NoMemory();
7029		}
7030		if (_PyUnicode_Resize(&result, reslen) < 0)
7031		    return NULL;
7032		res = PyUnicode_AS_UNICODE(result)
7033		    + reslen - rescnt;
7034	    }
7035	    if (sign) {
7036		if (fill != ' ')
7037		    *res++ = sign;
7038		rescnt--;
7039		if (width > len)
7040		    width--;
7041	    }
7042	    if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7043		assert(pbuf[0] == '0');
7044		assert(pbuf[1] == c);
7045		if (fill != ' ') {
7046		    *res++ = *pbuf++;
7047		    *res++ = *pbuf++;
7048		}
7049		rescnt -= 2;
7050		width -= 2;
7051		if (width < 0)
7052		    width = 0;
7053		len -= 2;
7054	    }
7055	    if (width > len && !(flags & F_LJUST)) {
7056		do {
7057		    --rescnt;
7058		    *res++ = fill;
7059		} while (--width > len);
7060	    }
7061	    if (fill == ' ') {
7062		if (sign)
7063		    *res++ = sign;
7064		if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7065		    assert(pbuf[0] == '0');
7066		    assert(pbuf[1] == c);
7067		    *res++ = *pbuf++;
7068		    *res++ = *pbuf++;
7069		}
7070	    }
7071	    Py_UNICODE_COPY(res, pbuf, len);
7072	    res += len;
7073	    rescnt -= len;
7074	    while (--width >= len) {
7075		--rescnt;
7076		*res++ = ' ';
7077	    }
7078	    if (dict && (argidx < arglen) && c != '%') {
7079		PyErr_SetString(PyExc_TypeError,
7080				"not all arguments converted during string formatting");
7081		goto onError;
7082	    }
7083	    Py_XDECREF(temp);
7084	} /* '%' */
7085    } /* until end */
7086    if (argidx < arglen && !dict) {
7087	PyErr_SetString(PyExc_TypeError,
7088			"not all arguments converted during string formatting");
7089	goto onError;
7090    }
7091
7092    if (args_owned) {
7093	Py_DECREF(args);
7094    }
7095    Py_DECREF(uformat);
7096    if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
7097	goto onError;
7098    return (PyObject *)result;
7099
7100 onError:
7101    Py_XDECREF(result);
7102    Py_DECREF(uformat);
7103    if (args_owned) {
7104	Py_DECREF(args);
7105    }
7106    return NULL;
7107}
7108
7109static PyBufferProcs unicode_as_buffer = {
7110    (getreadbufferproc) unicode_buffer_getreadbuf,
7111    (getwritebufferproc) unicode_buffer_getwritebuf,
7112    (getsegcountproc) unicode_buffer_getsegcount,
7113    (getcharbufferproc) unicode_buffer_getcharbuf,
7114};
7115
7116static PyObject *
7117unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7118
7119static PyObject *
7120unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7121{
7122        PyObject *x = NULL;
7123	static char *kwlist[] = {"string", "encoding", "errors", 0};
7124	char *encoding = NULL;
7125	char *errors = NULL;
7126
7127	if (type != &PyUnicode_Type)
7128		return unicode_subtype_new(type, args, kwds);
7129	if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7130					  kwlist, &x, &encoding, &errors))
7131	    return NULL;
7132	if (x == NULL)
7133		return (PyObject *)_PyUnicode_New(0);
7134	if (encoding == NULL && errors == NULL)
7135	    return PyObject_Unicode(x);
7136	else
7137	return PyUnicode_FromEncodedObject(x, encoding, errors);
7138}
7139
7140static PyObject *
7141unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7142{
7143	PyUnicodeObject *tmp, *pnew;
7144	int n;
7145
7146	assert(PyType_IsSubtype(type, &PyUnicode_Type));
7147	tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7148	if (tmp == NULL)
7149		return NULL;
7150	assert(PyUnicode_Check(tmp));
7151	pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
7152	if (pnew == NULL) {
7153		Py_DECREF(tmp);
7154		return NULL;
7155	}
7156	pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7157	if (pnew->str == NULL) {
7158		_Py_ForgetReference((PyObject *)pnew);
7159		PyObject_Del(pnew);
7160		Py_DECREF(tmp);
7161		return PyErr_NoMemory();
7162	}
7163	Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7164	pnew->length = n;
7165	pnew->hash = tmp->hash;
7166	Py_DECREF(tmp);
7167	return (PyObject *)pnew;
7168}
7169
7170PyDoc_STRVAR(unicode_doc,
7171"unicode(string [, encoding[, errors]]) -> object\n\
7172\n\
7173Create a new Unicode object from the given encoded string.\n\
7174encoding defaults to the current default string encoding.\n\
7175errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
7176
7177PyTypeObject PyUnicode_Type = {
7178    PyObject_HEAD_INIT(&PyType_Type)
7179    0, 					/* ob_size */
7180    "unicode", 				/* tp_name */
7181    sizeof(PyUnicodeObject), 		/* tp_size */
7182    0, 					/* tp_itemsize */
7183    /* Slots */
7184    (destructor)unicode_dealloc, 	/* tp_dealloc */
7185    0, 					/* tp_print */
7186    0,				 	/* tp_getattr */
7187    0, 					/* tp_setattr */
7188    (cmpfunc) unicode_compare, 		/* tp_compare */
7189    (reprfunc) unicode_repr, 		/* tp_repr */
7190    &unicode_as_number, 		/* tp_as_number */
7191    &unicode_as_sequence, 		/* tp_as_sequence */
7192    &unicode_as_mapping, 		/* tp_as_mapping */
7193    (hashfunc) unicode_hash, 		/* tp_hash*/
7194    0, 					/* tp_call*/
7195    (reprfunc) unicode_str,	 	/* tp_str */
7196    PyObject_GenericGetAttr, 		/* tp_getattro */
7197    0,			 		/* tp_setattro */
7198    &unicode_as_buffer,			/* tp_as_buffer */
7199    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
7200	    Py_TPFLAGS_BASETYPE,	/* tp_flags */
7201    unicode_doc,			/* tp_doc */
7202    0,					/* tp_traverse */
7203    0,					/* tp_clear */
7204    0,					/* tp_richcompare */
7205    0,					/* tp_weaklistoffset */
7206    0,					/* tp_iter */
7207    0,					/* tp_iternext */
7208    unicode_methods,			/* tp_methods */
7209    0,					/* tp_members */
7210    0,					/* tp_getset */
7211    &PyBaseString_Type,			/* tp_base */
7212    0,					/* tp_dict */
7213    0,					/* tp_descr_get */
7214    0,					/* tp_descr_set */
7215    0,					/* tp_dictoffset */
7216    0,					/* tp_init */
7217    0,					/* tp_alloc */
7218    unicode_new,			/* tp_new */
7219    PyObject_Del,      		/* tp_free */
7220};
7221
7222/* Initialize the Unicode implementation */
7223
7224void _PyUnicode_Init(void)
7225{
7226    int i;
7227
7228    /* Init the implementation */
7229    unicode_freelist = NULL;
7230    unicode_freelist_size = 0;
7231    unicode_empty = _PyUnicode_New(0);
7232    strcpy(unicode_default_encoding, "ascii");
7233    for (i = 0; i < 256; i++)
7234	unicode_latin1[i] = NULL;
7235    if (PyType_Ready(&PyUnicode_Type) < 0)
7236	Py_FatalError("Can't initialize 'unicode'");
7237}
7238
7239/* Finalize the Unicode implementation */
7240
7241void
7242_PyUnicode_Fini(void)
7243{
7244    PyUnicodeObject *u;
7245    int i;
7246
7247    Py_XDECREF(unicode_empty);
7248    unicode_empty = NULL;
7249
7250    for (i = 0; i < 256; i++) {
7251	if (unicode_latin1[i]) {
7252	    Py_DECREF(unicode_latin1[i]);
7253	    unicode_latin1[i] = NULL;
7254	}
7255    }
7256
7257    for (u = unicode_freelist; u != NULL;) {
7258	PyUnicodeObject *v = u;
7259	u = *(PyUnicodeObject **)u;
7260	if (v->str)
7261	    PyMem_DEL(v->str);
7262	Py_XDECREF(v->defenc);
7263	PyObject_Del(v);
7264    }
7265    unicode_freelist = NULL;
7266    unicode_freelist_size = 0;
7267}
7268
7269/*
7270Local variables:
7271c-basic-offset: 4
7272indent-tabs-mode: nil
7273End:
7274*/
7275