unicodeobject.c revision 8879a33613b33b32bda146a4da1a71d712a684d2
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5Unicode Integration Proposal (see file Misc/unicode.txt).
6
7Copyright (c) Corporation for National Research Initiatives.
8
9--------------------------------------------------------------------
10The original string type implementation is:
11
12    Copyright (c) 1999 by Secret Labs AB
13    Copyright (c) 1999 by Fredrik Lundh
14
15By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
38
39#include "Python.h"
40
41#include "unicodeobject.h"
42#include "ucnhash.h"
43
44#ifdef MS_WIN32
45#include <windows.h>
46#endif
47
/* Upper bound on the number of Unicode objects kept on the free list. */

#define MAX_UNICODE_FREELIST_SIZE       1024

/* Keep-alive threshold for objects parked on the free list.

   A freed object whose length is below this limit keeps its
   character buffer while it sits on the free list, which saves a
   malloc() when the object is recycled.  Objects at or above the
   limit have their buffer released before being parked.

   Worst case this leaves MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused memory allocated.

   A limit of 0 effectively disables the optimization.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; little endian is assumed unless the build
   defines WORDS_BIGENDIAN. */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
78
79/* --- Globals ------------------------------------------------------------
80
81   The globals are initialized by the _PyUnicode_Init() API and should
82   not be used before calling that API.
83
84*/
85
86/* Free list for Unicode objects */
87static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
89
90/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94   shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
97/* Default encoding to use and assume when NULL is passed as encoding
98   parameter; it is initialized by _PyUnicode_Init().
99
100   Always use the PyUnicode_SetDefaultEncoding() and
101   PyUnicode_GetDefaultEncoding() APIs to access this global.
102
103*/
104static char unicode_default_encoding[100];
105
106/* --- Unicode Object ----------------------------------------------------- */
107
108static
109int unicode_resize(register PyUnicodeObject *unicode,
110                      int length)
111{
112    void *oldstr;
113
114    /* Shortcut if there's nothing much to do. */
115    if (unicode->length == length)
116	goto reset;
117
118    /* Resizing shared object (unicode_empty or single character
119       objects) in-place is not allowed. Use PyUnicode_Resize()
120       instead ! */
121    if (unicode == unicode_empty ||
122	(unicode->length == 1 &&
123	 unicode->str[0] < 256 &&
124	 unicode_latin1[unicode->str[0]] == unicode)) {
125        PyErr_SetString(PyExc_SystemError,
126                        "can't resize shared unicode objects");
127        return -1;
128    }
129
130    /* We allocate one more byte to make sure the string is
131       Ux0000 terminated -- XXX is this needed ? */
132    oldstr = unicode->str;
133    PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
134    if (!unicode->str) {
135	unicode->str = oldstr;
136        PyErr_NoMemory();
137        return -1;
138    }
139    unicode->str[length] = 0;
140    unicode->length = length;
141
142 reset:
143    /* Reset the object caches */
144    if (unicode->defenc) {
145        Py_DECREF(unicode->defenc);
146        unicode->defenc = NULL;
147    }
148    unicode->hash = -1;
149
150    return 0;
151}
152
153/* We allocate one more byte to make sure the string is
154   Ux0000 terminated -- XXX is this needed ?
155
156   XXX This allocator could further be enhanced by assuring that the
157       free list never reduces its size below 1.
158
159*/
160
161static
162PyUnicodeObject *_PyUnicode_New(int length)
163{
164    register PyUnicodeObject *unicode;
165
166    /* Optimization for empty strings */
167    if (length == 0 && unicode_empty != NULL) {
168        Py_INCREF(unicode_empty);
169        return unicode_empty;
170    }
171
172    /* Unicode freelist & memory allocation */
173    if (unicode_freelist) {
174        unicode = unicode_freelist;
175        unicode_freelist = *(PyUnicodeObject **)unicode;
176        unicode_freelist_size--;
177	if (unicode->str) {
178	    /* Keep-Alive optimization: we only upsize the buffer,
179	       never downsize it. */
180	    if ((unicode->length < length) &&
181		unicode_resize(unicode, length)) {
182		PyMem_DEL(unicode->str);
183		goto onError;
184	    }
185	}
186      else {
187	    unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
188      }
189      PyObject_INIT(unicode, &PyUnicode_Type);
190    }
191    else {
192        unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
193        if (unicode == NULL)
194            return NULL;
195	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
196    }
197
198    if (!unicode->str) {
199	PyErr_NoMemory();
200	goto onError;
201    }
202    unicode->str[length] = 0;
203    unicode->length = length;
204    unicode->hash = -1;
205    unicode->defenc = NULL;
206    return unicode;
207
208 onError:
209    _Py_ForgetReference((PyObject *)unicode);
210    PyObject_DEL(unicode);
211    return NULL;
212}
213
214static
215void _PyUnicode_Free(register PyUnicodeObject *unicode)
216{
217    if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
218        /* Keep-Alive optimization */
219	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
220	    PyMem_DEL(unicode->str);
221	    unicode->str = NULL;
222	    unicode->length = 0;
223	}
224	if (unicode->defenc) {
225	    Py_DECREF(unicode->defenc);
226	    unicode->defenc = NULL;
227	}
228	/* Add to free list */
229        *(PyUnicodeObject **)unicode = unicode_freelist;
230        unicode_freelist = unicode;
231        unicode_freelist_size++;
232    }
233    else {
234	PyMem_DEL(unicode->str);
235	Py_XDECREF(unicode->defenc);
236	PyObject_DEL(unicode);
237    }
238}
239
240int PyUnicode_Resize(PyObject **unicode,
241		     int length)
242{
243    register PyUnicodeObject *v;
244
245    /* Argument checks */
246    if (unicode == NULL) {
247	PyErr_BadInternalCall();
248	return -1;
249    }
250    v = (PyUnicodeObject *)*unicode;
251    if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
252	PyErr_BadInternalCall();
253	return -1;
254    }
255
256    /* Resizing unicode_empty and single character objects is not
257       possible since these are being shared. We simply return a fresh
258       copy with the same Unicode content. */
259    if (v->length != length &&
260	(v == unicode_empty || v->length == 1)) {
261	PyUnicodeObject *w = _PyUnicode_New(length);
262	if (w == NULL)
263	    return -1;
264	Py_UNICODE_COPY(w->str, v->str,
265			length < v->length ? length : v->length);
266	*unicode = (PyObject *)w;
267	return 0;
268    }
269
270    /* Note that we don't have to modify *unicode for unshared Unicode
271       objects, since we can modify them in-place. */
272    return unicode_resize(v, length);
273}
274
275/* Internal API for use in unicodeobject.c only ! */
276#define _PyUnicode_Resize(unicodevar, length) \
277        PyUnicode_Resize(((PyObject **)(unicodevar)), length)
278
279PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
280				int size)
281{
282    PyUnicodeObject *unicode;
283
284    /* If the Unicode data is known at construction time, we can apply
285       some optimizations which share commonly used objects. */
286    if (u != NULL) {
287
288	/* Optimization for empty strings */
289	if (size == 0 && unicode_empty != NULL) {
290	    Py_INCREF(unicode_empty);
291	    return (PyObject *)unicode_empty;
292	}
293
294	/* Single character Unicode objects in the Latin-1 range are
295	   shared when using this constructor */
296	if (size == 1 && *u < 256) {
297	    unicode = unicode_latin1[*u];
298	    if (!unicode) {
299		unicode = _PyUnicode_New(1);
300		if (!unicode)
301		    return NULL;
302		unicode->str[0] = *u;
303		unicode_latin1[*u] = unicode;
304	    }
305	    Py_INCREF(unicode);
306	    return (PyObject *)unicode;
307	}
308    }
309
310    unicode = _PyUnicode_New(size);
311    if (!unicode)
312        return NULL;
313
314    /* Copy the Unicode data into the new object */
315    if (u != NULL)
316	Py_UNICODE_COPY(unicode->str, u, size);
317
318    return (PyObject *)unicode;
319}
320
321#ifdef HAVE_WCHAR_H
322
323PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
324				 int size)
325{
326    PyUnicodeObject *unicode;
327
328    if (w == NULL) {
329	PyErr_BadInternalCall();
330	return NULL;
331    }
332
333    unicode = _PyUnicode_New(size);
334    if (!unicode)
335        return NULL;
336
337    /* Copy the wchar_t data into the new object */
338#ifdef HAVE_USABLE_WCHAR_T
339    memcpy(unicode->str, w, size * sizeof(wchar_t));
340#else
341    {
342	register Py_UNICODE *u;
343	register int i;
344	u = PyUnicode_AS_UNICODE(unicode);
345	for (i = size; i >= 0; i--)
346	    *u++ = *w++;
347    }
348#endif
349
350    return (PyObject *)unicode;
351}
352
353int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
354			 register wchar_t *w,
355			 int size)
356{
357    if (unicode == NULL) {
358	PyErr_BadInternalCall();
359	return -1;
360    }
361    if (size > PyUnicode_GET_SIZE(unicode))
362	size = PyUnicode_GET_SIZE(unicode);
363#ifdef HAVE_USABLE_WCHAR_T
364    memcpy(w, unicode->str, size * sizeof(wchar_t));
365#else
366    {
367	register Py_UNICODE *u;
368	register int i;
369	u = PyUnicode_AS_UNICODE(unicode);
370	for (i = size; i >= 0; i--)
371	    *w++ = *u++;
372    }
373#endif
374
375    return size;
376}
377
378#endif
379
380PyObject *PyUnicode_FromObject(register PyObject *obj)
381{
382    return PyUnicode_FromEncodedObject(obj, NULL, "strict");
383}
384
385PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
386				      const char *encoding,
387				      const char *errors)
388{
389    const char *s;
390    int len;
391    int owned = 0;
392    PyObject *v;
393
394    if (obj == NULL) {
395	PyErr_BadInternalCall();
396	return NULL;
397    }
398
399    /* Coerce object */
400    if (PyInstance_Check(obj)) {
401	PyObject *func;
402	func = PyObject_GetAttrString(obj, "__str__");
403	if (func == NULL) {
404	    PyErr_SetString(PyExc_TypeError,
405		  "coercing to Unicode: instance doesn't define __str__");
406	    return NULL;
407	}
408	obj = PyEval_CallObject(func, NULL);
409	Py_DECREF(func);
410	if (obj == NULL)
411	    return NULL;
412	owned = 1;
413    }
414    if (PyUnicode_Check(obj)) {
415	Py_INCREF(obj);
416	v = obj;
417	if (encoding) {
418	    PyErr_SetString(PyExc_TypeError,
419			    "decoding Unicode is not supported");
420	    return NULL;
421	}
422	goto done;
423    }
424    else if (PyString_Check(obj)) {
425	s = PyString_AS_STRING(obj);
426	len = PyString_GET_SIZE(obj);
427    }
428    else if (PyObject_AsCharBuffer(obj, &s, &len)) {
429	/* Overwrite the error message with something more useful in
430	   case of a TypeError. */
431	if (PyErr_ExceptionMatches(PyExc_TypeError))
432	    PyErr_Format(PyExc_TypeError,
433			 "coercing to Unicode: need string or buffer, "
434			 "%.80s found",
435			 obj->ob_type->tp_name);
436	goto onError;
437    }
438
439    /* Convert to Unicode */
440    if (len == 0) {
441	Py_INCREF(unicode_empty);
442	v = (PyObject *)unicode_empty;
443    }
444    else
445	v = PyUnicode_Decode(s, len, encoding, errors);
446
447 done:
448    if (owned) {
449	Py_DECREF(obj);
450    }
451    return v;
452
453 onError:
454    if (owned) {
455	Py_DECREF(obj);
456    }
457    return NULL;
458}
459
460PyObject *PyUnicode_Decode(const char *s,
461			   int size,
462			   const char *encoding,
463			   const char *errors)
464{
465    PyObject *buffer = NULL, *unicode;
466
467    if (encoding == NULL)
468	encoding = PyUnicode_GetDefaultEncoding();
469
470    /* Shortcuts for common default encodings */
471    if (strcmp(encoding, "utf-8") == 0)
472        return PyUnicode_DecodeUTF8(s, size, errors);
473    else if (strcmp(encoding, "latin-1") == 0)
474        return PyUnicode_DecodeLatin1(s, size, errors);
475    else if (strcmp(encoding, "ascii") == 0)
476        return PyUnicode_DecodeASCII(s, size, errors);
477
478    /* Decode via the codec registry */
479    buffer = PyBuffer_FromMemory((void *)s, size);
480    if (buffer == NULL)
481        goto onError;
482    unicode = PyCodec_Decode(buffer, encoding, errors);
483    if (unicode == NULL)
484        goto onError;
485    if (!PyUnicode_Check(unicode)) {
486        PyErr_Format(PyExc_TypeError,
487                     "decoder did not return an unicode object (type=%.400s)",
488                     unicode->ob_type->tp_name);
489        Py_DECREF(unicode);
490        goto onError;
491    }
492    Py_DECREF(buffer);
493    return unicode;
494
495 onError:
496    Py_XDECREF(buffer);
497    return NULL;
498}
499
500PyObject *PyUnicode_Encode(const Py_UNICODE *s,
501			   int size,
502			   const char *encoding,
503			   const char *errors)
504{
505    PyObject *v, *unicode;
506
507    unicode = PyUnicode_FromUnicode(s, size);
508    if (unicode == NULL)
509	return NULL;
510    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
511    Py_DECREF(unicode);
512    return v;
513}
514
515PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
516                                    const char *encoding,
517                                    const char *errors)
518{
519    PyObject *v;
520
521    if (!PyUnicode_Check(unicode)) {
522        PyErr_BadArgument();
523        goto onError;
524    }
525
526    if (encoding == NULL)
527	encoding = PyUnicode_GetDefaultEncoding();
528
529    /* Shortcuts for common default encodings */
530    if (errors == NULL) {
531	if (strcmp(encoding, "utf-8") == 0)
532	    return PyUnicode_AsUTF8String(unicode);
533	else if (strcmp(encoding, "latin-1") == 0)
534	    return PyUnicode_AsLatin1String(unicode);
535	else if (strcmp(encoding, "ascii") == 0)
536	    return PyUnicode_AsASCIIString(unicode);
537    }
538
539    /* Encode via the codec registry */
540    v = PyCodec_Encode(unicode, encoding, errors);
541    if (v == NULL)
542        goto onError;
543    /* XXX Should we really enforce this ? */
544    if (!PyString_Check(v)) {
545        PyErr_Format(PyExc_TypeError,
546                     "encoder did not return a string object (type=%.400s)",
547                     v->ob_type->tp_name);
548        Py_DECREF(v);
549        goto onError;
550    }
551    return v;
552
553 onError:
554    return NULL;
555}
556
557/* Return a Python string holding the default encoded value of the
558   Unicode object.
559
560   The resulting string is cached in the Unicode object for subsequent
561   usage by this function. The cached version is needed to implement
562   the character buffer interface and will live (at least) as long as
563   the Unicode object itself.
564
565   The refcount of the string is *not* incremented.
566
567   *** Exported for internal use by the interpreter only !!! ***
568
569*/
570
571PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
572					    const char *errors)
573{
574    PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
575
576    if (v)
577        return v;
578    v = PyUnicode_AsEncodedString(unicode, NULL, errors);
579    if (v && errors == NULL)
580        ((PyUnicodeObject *)unicode)->defenc = v;
581    return v;
582}
583
584Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
585{
586    if (!PyUnicode_Check(unicode)) {
587        PyErr_BadArgument();
588        goto onError;
589    }
590    return PyUnicode_AS_UNICODE(unicode);
591
592 onError:
593    return NULL;
594}
595
596int PyUnicode_GetSize(PyObject *unicode)
597{
598    if (!PyUnicode_Check(unicode)) {
599        PyErr_BadArgument();
600        goto onError;
601    }
602    return PyUnicode_GET_SIZE(unicode);
603
604 onError:
605    return -1;
606}
607
608const char *PyUnicode_GetDefaultEncoding(void)
609{
610    return unicode_default_encoding;
611}
612
613int PyUnicode_SetDefaultEncoding(const char *encoding)
614{
615    PyObject *v;
616
617    /* Make sure the encoding is valid. As side effect, this also
618       loads the encoding into the codec registry cache. */
619    v = _PyCodec_Lookup(encoding);
620    if (v == NULL)
621	goto onError;
622    Py_DECREF(v);
623    strncpy(unicode_default_encoding,
624	    encoding,
625	    sizeof(unicode_default_encoding));
626    return 0;
627
628 onError:
629    return -1;
630}
631
632/* --- UTF-8 Codec -------------------------------------------------------- */
633
/* Map a UTF-8 lead byte to the length of the sequence it starts.
   A zero entry marks a byte that is not a legal lead byte.
   See RFC 2279 for details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte characters */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    /* 0x00 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    /* 0x10 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    /* 0x20 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    /* 0x30 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    /* 0x40 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    /* 0x50 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    /* 0x60 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    /* 0x70 */
    /* 0x80 - 0xBF: continuation bytes, never a lead byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* 0x80 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* 0x90 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* 0xA0 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* 0xB0 */
    /* 0xC0 - 0xDF: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,    /* 0xC0 */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,    /* 0xD0 */
    /* 0xE0 - 0xEF: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,    /* 0xE0 */
    /* 0xF0 - 0xFD: four, five and six byte sequences;
       0xFE and 0xFF are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0     /* 0xF0 */
};
655
656static
657int utf8_decoding_error(const char **source,
658                        Py_UNICODE **dest,
659                        const char *errors,
660                        const char *details)
661{
662    if ((errors == NULL) ||
663        (strcmp(errors,"strict") == 0)) {
664        PyErr_Format(PyExc_UnicodeError,
665                     "UTF-8 decoding error: %.400s",
666                     details);
667        return -1;
668    }
669    else if (strcmp(errors,"ignore") == 0) {
670        (*source)++;
671        return 0;
672    }
673    else if (strcmp(errors,"replace") == 0) {
674        (*source)++;
675        **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
676        (*dest)++;
677        return 0;
678    }
679    else {
680        PyErr_Format(PyExc_ValueError,
681                     "UTF-8 decoding error; unknown error handling code: %.400s",
682                     errors);
683        return -1;
684    }
685}
686
687PyObject *PyUnicode_DecodeUTF8(const char *s,
688			       int size,
689			       const char *errors)
690{
691    int n;
692    const char *e;
693    PyUnicodeObject *unicode;
694    Py_UNICODE *p;
695    const char *errmsg = "";
696
697    /* Note: size will always be longer than the resulting Unicode
698       character count */
699    unicode = _PyUnicode_New(size);
700    if (!unicode)
701        return NULL;
702    if (size == 0)
703        return (PyObject *)unicode;
704
705    /* Unpack UTF-8 encoded data */
706    p = unicode->str;
707    e = s + size;
708
709    while (s < e) {
710        Py_UCS4 ch = (unsigned char)*s;
711
712        if (ch < 0x80) {
713            *p++ = (Py_UNICODE)ch;
714            s++;
715            continue;
716        }
717
718        n = utf8_code_length[ch];
719
720        if (s + n > e) {
721	    errmsg = "unexpected end of data";
722	    goto utf8Error;
723	}
724
725        switch (n) {
726
727        case 0:
728            errmsg = "unexpected code byte";
729	    goto utf8Error;
730
731        case 1:
732            errmsg = "internal error";
733	    goto utf8Error;
734
735        case 2:
736            if ((s[1] & 0xc0) != 0x80) {
737                errmsg = "invalid data";
738		goto utf8Error;
739	    }
740            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
741            if (ch < 0x80) {
742                errmsg = "illegal encoding";
743		goto utf8Error;
744	    }
745	    else
746		*p++ = (Py_UNICODE)ch;
747            break;
748
749        case 3:
750            if ((s[1] & 0xc0) != 0x80 ||
751                (s[2] & 0xc0) != 0x80) {
752                errmsg = "invalid data";
753		goto utf8Error;
754	    }
755            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
756            if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
757                errmsg = "illegal encoding";
758		goto utf8Error;
759	    }
760	    else
761				*p++ = (Py_UNICODE)ch;
762            break;
763
764        case 4:
765            if ((s[1] & 0xc0) != 0x80 ||
766                (s[2] & 0xc0) != 0x80 ||
767                (s[3] & 0xc0) != 0x80) {
768                errmsg = "invalid data";
769		goto utf8Error;
770	    }
771            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
772                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
773            /* validate and convert to UTF-16 */
774            if ((ch < 0x10000) ||   /* minimum value allowed for 4
775                                       byte encoding */
776                (ch > 0x10ffff)) {  /* maximum value allowed for
777                                       UTF-16 */
778                errmsg = "illegal encoding";
779		goto utf8Error;
780	    }
781            /*  compute and append the two surrogates: */
782
783            /*  translate from 10000..10FFFF to 0..FFFF */
784            ch -= 0x10000;
785
786            /*  high surrogate = top 10 bits added to D800 */
787            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
788
789            /*  low surrogate = bottom 10 bits added to DC00 */
790            *p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
791            break;
792
793        default:
794            /* Other sizes are only needed for UCS-4 */
795            errmsg = "unsupported Unicode code range";
796	    goto utf8Error;
797        }
798        s += n;
799	continue;
800
801    utf8Error:
802      if (utf8_decoding_error(&s, &p, errors, errmsg))
803          goto onError;
804    }
805
806    /* Adjust length */
807    if (_PyUnicode_Resize(&unicode, p - unicode->str))
808        goto onError;
809
810    return (PyObject *)unicode;
811
812onError:
813    Py_DECREF(unicode);
814    return NULL;
815}
816
817/* Not used anymore, now that the encoder supports UTF-16
818   surrogates. */
819#if 0
820static
821int utf8_encoding_error(const Py_UNICODE **source,
822			char **dest,
823			const char *errors,
824			const char *details)
825{
826    if ((errors == NULL) ||
827	(strcmp(errors,"strict") == 0)) {
828	PyErr_Format(PyExc_UnicodeError,
829		     "UTF-8 encoding error: %.400s",
830		     details);
831	return -1;
832    }
833    else if (strcmp(errors,"ignore") == 0) {
834	return 0;
835    }
836    else if (strcmp(errors,"replace") == 0) {
837	**dest = '?';
838	(*dest)++;
839	return 0;
840    }
841    else {
842	PyErr_Format(PyExc_ValueError,
843		     "UTF-8 encoding error; "
844		     "unknown error handling code: %.400s",
845		     errors);
846	return -1;
847    }
848}
849#endif
850
851PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
852			       int size,
853			       const char *errors)
854{
855    PyObject *v;
856    char *p;
857    char *q;
858    Py_UCS4 ch2;
859    unsigned int cbAllocated = 3 * size;
860    unsigned int cbWritten = 0;
861    int i = 0;
862
863    v = PyString_FromStringAndSize(NULL, cbAllocated);
864    if (v == NULL)
865        return NULL;
866    if (size == 0)
867        return v;
868
869    p = q = PyString_AS_STRING(v);
870    while (i < size) {
871        Py_UCS4 ch = s[i++];
872        if (ch < 0x80) {
873            *p++ = (char) ch;
874            cbWritten++;
875        }
876        else if (ch < 0x0800) {
877            *p++ = 0xc0 | (ch >> 6);
878            *p++ = 0x80 | (ch & 0x3f);
879            cbWritten += 2;
880        }
881        else {
882            /* Check for high surrogate */
883            if (0xD800 <= ch && ch <= 0xDBFF) {
884                if (i != size) {
885                    ch2 = s[i];
886                    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
887
888                        if (cbWritten >= (cbAllocated - 4)) {
889			    /* Provide enough room for some more
890			       surrogates */
891			    cbAllocated += 4*10;
892                            if (_PyString_Resize(&v, cbAllocated))
893				goto onError;
894                        }
895
896                        /* combine the two values */
897                        ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
898
899                        *p++ = (char)((ch >> 18) | 0xf0);
900                        *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
901                        i++;
902                        cbWritten += 4;
903                    }
904                }
905            }
906            else {
907                *p++ = (char)(0xe0 | (ch >> 12));
908                cbWritten += 3;
909            }
910            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
911            *p++ = (char)(0x80 | (ch & 0x3f));
912        }
913    }
914    *p = '\0';
915    if (_PyString_Resize(&v, p - q))
916	goto onError;
917    return v;
918
919 onError:
920    Py_DECREF(v);
921    return NULL;
922}
923
924PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
925{
926    if (!PyUnicode_Check(unicode)) {
927        PyErr_BadArgument();
928        return NULL;
929    }
930    return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
931				PyUnicode_GET_SIZE(unicode),
932				NULL);
933}
934
935/* --- UTF-16 Codec ------------------------------------------------------- */
936
937static
938int utf16_decoding_error(const Py_UNICODE **source,
939			 Py_UNICODE **dest,
940			 const char *errors,
941			 const char *details)
942{
943    if ((errors == NULL) ||
944        (strcmp(errors,"strict") == 0)) {
945        PyErr_Format(PyExc_UnicodeError,
946                     "UTF-16 decoding error: %.400s",
947                     details);
948        return -1;
949    }
950    else if (strcmp(errors,"ignore") == 0) {
951        return 0;
952    }
953    else if (strcmp(errors,"replace") == 0) {
954	if (dest) {
955	    **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
956	    (*dest)++;
957	}
958        return 0;
959    }
960    else {
961        PyErr_Format(PyExc_ValueError,
962                     "UTF-16 decoding error; "
963		     "unknown error handling code: %.400s",
964                     errors);
965        return -1;
966    }
967}
968
969PyObject *PyUnicode_DecodeUTF16(const char *s,
970				int size,
971				const char *errors,
972				int *byteorder)
973{
974    PyUnicodeObject *unicode;
975    Py_UNICODE *p;
976    const Py_UNICODE *q, *e;
977    int bo = 0;
978    const char *errmsg = "";
979
980    /* size should be an even number */
981    if (size % sizeof(Py_UNICODE) != 0) {
982	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
983	    return NULL;
984	/* The remaining input chars are ignored if we fall through
985           here... */
986    }
987
988    /* Note: size will always be longer than the resulting Unicode
989       character count */
990    unicode = _PyUnicode_New(size);
991    if (!unicode)
992        return NULL;
993    if (size == 0)
994        return (PyObject *)unicode;
995
996    /* Unpack UTF-16 encoded data */
997    p = unicode->str;
998    q = (Py_UNICODE *)s;
999    e = q + (size / sizeof(Py_UNICODE));
1000
1001    if (byteorder)
1002	bo = *byteorder;
1003
1004    /* Check for BOM marks (U+FEFF) in the input and adjust current
1005       byte order setting accordingly. In native mode, the leading BOM
1006       mark is skipped, in all other modes, it is copied to the output
1007       stream as-is (giving a ZWNBSP character). */
1008    if (bo == 0) {
1009#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1010	if (*q == 0xFEFF) {
1011	    q++;
1012	    bo = -1;
1013	} else if (*q == 0xFFFE) {
1014	    q++;
1015	    bo = 1;
1016	}
1017#else
1018	if (*q == 0xFEFF) {
1019	    q++;
1020	    bo = 1;
1021	} else if (*q == 0xFFFE) {
1022	    q++;
1023	    bo = -1;
1024	}
1025#endif
1026    }
1027
1028    while (q < e) {
1029	register Py_UNICODE ch = *q++;
1030
1031	/* Swap input bytes if needed. (This assumes
1032	   sizeof(Py_UNICODE) == 2 !) */
1033#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1034	if (bo == 1)
1035	    ch = (ch >> 8) | (ch << 8);
1036#else
1037	if (bo == -1)
1038	    ch = (ch >> 8) | (ch << 8);
1039#endif
1040	if (ch < 0xD800 || ch > 0xDFFF) {
1041	    *p++ = ch;
1042	    continue;
1043	}
1044
1045	/* UTF-16 code pair: */
1046	if (q >= e) {
1047	    errmsg = "unexpected end of data";
1048	    goto utf16Error;
1049	}
1050	if (0xDC00 <= *q && *q <= 0xDFFF) {
1051	    q++;
1052	    if (0xD800 <= *q && *q <= 0xDBFF) {
1053		/* This is valid data (a UTF-16 surrogate pair), but
1054		   we are not able to store this information since our
1055		   Py_UNICODE type only has 16 bits... this might
1056		   change someday, even though it's unlikely. */
1057		errmsg = "code pairs are not supported";
1058		goto utf16Error;
1059	    }
1060	    else
1061		continue;
1062	}
1063	errmsg = "illegal encoding";
1064	/* Fall through to report the error */
1065
1066    utf16Error:
1067	if (utf16_decoding_error(&q, &p, errors, errmsg))
1068	    goto onError;
1069    }
1070
1071    if (byteorder)
1072        *byteorder = bo;
1073
1074    /* Adjust length */
1075    if (_PyUnicode_Resize(&unicode, p - unicode->str))
1076        goto onError;
1077
1078    return (PyObject *)unicode;
1079
1080onError:
1081    Py_DECREF(unicode);
1082    return NULL;
1083}
1084
1085#undef UTF16_ERROR
1086
1087PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1088				int size,
1089				const char *errors,
1090				int byteorder)
1091{
1092    PyObject *v;
1093    Py_UNICODE *p;
1094    char *q;
1095
1096    /* We don't create UTF-16 pairs... */
1097    v = PyString_FromStringAndSize(NULL,
1098			sizeof(Py_UNICODE) * (size + (byteorder == 0)));
1099    if (v == NULL)
1100        return NULL;
1101
1102    q = PyString_AS_STRING(v);
1103    p = (Py_UNICODE *)q;
1104    if (byteorder == 0)
1105	*p++ = 0xFEFF;
1106    if (size == 0)
1107        return v;
1108    if (byteorder == 0 ||
1109#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1110	byteorder == -1
1111#else
1112	byteorder == 1
1113#endif
1114	)
1115	Py_UNICODE_COPY(p, s, size);
1116    else
1117	while (size-- > 0) {
1118	    Py_UNICODE ch = *s++;
1119	    *p++ = (ch >> 8) | (ch << 8);
1120	}
1121    return v;
1122}
1123
1124PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1125{
1126    if (!PyUnicode_Check(unicode)) {
1127        PyErr_BadArgument();
1128        return NULL;
1129    }
1130    return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1131				 PyUnicode_GET_SIZE(unicode),
1132				 NULL,
1133				 0);
1134}
1135
1136/* --- Unicode Escape Codec ----------------------------------------------- */
1137
1138static
1139int unicodeescape_decoding_error(const char **source,
1140                                 Py_UNICODE *x,
1141                                 const char *errors,
1142                                 const char *details)
1143{
1144    if ((errors == NULL) ||
1145        (strcmp(errors,"strict") == 0)) {
1146        PyErr_Format(PyExc_UnicodeError,
1147                     "Unicode-Escape decoding error: %.400s",
1148                     details);
1149        return -1;
1150    }
1151    else if (strcmp(errors,"ignore") == 0) {
1152        return 0;
1153    }
1154    else if (strcmp(errors,"replace") == 0) {
1155        *x = Py_UNICODE_REPLACEMENT_CHARACTER;
1156        return 0;
1157    }
1158    else {
1159        PyErr_Format(PyExc_ValueError,
1160                     "Unicode-Escape decoding error; "
1161                     "unknown error handling code: %.400s",
1162                     errors);
1163        return -1;
1164    }
1165}
1166
1167static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
1168
1169PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1170					int size,
1171					const char *errors)
1172{
1173    PyUnicodeObject *v;
1174    Py_UNICODE *p, *buf;
1175    const char *end;
1176    char* message;
1177    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1178
1179    /* Escaped strings will always be longer than the resulting
1180       Unicode string, so we start with size here and then reduce the
1181       length after conversion to the true value. */
1182    v = _PyUnicode_New(size);
1183    if (v == NULL)
1184        goto onError;
1185    if (size == 0)
1186        return (PyObject *)v;
1187
1188    p = buf = PyUnicode_AS_UNICODE(v);
1189    end = s + size;
1190
1191    while (s < end) {
1192        unsigned char c;
1193        Py_UNICODE x;
1194        int i, digits;
1195
1196        /* Non-escape characters are interpreted as Unicode ordinals */
1197        if (*s != '\\') {
1198            *p++ = (unsigned char) *s++;
1199            continue;
1200        }
1201
1202        /* \ - Escapes */
1203        s++;
1204        switch (*s++) {
1205
1206        /* \x escapes */
1207        case '\n': break;
1208        case '\\': *p++ = '\\'; break;
1209        case '\'': *p++ = '\''; break;
1210        case '\"': *p++ = '\"'; break;
1211        case 'b': *p++ = '\b'; break;
1212        case 'f': *p++ = '\014'; break; /* FF */
1213        case 't': *p++ = '\t'; break;
1214        case 'n': *p++ = '\n'; break;
1215        case 'r': *p++ = '\r'; break;
1216        case 'v': *p++ = '\013'; break; /* VT */
1217        case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1218
1219        /* \OOO (octal) escapes */
1220        case '0': case '1': case '2': case '3':
1221        case '4': case '5': case '6': case '7':
1222            x = s[-1] - '0';
1223            if ('0' <= *s && *s <= '7') {
1224                x = (x<<3) + *s++ - '0';
1225                if ('0' <= *s && *s <= '7')
1226                    x = (x<<3) + *s++ - '0';
1227            }
1228            *p++ = x;
1229            break;
1230
1231        /* hex escapes */
1232        /* \xXX */
1233        case 'x':
1234            digits = 2;
1235            message = "truncated \\xXX escape";
1236            goto hexescape;
1237
1238        /* \uXXXX */
1239        case 'u':
1240            digits = 4;
1241            message = "truncated \\uXXXX escape";
1242            goto hexescape;
1243
1244        /* \UXXXXXXXX */
1245        case 'U':
1246            digits = 8;
1247            message = "truncated \\UXXXXXXXX escape";
1248        hexescape:
1249            chr = 0;
1250            for (i = 0; i < digits; i++) {
1251                c = (unsigned char) s[i];
1252                if (!isxdigit(c)) {
1253                    if (unicodeescape_decoding_error(&s, &x, errors, message))
1254                        goto onError;
1255                    chr = x;
1256                    i++;
1257                    break;
1258                }
1259                chr = (chr<<4) & ~0xF;
1260                if (c >= '0' && c <= '9')
1261                    chr += c - '0';
1262                else if (c >= 'a' && c <= 'f')
1263                    chr += 10 + c - 'a';
1264                else
1265                    chr += 10 + c - 'A';
1266            }
1267            s += i;
1268        store:
1269            /* when we get here, chr is a 32-bit unicode character */
1270            if (chr <= 0xffff)
1271                /* UCS-2 character */
1272                *p++ = (Py_UNICODE) chr;
1273            else if (chr <= 0x10ffff) {
1274                /* UCS-4 character.  store as two surrogate characters */
1275                chr -= 0x10000L;
1276                *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1277                *p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
1278            } else {
1279                if (unicodeescape_decoding_error(
1280                    &s, &x, errors,
1281                    "illegal Unicode character")
1282                    )
1283                    goto onError;
1284                *p++ = x; /* store replacement character */
1285            }
1286            break;
1287
1288        /* \N{name} */
1289        case 'N':
1290            message = "malformed \\N character escape";
1291            if (ucnhash_CAPI == NULL) {
1292                /* load the unicode data module */
1293                PyObject *m, *v;
1294                m = PyImport_ImportModule("unicodedata");
1295                if (m == NULL)
1296                    goto ucnhashError;
1297                v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1298                Py_DECREF(m);
1299                if (v == NULL)
1300                    goto ucnhashError;
1301                ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1302                Py_DECREF(v);
1303                if (ucnhash_CAPI == NULL)
1304                    goto ucnhashError;
1305            }
1306            if (*s == '{') {
1307                const char *start = s+1;
1308                /* look for the closing brace */
1309                while (*s != '}' && s < end)
1310                    s++;
1311                if (s > start && s < end && *s == '}') {
1312                    /* found a name.  look it up in the unicode database */
1313                    message = "unknown Unicode character name";
1314                    s++;
1315                    if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1316                        goto store;
1317                }
1318            }
1319            if (unicodeescape_decoding_error(&s, &x, errors, message))
1320                goto onError;
1321            *p++ = x;
1322            break;
1323
1324        default:
1325            *p++ = '\\';
1326            *p++ = (unsigned char)s[-1];
1327            break;
1328        }
1329    }
1330    if (_PyUnicode_Resize(&v, (int)(p - buf)))
1331		goto onError;
1332    return (PyObject *)v;
1333
1334ucnhashError:
1335    PyErr_SetString(
1336        PyExc_UnicodeError,
1337        "\\N escapes not supported (can't load unicodedata module)"
1338        );
1339    return NULL;
1340
1341onError:
1342    Py_XDECREF(v);
1343    return NULL;
1344}
1345
1346/* Return a Unicode-Escape string version of the Unicode object.
1347
1348   If quotes is true, the string is enclosed in u"" or u'' quotes as
1349   appropriate.
1350
1351*/
1352
1353static const Py_UNICODE *findchar(const Py_UNICODE *s,
1354				  int size,
1355				  Py_UNICODE ch);
1356
1357static
1358PyObject *unicodeescape_string(const Py_UNICODE *s,
1359                               int size,
1360                               int quotes)
1361{
1362    PyObject *repr;
1363    char *p;
1364    char *q;
1365
1366    static const char *hexdigit = "0123456789abcdef";
1367
1368    repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1369    if (repr == NULL)
1370        return NULL;
1371
1372    p = q = PyString_AS_STRING(repr);
1373
1374    if (quotes) {
1375        *p++ = 'u';
1376        *p++ = (findchar(s, size, '\'') &&
1377                !findchar(s, size, '"')) ? '"' : '\'';
1378    }
1379    while (size-- > 0) {
1380        Py_UNICODE ch = *s++;
1381        /* Escape quotes */
1382        if (quotes && (ch == q[1] || ch == '\\')) {
1383            *p++ = '\\';
1384            *p++ = (char) ch;
1385        }
1386        /* Map 16-bit characters to '\uxxxx' */
1387        else if (ch >= 256) {
1388            *p++ = '\\';
1389            *p++ = 'u';
1390            *p++ = hexdigit[(ch >> 12) & 0xf];
1391            *p++ = hexdigit[(ch >> 8) & 0xf];
1392            *p++ = hexdigit[(ch >> 4) & 0xf];
1393            *p++ = hexdigit[ch & 15];
1394        }
1395        /* Map special whitespace to '\t', \n', '\r' */
1396        else if (ch == '\t') {
1397            *p++ = '\\';
1398            *p++ = 't';
1399        }
1400        else if (ch == '\n') {
1401            *p++ = '\\';
1402            *p++ = 'n';
1403        }
1404        else if (ch == '\r') {
1405            *p++ = '\\';
1406            *p++ = 'r';
1407        }
1408        /* Map non-printable US ASCII to '\xhh' */
1409        else if (ch < ' ' || ch >= 128) {
1410            *p++ = '\\';
1411            *p++ = 'x';
1412            *p++ = hexdigit[(ch >> 4) & 0xf];
1413            *p++ = hexdigit[ch & 15];
1414        }
1415        /* Copy everything else as-is */
1416        else
1417            *p++ = (char) ch;
1418    }
1419    if (quotes)
1420        *p++ = q[1];
1421
1422    *p = '\0';
1423    if (_PyString_Resize(&repr, p - q))
1424	goto onError;
1425
1426    return repr;
1427
1428 onError:
1429    Py_DECREF(repr);
1430    return NULL;
1431}
1432
1433PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1434					int size)
1435{
1436    return unicodeescape_string(s, size, 0);
1437}
1438
1439PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1440{
1441    if (!PyUnicode_Check(unicode)) {
1442        PyErr_BadArgument();
1443        return NULL;
1444    }
1445    return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1446					 PyUnicode_GET_SIZE(unicode));
1447}
1448
1449/* --- Raw Unicode Escape Codec ------------------------------------------- */
1450
1451PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1452					   int size,
1453					   const char *errors)
1454{
1455    PyUnicodeObject *v;
1456    Py_UNICODE *p, *buf;
1457    const char *end;
1458    const char *bs;
1459
1460    /* Escaped strings will always be longer than the resulting
1461       Unicode string, so we start with size here and then reduce the
1462       length after conversion to the true value. */
1463    v = _PyUnicode_New(size);
1464    if (v == NULL)
1465	goto onError;
1466    if (size == 0)
1467	return (PyObject *)v;
1468    p = buf = PyUnicode_AS_UNICODE(v);
1469    end = s + size;
1470    while (s < end) {
1471	unsigned char c;
1472	Py_UNICODE x;
1473	int i;
1474
1475	/* Non-escape characters are interpreted as Unicode ordinals */
1476	if (*s != '\\') {
1477	    *p++ = (unsigned char)*s++;
1478	    continue;
1479	}
1480
1481	/* \u-escapes are only interpreted iff the number of leading
1482	   backslashes if odd */
1483	bs = s;
1484	for (;s < end;) {
1485	    if (*s != '\\')
1486		break;
1487	    *p++ = (unsigned char)*s++;
1488	}
1489	if (((s - bs) & 1) == 0 ||
1490	    s >= end ||
1491	    *s != 'u') {
1492	    continue;
1493	}
1494	p--;
1495	s++;
1496
1497	/* \uXXXX with 4 hex digits */
1498	for (x = 0, i = 0; i < 4; i++) {
1499	    c = (unsigned char)s[i];
1500	    if (!isxdigit(c)) {
1501		if (unicodeescape_decoding_error(&s, &x, errors,
1502						 "truncated \\uXXXX"))
1503		    goto onError;
1504		i++;
1505		break;
1506	    }
1507	    x = (x<<4) & ~0xF;
1508	    if (c >= '0' && c <= '9')
1509		x += c - '0';
1510	    else if (c >= 'a' && c <= 'f')
1511		x += 10 + c - 'a';
1512	    else
1513		x += 10 + c - 'A';
1514	}
1515	s += i;
1516	*p++ = x;
1517    }
1518    if (_PyUnicode_Resize(&v, (int)(p - buf)))
1519	goto onError;
1520    return (PyObject *)v;
1521
1522 onError:
1523    Py_XDECREF(v);
1524    return NULL;
1525}
1526
1527PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1528					   int size)
1529{
1530    PyObject *repr;
1531    char *p;
1532    char *q;
1533
1534    static const char *hexdigit = "0123456789abcdef";
1535
1536    repr = PyString_FromStringAndSize(NULL, 6 * size);
1537    if (repr == NULL)
1538        return NULL;
1539    if (size == 0)
1540	return repr;
1541
1542    p = q = PyString_AS_STRING(repr);
1543    while (size-- > 0) {
1544        Py_UNICODE ch = *s++;
1545	/* Map 16-bit characters to '\uxxxx' */
1546	if (ch >= 256) {
1547            *p++ = '\\';
1548            *p++ = 'u';
1549            *p++ = hexdigit[(ch >> 12) & 0xf];
1550            *p++ = hexdigit[(ch >> 8) & 0xf];
1551            *p++ = hexdigit[(ch >> 4) & 0xf];
1552            *p++ = hexdigit[ch & 15];
1553        }
1554	/* Copy everything else as-is */
1555	else
1556            *p++ = (char) ch;
1557    }
1558    *p = '\0';
1559    if (_PyString_Resize(&repr, p - q))
1560	goto onError;
1561
1562    return repr;
1563
1564 onError:
1565    Py_DECREF(repr);
1566    return NULL;
1567}
1568
1569PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1570{
1571    if (!PyUnicode_Check(unicode)) {
1572	PyErr_BadArgument();
1573	return NULL;
1574    }
1575    return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1576					    PyUnicode_GET_SIZE(unicode));
1577}
1578
1579/* --- Latin-1 Codec ------------------------------------------------------ */
1580
1581PyObject *PyUnicode_DecodeLatin1(const char *s,
1582				 int size,
1583				 const char *errors)
1584{
1585    PyUnicodeObject *v;
1586    Py_UNICODE *p;
1587
1588    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1589    if (size == 1 && *(unsigned char*)s < 256) {
1590	Py_UNICODE r = *(unsigned char*)s;
1591	return PyUnicode_FromUnicode(&r, 1);
1592    }
1593
1594    v = _PyUnicode_New(size);
1595    if (v == NULL)
1596	goto onError;
1597    if (size == 0)
1598	return (PyObject *)v;
1599    p = PyUnicode_AS_UNICODE(v);
1600    while (size-- > 0)
1601	*p++ = (unsigned char)*s++;
1602    return (PyObject *)v;
1603
1604 onError:
1605    Py_XDECREF(v);
1606    return NULL;
1607}
1608
1609static
1610int latin1_encoding_error(const Py_UNICODE **source,
1611			  char **dest,
1612			  const char *errors,
1613			  const char *details)
1614{
1615    if ((errors == NULL) ||
1616	(strcmp(errors,"strict") == 0)) {
1617	PyErr_Format(PyExc_UnicodeError,
1618		     "Latin-1 encoding error: %.400s",
1619		     details);
1620	return -1;
1621    }
1622    else if (strcmp(errors,"ignore") == 0) {
1623	return 0;
1624    }
1625    else if (strcmp(errors,"replace") == 0) {
1626	**dest = '?';
1627	(*dest)++;
1628	return 0;
1629    }
1630    else {
1631	PyErr_Format(PyExc_ValueError,
1632		     "Latin-1 encoding error; "
1633		     "unknown error handling code: %.400s",
1634		     errors);
1635	return -1;
1636    }
1637}
1638
1639PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1640				 int size,
1641				 const char *errors)
1642{
1643    PyObject *repr;
1644    char *s, *start;
1645
1646    repr = PyString_FromStringAndSize(NULL, size);
1647    if (repr == NULL)
1648        return NULL;
1649    if (size == 0)
1650	return repr;
1651
1652    s = PyString_AS_STRING(repr);
1653    start = s;
1654    while (size-- > 0) {
1655        Py_UNICODE ch = *p++;
1656	if (ch >= 256) {
1657	    if (latin1_encoding_error(&p, &s, errors,
1658				      "ordinal not in range(256)"))
1659		goto onError;
1660	}
1661	else
1662            *s++ = (char)ch;
1663    }
1664    /* Resize if error handling skipped some characters */
1665    if (s - start < PyString_GET_SIZE(repr))
1666	if (_PyString_Resize(&repr, s - start))
1667	    goto onError;
1668    return repr;
1669
1670 onError:
1671    Py_DECREF(repr);
1672    return NULL;
1673}
1674
1675PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1676{
1677    if (!PyUnicode_Check(unicode)) {
1678	PyErr_BadArgument();
1679	return NULL;
1680    }
1681    return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1682				  PyUnicode_GET_SIZE(unicode),
1683				  NULL);
1684}
1685
1686/* --- 7-bit ASCII Codec -------------------------------------------------- */
1687
1688static
1689int ascii_decoding_error(const char **source,
1690			 Py_UNICODE **dest,
1691			 const char *errors,
1692			 const char *details)
1693{
1694    if ((errors == NULL) ||
1695	(strcmp(errors,"strict") == 0)) {
1696	PyErr_Format(PyExc_UnicodeError,
1697		     "ASCII decoding error: %.400s",
1698		     details);
1699	return -1;
1700    }
1701    else if (strcmp(errors,"ignore") == 0) {
1702	return 0;
1703    }
1704    else if (strcmp(errors,"replace") == 0) {
1705	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1706	(*dest)++;
1707	return 0;
1708    }
1709    else {
1710	PyErr_Format(PyExc_ValueError,
1711		     "ASCII decoding error; "
1712		     "unknown error handling code: %.400s",
1713		     errors);
1714	return -1;
1715    }
1716}
1717
1718PyObject *PyUnicode_DecodeASCII(const char *s,
1719				int size,
1720				const char *errors)
1721{
1722    PyUnicodeObject *v;
1723    Py_UNICODE *p;
1724
1725    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1726    if (size == 1 && *(unsigned char*)s < 128) {
1727	Py_UNICODE r = *(unsigned char*)s;
1728	return PyUnicode_FromUnicode(&r, 1);
1729    }
1730
1731    v = _PyUnicode_New(size);
1732    if (v == NULL)
1733	goto onError;
1734    if (size == 0)
1735	return (PyObject *)v;
1736    p = PyUnicode_AS_UNICODE(v);
1737    while (size-- > 0) {
1738	register unsigned char c;
1739
1740	c = (unsigned char)*s++;
1741	if (c < 128)
1742	    *p++ = c;
1743	else if (ascii_decoding_error(&s, &p, errors,
1744				      "ordinal not in range(128)"))
1745		goto onError;
1746    }
1747    if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
1748	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
1749	    goto onError;
1750    return (PyObject *)v;
1751
1752 onError:
1753    Py_XDECREF(v);
1754    return NULL;
1755}
1756
1757static
1758int ascii_encoding_error(const Py_UNICODE **source,
1759			 char **dest,
1760			 const char *errors,
1761			 const char *details)
1762{
1763    if ((errors == NULL) ||
1764	(strcmp(errors,"strict") == 0)) {
1765	PyErr_Format(PyExc_UnicodeError,
1766		     "ASCII encoding error: %.400s",
1767		     details);
1768	return -1;
1769    }
1770    else if (strcmp(errors,"ignore") == 0) {
1771	return 0;
1772    }
1773    else if (strcmp(errors,"replace") == 0) {
1774	**dest = '?';
1775	(*dest)++;
1776	return 0;
1777    }
1778    else {
1779	PyErr_Format(PyExc_ValueError,
1780		     "ASCII encoding error; "
1781		     "unknown error handling code: %.400s",
1782		     errors);
1783	return -1;
1784    }
1785}
1786
1787PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1788				int size,
1789				const char *errors)
1790{
1791    PyObject *repr;
1792    char *s, *start;
1793
1794    repr = PyString_FromStringAndSize(NULL, size);
1795    if (repr == NULL)
1796        return NULL;
1797    if (size == 0)
1798	return repr;
1799
1800    s = PyString_AS_STRING(repr);
1801    start = s;
1802    while (size-- > 0) {
1803        Py_UNICODE ch = *p++;
1804	if (ch >= 128) {
1805	    if (ascii_encoding_error(&p, &s, errors,
1806				      "ordinal not in range(128)"))
1807		goto onError;
1808	}
1809	else
1810            *s++ = (char)ch;
1811    }
1812    /* Resize if error handling skipped some characters */
1813    if (s - start < PyString_GET_SIZE(repr))
1814	if (_PyString_Resize(&repr, s - start))
1815	    goto onError;
1816    return repr;
1817
1818 onError:
1819    Py_DECREF(repr);
1820    return NULL;
1821}
1822
1823PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1824{
1825    if (!PyUnicode_Check(unicode)) {
1826	PyErr_BadArgument();
1827	return NULL;
1828    }
1829    return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1830				 PyUnicode_GET_SIZE(unicode),
1831				 NULL);
1832}
1833
1834#ifdef MS_WIN32
1835
1836/* --- MBCS codecs for Windows -------------------------------------------- */
1837
1838PyObject *PyUnicode_DecodeMBCS(const char *s,
1839				int size,
1840				const char *errors)
1841{
1842    PyUnicodeObject *v;
1843    Py_UNICODE *p;
1844
1845    /* First get the size of the result */
1846    DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
1847    if (size > 0 && usize==0)
1848        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1849
1850    v = _PyUnicode_New(usize);
1851    if (v == NULL)
1852        return NULL;
1853    if (usize == 0)
1854	return (PyObject *)v;
1855    p = PyUnicode_AS_UNICODE(v);
1856    if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1857        Py_DECREF(v);
1858        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1859    }
1860
1861    return (PyObject *)v;
1862}
1863
1864PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1865				int size,
1866				const char *errors)
1867{
1868    PyObject *repr;
1869    char *s;
1870    DWORD mbcssize;
1871
1872    /* If there are no characters, bail now! */
1873    if (size==0)
1874	    return PyString_FromString("");
1875
1876    /* First get the size of the result */
1877    mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
1878    if (mbcssize==0)
1879        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1880
1881    repr = PyString_FromStringAndSize(NULL, mbcssize);
1882    if (repr == NULL)
1883        return NULL;
1884    if (mbcssize == 0)
1885        return repr;
1886
1887    /* Do the conversion */
1888    s = PyString_AS_STRING(repr);
1889    if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1890        Py_DECREF(repr);
1891        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1892    }
1893    return repr;
1894}
1895
1896#endif /* MS_WIN32 */
1897
1898/* --- Character Mapping Codec -------------------------------------------- */
1899
1900static
1901int charmap_decoding_error(const char **source,
1902			 Py_UNICODE **dest,
1903			 const char *errors,
1904			 const char *details)
1905{
1906    if ((errors == NULL) ||
1907	(strcmp(errors,"strict") == 0)) {
1908	PyErr_Format(PyExc_UnicodeError,
1909		     "charmap decoding error: %.400s",
1910		     details);
1911	return -1;
1912    }
1913    else if (strcmp(errors,"ignore") == 0) {
1914	return 0;
1915    }
1916    else if (strcmp(errors,"replace") == 0) {
1917	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1918	(*dest)++;
1919	return 0;
1920    }
1921    else {
1922	PyErr_Format(PyExc_ValueError,
1923		     "charmap decoding error; "
1924		     "unknown error handling code: %.400s",
1925		     errors);
1926	return -1;
1927    }
1928}
1929
1930PyObject *PyUnicode_DecodeCharmap(const char *s,
1931				  int size,
1932				  PyObject *mapping,
1933				  const char *errors)
1934{
1935    PyUnicodeObject *v;
1936    Py_UNICODE *p;
1937    int extrachars = 0;
1938
1939    /* Default to Latin-1 */
1940    if (mapping == NULL)
1941	return PyUnicode_DecodeLatin1(s, size, errors);
1942
1943    v = _PyUnicode_New(size);
1944    if (v == NULL)
1945	goto onError;
1946    if (size == 0)
1947	return (PyObject *)v;
1948    p = PyUnicode_AS_UNICODE(v);
1949    while (size-- > 0) {
1950	unsigned char ch = *s++;
1951	PyObject *w, *x;
1952
1953	/* Get mapping (char ordinal -> integer, Unicode char or None) */
1954	w = PyInt_FromLong((long)ch);
1955	if (w == NULL)
1956	    goto onError;
1957	x = PyObject_GetItem(mapping, w);
1958	Py_DECREF(w);
1959	if (x == NULL) {
1960	    if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1961		/* No mapping found means: mapping is undefined. */
1962		PyErr_Clear();
1963		x = Py_None;
1964		Py_INCREF(x);
1965	    } else
1966		goto onError;
1967	}
1968
1969	/* Apply mapping */
1970	if (PyInt_Check(x)) {
1971	    long value = PyInt_AS_LONG(x);
1972	    if (value < 0 || value > 65535) {
1973		PyErr_SetString(PyExc_TypeError,
1974				"character mapping must be in range(65536)");
1975		Py_DECREF(x);
1976		goto onError;
1977	    }
1978	    *p++ = (Py_UNICODE)value;
1979	}
1980	else if (x == Py_None) {
1981	    /* undefined mapping */
1982	    if (charmap_decoding_error(&s, &p, errors,
1983				       "character maps to <undefined>")) {
1984		Py_DECREF(x);
1985		goto onError;
1986	    }
1987	}
1988	else if (PyUnicode_Check(x)) {
1989	    int targetsize = PyUnicode_GET_SIZE(x);
1990
1991	    if (targetsize == 1)
1992		/* 1-1 mapping */
1993		*p++ = *PyUnicode_AS_UNICODE(x);
1994
1995	    else if (targetsize > 1) {
1996		/* 1-n mapping */
1997		if (targetsize > extrachars) {
1998		    /* resize first */
1999		    int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2000		    int needed = (targetsize - extrachars) + \
2001			         (targetsize << 2);
2002		    extrachars += needed;
2003		    if (_PyUnicode_Resize(&v,
2004					 PyUnicode_GET_SIZE(v) + needed)) {
2005			Py_DECREF(x);
2006			goto onError;
2007		    }
2008		    p = PyUnicode_AS_UNICODE(v) + oldpos;
2009		}
2010		Py_UNICODE_COPY(p,
2011				PyUnicode_AS_UNICODE(x),
2012				targetsize);
2013		p += targetsize;
2014		extrachars -= targetsize;
2015	    }
2016	    /* 1-0 mapping: skip the character */
2017	}
2018	else {
2019	    /* wrong return value */
2020	    PyErr_SetString(PyExc_TypeError,
2021		  "character mapping must return integer, None or unicode");
2022	    Py_DECREF(x);
2023	    goto onError;
2024	}
2025	Py_DECREF(x);
2026    }
2027    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2028	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2029	    goto onError;
2030    return (PyObject *)v;
2031
2032 onError:
2033    Py_XDECREF(v);
2034    return NULL;
2035}
2036
2037static
2038int charmap_encoding_error(const Py_UNICODE **source,
2039			   char **dest,
2040			   const char *errors,
2041			   const char *details)
2042{
2043    if ((errors == NULL) ||
2044	(strcmp(errors,"strict") == 0)) {
2045	PyErr_Format(PyExc_UnicodeError,
2046		     "charmap encoding error: %.400s",
2047		     details);
2048	return -1;
2049    }
2050    else if (strcmp(errors,"ignore") == 0) {
2051	return 0;
2052    }
2053    else if (strcmp(errors,"replace") == 0) {
2054	**dest = '?';
2055	(*dest)++;
2056	return 0;
2057    }
2058    else {
2059	PyErr_Format(PyExc_ValueError,
2060		     "charmap encoding error; "
2061		     "unknown error handling code: %.400s",
2062		     errors);
2063	return -1;
2064    }
2065}
2066
2067PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2068				  int size,
2069				  PyObject *mapping,
2070				  const char *errors)
2071{
2072    PyObject *v;
2073    char *s;
2074    int extrachars = 0;
2075
2076    /* Default to Latin-1 */
2077    if (mapping == NULL)
2078	return PyUnicode_EncodeLatin1(p, size, errors);
2079
2080    v = PyString_FromStringAndSize(NULL, size);
2081    if (v == NULL)
2082        return NULL;
2083    if (size == 0)
2084	return v;
2085    s = PyString_AS_STRING(v);
2086    while (size-- > 0) {
2087	Py_UNICODE ch = *p++;
2088	PyObject *w, *x;
2089
2090	/* Get mapping (Unicode ordinal -> string char, integer or None) */
2091	w = PyInt_FromLong((long)ch);
2092	if (w == NULL)
2093	    goto onError;
2094	x = PyObject_GetItem(mapping, w);
2095	Py_DECREF(w);
2096	if (x == NULL) {
2097	    if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2098		/* No mapping found means: mapping is undefined. */
2099		PyErr_Clear();
2100		x = Py_None;
2101		Py_INCREF(x);
2102	    } else
2103		goto onError;
2104	}
2105
2106	/* Apply mapping */
2107	if (PyInt_Check(x)) {
2108	    long value = PyInt_AS_LONG(x);
2109	    if (value < 0 || value > 255) {
2110		PyErr_SetString(PyExc_TypeError,
2111				"character mapping must be in range(256)");
2112		Py_DECREF(x);
2113		goto onError;
2114	    }
2115	    *s++ = (char)value;
2116	}
2117	else if (x == Py_None) {
2118	    /* undefined mapping */
2119	    if (charmap_encoding_error(&p, &s, errors,
2120				       "character maps to <undefined>")) {
2121		Py_DECREF(x);
2122		goto onError;
2123	    }
2124	}
2125	else if (PyString_Check(x)) {
2126	    int targetsize = PyString_GET_SIZE(x);
2127
2128	    if (targetsize == 1)
2129		/* 1-1 mapping */
2130		*s++ = *PyString_AS_STRING(x);
2131
2132	    else if (targetsize > 1) {
2133		/* 1-n mapping */
2134		if (targetsize > extrachars) {
2135		    /* resize first */
2136		    int oldpos = (int)(s - PyString_AS_STRING(v));
2137		    int needed = (targetsize - extrachars) + \
2138			         (targetsize << 2);
2139		    extrachars += needed;
2140		    if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
2141			Py_DECREF(x);
2142			goto onError;
2143		    }
2144		    s = PyString_AS_STRING(v) + oldpos;
2145		}
2146		memcpy(s, PyString_AS_STRING(x), targetsize);
2147		s += targetsize;
2148		extrachars -= targetsize;
2149	    }
2150	    /* 1-0 mapping: skip the character */
2151	}
2152	else {
2153	    /* wrong return value */
2154	    PyErr_SetString(PyExc_TypeError,
2155		  "character mapping must return integer, None or unicode");
2156	    Py_DECREF(x);
2157	    goto onError;
2158	}
2159	Py_DECREF(x);
2160    }
2161    if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2162	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2163	    goto onError;
2164    return v;
2165
2166 onError:
2167    Py_DECREF(v);
2168    return NULL;
2169}
2170
2171PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2172				    PyObject *mapping)
2173{
2174    if (!PyUnicode_Check(unicode) || mapping == NULL) {
2175	PyErr_BadArgument();
2176	return NULL;
2177    }
2178    return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2179				   PyUnicode_GET_SIZE(unicode),
2180				   mapping,
2181				   NULL);
2182}
2183
2184static
2185int translate_error(const Py_UNICODE **source,
2186		    Py_UNICODE **dest,
2187		    const char *errors,
2188		    const char *details)
2189{
2190    if ((errors == NULL) ||
2191	(strcmp(errors,"strict") == 0)) {
2192	PyErr_Format(PyExc_UnicodeError,
2193		     "translate error: %.400s",
2194		     details);
2195	return -1;
2196    }
2197    else if (strcmp(errors,"ignore") == 0) {
2198	return 0;
2199    }
2200    else if (strcmp(errors,"replace") == 0) {
2201	**dest = '?';
2202	(*dest)++;
2203	return 0;
2204    }
2205    else {
2206	PyErr_Format(PyExc_ValueError,
2207		     "translate error; "
2208		     "unknown error handling code: %.400s",
2209		     errors);
2210	return -1;
2211    }
2212}
2213
2214PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2215				     int size,
2216				     PyObject *mapping,
2217				     const char *errors)
2218{
2219    PyUnicodeObject *v;
2220    Py_UNICODE *p;
2221
2222    if (mapping == NULL) {
2223	PyErr_BadArgument();
2224	return NULL;
2225    }
2226
2227    /* Output will never be longer than input */
2228    v = _PyUnicode_New(size);
2229    if (v == NULL)
2230	goto onError;
2231    if (size == 0)
2232	goto done;
2233    p = PyUnicode_AS_UNICODE(v);
2234    while (size-- > 0) {
2235	Py_UNICODE ch = *s++;
2236	PyObject *w, *x;
2237
2238	/* Get mapping */
2239	w = PyInt_FromLong(ch);
2240	if (w == NULL)
2241	    goto onError;
2242	x = PyObject_GetItem(mapping, w);
2243	Py_DECREF(w);
2244	if (x == NULL) {
2245	    if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2246		/* No mapping found: default to 1-1 mapping */
2247		PyErr_Clear();
2248		*p++ = ch;
2249		continue;
2250	    }
2251	    goto onError;
2252	}
2253
2254	/* Apply mapping */
2255	if (PyInt_Check(x))
2256	    *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2257	else if (x == Py_None) {
2258	    /* undefined mapping */
2259	    if (translate_error(&s, &p, errors,
2260				"character maps to <undefined>")) {
2261		Py_DECREF(x);
2262		goto onError;
2263	    }
2264	}
2265	else if (PyUnicode_Check(x)) {
2266	    if (PyUnicode_GET_SIZE(x) != 1) {
2267		/* 1-n mapping */
2268		PyErr_SetString(PyExc_NotImplementedError,
2269				"1-n mappings are currently not implemented");
2270		Py_DECREF(x);
2271		goto onError;
2272	    }
2273	    *p++ = *PyUnicode_AS_UNICODE(x);
2274	}
2275	else {
2276	    /* wrong return value */
2277	    PyErr_SetString(PyExc_TypeError,
2278		  "translate mapping must return integer, None or unicode");
2279	    Py_DECREF(x);
2280	    goto onError;
2281	}
2282	Py_DECREF(x);
2283    }
2284    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2285	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2286	    goto onError;
2287
2288 done:
2289    return (PyObject *)v;
2290
2291 onError:
2292    Py_XDECREF(v);
2293    return NULL;
2294}
2295
2296PyObject *PyUnicode_Translate(PyObject *str,
2297			      PyObject *mapping,
2298			      const char *errors)
2299{
2300    PyObject *result;
2301
2302    str = PyUnicode_FromObject(str);
2303    if (str == NULL)
2304	goto onError;
2305    result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2306					PyUnicode_GET_SIZE(str),
2307					mapping,
2308					errors);
2309    Py_DECREF(str);
2310    return result;
2311
2312 onError:
2313    Py_XDECREF(str);
2314    return NULL;
2315}
2316
2317/* --- Decimal Encoder ---------------------------------------------------- */
2318
2319int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2320			    int length,
2321			    char *output,
2322			    const char *errors)
2323{
2324    Py_UNICODE *p, *end;
2325
2326    if (output == NULL) {
2327	PyErr_BadArgument();
2328	return -1;
2329    }
2330
2331    p = s;
2332    end = s + length;
2333    while (p < end) {
2334	register Py_UNICODE ch = *p++;
2335	int decimal;
2336
2337	if (Py_UNICODE_ISSPACE(ch)) {
2338	    *output++ = ' ';
2339	    continue;
2340	}
2341	decimal = Py_UNICODE_TODECIMAL(ch);
2342	if (decimal >= 0) {
2343	    *output++ = '0' + decimal;
2344	    continue;
2345	}
2346	if (0 < ch && ch < 256) {
2347	    *output++ = (char)ch;
2348	    continue;
2349	}
2350	/* All other characters are considered invalid */
2351	if (errors == NULL || strcmp(errors, "strict") == 0) {
2352	    PyErr_SetString(PyExc_ValueError,
2353			    "invalid decimal Unicode string");
2354	    goto onError;
2355	}
2356	else if (strcmp(errors, "ignore") == 0)
2357	    continue;
2358	else if (strcmp(errors, "replace") == 0) {
2359	    *output++ = '?';
2360	    continue;
2361	}
2362    }
2363    /* 0-terminate the output string */
2364    *output++ = '\0';
2365    return 0;
2366
2367 onError:
2368    return -1;
2369}
2370
2371/* --- Helpers ------------------------------------------------------------ */
2372
2373static
2374int count(PyUnicodeObject *self,
2375	  int start,
2376	  int end,
2377	  PyUnicodeObject *substring)
2378{
2379    int count = 0;
2380
2381    if (start < 0)
2382        start += self->length;
2383    if (start < 0)
2384        start = 0;
2385    if (end > self->length)
2386        end = self->length;
2387    if (end < 0)
2388        end += self->length;
2389    if (end < 0)
2390        end = 0;
2391
2392    if (substring->length == 0)
2393	return (end - start + 1);
2394
2395    end -= substring->length;
2396
2397    while (start <= end)
2398        if (Py_UNICODE_MATCH(self, start, substring)) {
2399            count++;
2400            start += substring->length;
2401        } else
2402            start++;
2403
2404    return count;
2405}
2406
2407int PyUnicode_Count(PyObject *str,
2408		    PyObject *substr,
2409		    int start,
2410		    int end)
2411{
2412    int result;
2413
2414    str = PyUnicode_FromObject(str);
2415    if (str == NULL)
2416	return -1;
2417    substr = PyUnicode_FromObject(substr);
2418    if (substr == NULL) {
2419	Py_DECREF(str);
2420	return -1;
2421    }
2422
2423    result = count((PyUnicodeObject *)str,
2424		   start, end,
2425		   (PyUnicodeObject *)substr);
2426
2427    Py_DECREF(str);
2428    Py_DECREF(substr);
2429    return result;
2430}
2431
2432static
2433int findstring(PyUnicodeObject *self,
2434	       PyUnicodeObject *substring,
2435	       int start,
2436	       int end,
2437	       int direction)
2438{
2439    if (start < 0)
2440        start += self->length;
2441    if (start < 0)
2442        start = 0;
2443
2444    if (substring->length == 0)
2445        return start;
2446
2447    if (end > self->length)
2448        end = self->length;
2449    if (end < 0)
2450        end += self->length;
2451    if (end < 0)
2452        end = 0;
2453
2454    end -= substring->length;
2455
2456    if (direction < 0) {
2457        for (; end >= start; end--)
2458            if (Py_UNICODE_MATCH(self, end, substring))
2459                return end;
2460    } else {
2461        for (; start <= end; start++)
2462            if (Py_UNICODE_MATCH(self, start, substring))
2463                return start;
2464    }
2465
2466    return -1;
2467}
2468
2469int PyUnicode_Find(PyObject *str,
2470		   PyObject *substr,
2471		   int start,
2472		   int end,
2473		   int direction)
2474{
2475    int result;
2476
2477    str = PyUnicode_FromObject(str);
2478    if (str == NULL)
2479	return -1;
2480    substr = PyUnicode_FromObject(substr);
2481    if (substr == NULL) {
2482	Py_DECREF(substr);
2483	return -1;
2484    }
2485
2486    result = findstring((PyUnicodeObject *)str,
2487			(PyUnicodeObject *)substr,
2488			start, end, direction);
2489    Py_DECREF(str);
2490    Py_DECREF(substr);
2491    return result;
2492}
2493
2494static
2495int tailmatch(PyUnicodeObject *self,
2496	      PyUnicodeObject *substring,
2497	      int start,
2498	      int end,
2499	      int direction)
2500{
2501    if (start < 0)
2502        start += self->length;
2503    if (start < 0)
2504        start = 0;
2505
2506    if (substring->length == 0)
2507        return 1;
2508
2509    if (end > self->length)
2510        end = self->length;
2511    if (end < 0)
2512        end += self->length;
2513    if (end < 0)
2514        end = 0;
2515
2516    end -= substring->length;
2517    if (end < start)
2518	return 0;
2519
2520    if (direction > 0) {
2521	if (Py_UNICODE_MATCH(self, end, substring))
2522	    return 1;
2523    } else {
2524        if (Py_UNICODE_MATCH(self, start, substring))
2525	    return 1;
2526    }
2527
2528    return 0;
2529}
2530
2531int PyUnicode_Tailmatch(PyObject *str,
2532			PyObject *substr,
2533			int start,
2534			int end,
2535			int direction)
2536{
2537    int result;
2538
2539    str = PyUnicode_FromObject(str);
2540    if (str == NULL)
2541	return -1;
2542    substr = PyUnicode_FromObject(substr);
2543    if (substr == NULL) {
2544	Py_DECREF(substr);
2545	return -1;
2546    }
2547
2548    result = tailmatch((PyUnicodeObject *)str,
2549		       (PyUnicodeObject *)substr,
2550		       start, end, direction);
2551    Py_DECREF(str);
2552    Py_DECREF(substr);
2553    return result;
2554}
2555
2556static
2557const Py_UNICODE *findchar(const Py_UNICODE *s,
2558		     int size,
2559		     Py_UNICODE ch)
2560{
2561    /* like wcschr, but doesn't stop at NULL characters */
2562
2563    while (size-- > 0) {
2564        if (*s == ch)
2565            return s;
2566        s++;
2567    }
2568
2569    return NULL;
2570}
2571
2572/* Apply fixfct filter to the Unicode object self and return a
2573   reference to the modified object */
2574
2575static
2576PyObject *fixup(PyUnicodeObject *self,
2577		int (*fixfct)(PyUnicodeObject *s))
2578{
2579
2580    PyUnicodeObject *u;
2581
2582    u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
2583    if (u == NULL)
2584	return NULL;
2585
2586    Py_UNICODE_COPY(u->str, self->str, self->length);
2587
2588    if (!fixfct(u)) {
2589	/* fixfct should return TRUE if it modified the buffer. If
2590	   FALSE, return a reference to the original buffer instead
2591	   (to save space, not time) */
2592	Py_INCREF(self);
2593	Py_DECREF(u);
2594	return (PyObject*) self;
2595    }
2596    return (PyObject*) u;
2597}
2598
2599static
2600int fixupper(PyUnicodeObject *self)
2601{
2602    int len = self->length;
2603    Py_UNICODE *s = self->str;
2604    int status = 0;
2605
2606    while (len-- > 0) {
2607	register Py_UNICODE ch;
2608
2609	ch = Py_UNICODE_TOUPPER(*s);
2610	if (ch != *s) {
2611            status = 1;
2612	    *s = ch;
2613	}
2614        s++;
2615    }
2616
2617    return status;
2618}
2619
2620static
2621int fixlower(PyUnicodeObject *self)
2622{
2623    int len = self->length;
2624    Py_UNICODE *s = self->str;
2625    int status = 0;
2626
2627    while (len-- > 0) {
2628	register Py_UNICODE ch;
2629
2630	ch = Py_UNICODE_TOLOWER(*s);
2631	if (ch != *s) {
2632            status = 1;
2633	    *s = ch;
2634	}
2635        s++;
2636    }
2637
2638    return status;
2639}
2640
2641static
2642int fixswapcase(PyUnicodeObject *self)
2643{
2644    int len = self->length;
2645    Py_UNICODE *s = self->str;
2646    int status = 0;
2647
2648    while (len-- > 0) {
2649        if (Py_UNICODE_ISUPPER(*s)) {
2650            *s = Py_UNICODE_TOLOWER(*s);
2651            status = 1;
2652        } else if (Py_UNICODE_ISLOWER(*s)) {
2653            *s = Py_UNICODE_TOUPPER(*s);
2654            status = 1;
2655        }
2656        s++;
2657    }
2658
2659    return status;
2660}
2661
2662static
2663int fixcapitalize(PyUnicodeObject *self)
2664{
2665    int len = self->length;
2666    Py_UNICODE *s = self->str;
2667    int status = 0;
2668
2669    if (len == 0)
2670	return 0;
2671    if (Py_UNICODE_ISLOWER(*s)) {
2672	*s = Py_UNICODE_TOUPPER(*s);
2673	status = 1;
2674    }
2675    s++;
2676    while (--len > 0) {
2677        if (Py_UNICODE_ISUPPER(*s)) {
2678            *s = Py_UNICODE_TOLOWER(*s);
2679            status = 1;
2680        }
2681        s++;
2682    }
2683    return status;
2684}
2685
2686static
2687int fixtitle(PyUnicodeObject *self)
2688{
2689    register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2690    register Py_UNICODE *e;
2691    int previous_is_cased;
2692
2693    /* Shortcut for single character strings */
2694    if (PyUnicode_GET_SIZE(self) == 1) {
2695	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2696	if (*p != ch) {
2697	    *p = ch;
2698	    return 1;
2699	}
2700	else
2701	    return 0;
2702    }
2703
2704    e = p + PyUnicode_GET_SIZE(self);
2705    previous_is_cased = 0;
2706    for (; p < e; p++) {
2707	register const Py_UNICODE ch = *p;
2708
2709	if (previous_is_cased)
2710	    *p = Py_UNICODE_TOLOWER(ch);
2711	else
2712	    *p = Py_UNICODE_TOTITLE(ch);
2713
2714	if (Py_UNICODE_ISLOWER(ch) ||
2715	    Py_UNICODE_ISUPPER(ch) ||
2716	    Py_UNICODE_ISTITLE(ch))
2717	    previous_is_cased = 1;
2718	else
2719	    previous_is_cased = 0;
2720    }
2721    return 1;
2722}
2723
2724PyObject *PyUnicode_Join(PyObject *separator,
2725			 PyObject *seq)
2726{
2727    Py_UNICODE *sep;
2728    int seplen;
2729    PyUnicodeObject *res = NULL;
2730    int reslen = 0;
2731    Py_UNICODE *p;
2732    int sz = 100;
2733    int i;
2734    PyObject *it;
2735
2736    it = PyObject_GetIter(seq);
2737    if (it == NULL)
2738        return NULL;
2739
2740    if (separator == NULL) {
2741	Py_UNICODE blank = ' ';
2742	sep = &blank;
2743	seplen = 1;
2744    }
2745    else {
2746	separator = PyUnicode_FromObject(separator);
2747	if (separator == NULL)
2748	    goto onError;
2749	sep = PyUnicode_AS_UNICODE(separator);
2750	seplen = PyUnicode_GET_SIZE(separator);
2751    }
2752
2753    res = _PyUnicode_New(sz);
2754    if (res == NULL)
2755	goto onError;
2756    p = PyUnicode_AS_UNICODE(res);
2757    reslen = 0;
2758
2759    for (i = 0; ; ++i) {
2760	int itemlen;
2761	PyObject *item = PyIter_Next(it);
2762	if (item == NULL) {
2763	    if (PyErr_Occurred())
2764		goto onError;
2765	    break;
2766	}
2767	if (!PyUnicode_Check(item)) {
2768	    PyObject *v;
2769	    v = PyUnicode_FromObject(item);
2770	    Py_DECREF(item);
2771	    item = v;
2772	    if (item == NULL)
2773		goto onError;
2774	}
2775	itemlen = PyUnicode_GET_SIZE(item);
2776	while (reslen + itemlen + seplen >= sz) {
2777	    if (_PyUnicode_Resize(&res, sz*2))
2778		goto onError;
2779	    sz *= 2;
2780	    p = PyUnicode_AS_UNICODE(res) + reslen;
2781	}
2782	if (i > 0) {
2783	    Py_UNICODE_COPY(p, sep, seplen);
2784	    p += seplen;
2785	    reslen += seplen;
2786	}
2787	Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
2788	p += itemlen;
2789	reslen += itemlen;
2790	Py_DECREF(item);
2791    }
2792    if (_PyUnicode_Resize(&res, reslen))
2793	goto onError;
2794
2795    Py_XDECREF(separator);
2796    Py_DECREF(it);
2797    return (PyObject *)res;
2798
2799 onError:
2800    Py_XDECREF(separator);
2801    Py_XDECREF(res);
2802    Py_DECREF(it);
2803    return NULL;
2804}
2805
2806static
2807PyUnicodeObject *pad(PyUnicodeObject *self,
2808		     int left,
2809		     int right,
2810		     Py_UNICODE fill)
2811{
2812    PyUnicodeObject *u;
2813
2814    if (left < 0)
2815        left = 0;
2816    if (right < 0)
2817        right = 0;
2818
2819    if (left == 0 && right == 0) {
2820        Py_INCREF(self);
2821        return self;
2822    }
2823
2824    u = _PyUnicode_New(left + self->length + right);
2825    if (u) {
2826        if (left)
2827            Py_UNICODE_FILL(u->str, fill, left);
2828        Py_UNICODE_COPY(u->str + left, self->str, self->length);
2829        if (right)
2830            Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2831    }
2832
2833    return u;
2834}
2835
/* Append the slice data[left:right] to `list` as a new Unicode object.

   The macro relies on the expanding function providing a `PyObject *str`
   scratch variable, a `list` object and an `onError` label, to which it
   jumps if the slice cannot be created or appended.  It is wrapped in
   do { } while (0) and parenthesizes its arguments so that it behaves
   as a single statement and is safe with expression arguments. */
#define SPLIT_APPEND(data, left, right)					\
    do {								\
	str = PyUnicode_FromUnicode((data) + (left),			\
				    (right) - (left));			\
	if (!str)							\
	    goto onError;						\
	if (PyList_Append(list, str)) {					\
	    Py_DECREF(str);						\
	    goto onError;						\
	}								\
	Py_DECREF(str);							\
    } while (0)
2846
2847static
2848PyObject *split_whitespace(PyUnicodeObject *self,
2849			   PyObject *list,
2850			   int maxcount)
2851{
2852    register int i;
2853    register int j;
2854    int len = self->length;
2855    PyObject *str;
2856
2857    for (i = j = 0; i < len; ) {
2858	/* find a token */
2859	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2860	    i++;
2861	j = i;
2862	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2863	    i++;
2864	if (j < i) {
2865	    if (maxcount-- <= 0)
2866		break;
2867	    SPLIT_APPEND(self->str, j, i);
2868	    while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2869		i++;
2870	    j = i;
2871	}
2872    }
2873    if (j < len) {
2874	SPLIT_APPEND(self->str, j, len);
2875    }
2876    return list;
2877
2878 onError:
2879    Py_DECREF(list);
2880    return NULL;
2881}
2882
2883PyObject *PyUnicode_Splitlines(PyObject *string,
2884			       int keepends)
2885{
2886    register int i;
2887    register int j;
2888    int len;
2889    PyObject *list;
2890    PyObject *str;
2891    Py_UNICODE *data;
2892
2893    string = PyUnicode_FromObject(string);
2894    if (string == NULL)
2895	return NULL;
2896    data = PyUnicode_AS_UNICODE(string);
2897    len = PyUnicode_GET_SIZE(string);
2898
2899    list = PyList_New(0);
2900    if (!list)
2901        goto onError;
2902
2903    for (i = j = 0; i < len; ) {
2904	int eol;
2905
2906	/* Find a line and append it */
2907	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2908	    i++;
2909
2910	/* Skip the line break reading CRLF as one line break */
2911	eol = i;
2912	if (i < len) {
2913	    if (data[i] == '\r' && i + 1 < len &&
2914		data[i+1] == '\n')
2915		i += 2;
2916	    else
2917		i++;
2918	    if (keepends)
2919		eol = i;
2920	}
2921	SPLIT_APPEND(data, j, eol);
2922	j = i;
2923    }
2924    if (j < len) {
2925	SPLIT_APPEND(data, j, len);
2926    }
2927
2928    Py_DECREF(string);
2929    return list;
2930
2931 onError:
2932    Py_DECREF(list);
2933    Py_DECREF(string);
2934    return NULL;
2935}
2936
2937static
2938PyObject *split_char(PyUnicodeObject *self,
2939		     PyObject *list,
2940		     Py_UNICODE ch,
2941		     int maxcount)
2942{
2943    register int i;
2944    register int j;
2945    int len = self->length;
2946    PyObject *str;
2947
2948    for (i = j = 0; i < len; ) {
2949	if (self->str[i] == ch) {
2950	    if (maxcount-- <= 0)
2951		break;
2952	    SPLIT_APPEND(self->str, j, i);
2953	    i = j = i + 1;
2954	} else
2955	    i++;
2956    }
2957    if (j <= len) {
2958	SPLIT_APPEND(self->str, j, len);
2959    }
2960    return list;
2961
2962 onError:
2963    Py_DECREF(list);
2964    return NULL;
2965}
2966
2967static
2968PyObject *split_substring(PyUnicodeObject *self,
2969			  PyObject *list,
2970			  PyUnicodeObject *substring,
2971			  int maxcount)
2972{
2973    register int i;
2974    register int j;
2975    int len = self->length;
2976    int sublen = substring->length;
2977    PyObject *str;
2978
2979    for (i = j = 0; i <= len - sublen; ) {
2980	if (Py_UNICODE_MATCH(self, i, substring)) {
2981	    if (maxcount-- <= 0)
2982		break;
2983	    SPLIT_APPEND(self->str, j, i);
2984	    i = j = i + sublen;
2985	} else
2986	    i++;
2987    }
2988    if (j <= len) {
2989	SPLIT_APPEND(self->str, j, len);
2990    }
2991    return list;
2992
2993 onError:
2994    Py_DECREF(list);
2995    return NULL;
2996}
2997
2998#undef SPLIT_APPEND
2999
3000static
3001PyObject *split(PyUnicodeObject *self,
3002		PyUnicodeObject *substring,
3003		int maxcount)
3004{
3005    PyObject *list;
3006
3007    if (maxcount < 0)
3008        maxcount = INT_MAX;
3009
3010    list = PyList_New(0);
3011    if (!list)
3012        return NULL;
3013
3014    if (substring == NULL)
3015	return split_whitespace(self,list,maxcount);
3016
3017    else if (substring->length == 1)
3018	return split_char(self,list,substring->str[0],maxcount);
3019
3020    else if (substring->length == 0) {
3021	Py_DECREF(list);
3022	PyErr_SetString(PyExc_ValueError, "empty separator");
3023	return NULL;
3024    }
3025    else
3026	return split_substring(self,list,substring,maxcount);
3027}
3028
3029static
3030PyObject *strip(PyUnicodeObject *self,
3031		int left,
3032		int right)
3033{
3034    Py_UNICODE *p = self->str;
3035    int start = 0;
3036    int end = self->length;
3037
3038    if (left)
3039        while (start < end && Py_UNICODE_ISSPACE(p[start]))
3040            start++;
3041
3042    if (right)
3043        while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3044            end--;
3045
3046    if (start == 0 && end == self->length) {
3047        /* couldn't strip anything off, return original string */
3048        Py_INCREF(self);
3049        return (PyObject*) self;
3050    }
3051
3052    return (PyObject*) PyUnicode_FromUnicode(
3053        self->str + start,
3054        end - start
3055        );
3056}
3057
3058static
3059PyObject *replace(PyUnicodeObject *self,
3060		  PyUnicodeObject *str1,
3061		  PyUnicodeObject *str2,
3062		  int maxcount)
3063{
3064    PyUnicodeObject *u;
3065
3066    if (maxcount < 0)
3067	maxcount = INT_MAX;
3068
3069    if (str1->length == 1 && str2->length == 1) {
3070        int i;
3071
3072        /* replace characters */
3073        if (!findchar(self->str, self->length, str1->str[0])) {
3074            /* nothing to replace, return original string */
3075            Py_INCREF(self);
3076            u = self;
3077        } else {
3078	    Py_UNICODE u1 = str1->str[0];
3079	    Py_UNICODE u2 = str2->str[0];
3080
3081            u = (PyUnicodeObject*) PyUnicode_FromUnicode(
3082                NULL,
3083                self->length
3084                );
3085            if (u != NULL) {
3086		Py_UNICODE_COPY(u->str, self->str,
3087				self->length);
3088                for (i = 0; i < u->length; i++)
3089                    if (u->str[i] == u1) {
3090                        if (--maxcount < 0)
3091                            break;
3092                        u->str[i] = u2;
3093                    }
3094        }
3095        }
3096
3097    } else {
3098        int n, i;
3099        Py_UNICODE *p;
3100
3101        /* replace strings */
3102        n = count(self, 0, self->length, str1);
3103        if (n > maxcount)
3104            n = maxcount;
3105        if (n == 0) {
3106            /* nothing to replace, return original string */
3107            Py_INCREF(self);
3108            u = self;
3109        } else {
3110            u = _PyUnicode_New(
3111                self->length + n * (str2->length - str1->length));
3112            if (u) {
3113                i = 0;
3114                p = u->str;
3115                while (i <= self->length - str1->length)
3116                    if (Py_UNICODE_MATCH(self, i, str1)) {
3117                        /* replace string segment */
3118                        Py_UNICODE_COPY(p, str2->str, str2->length);
3119                        p += str2->length;
3120                        i += str1->length;
3121                        if (--n <= 0) {
3122                            /* copy remaining part */
3123                            Py_UNICODE_COPY(p, self->str+i, self->length-i);
3124                            break;
3125                        }
3126                    } else
3127                        *p++ = self->str[i++];
3128            }
3129        }
3130    }
3131
3132    return (PyObject *) u;
3133}
3134
3135/* --- Unicode Object Methods --------------------------------------------- */
3136
3137static char title__doc__[] =
3138"S.title() -> unicode\n\
3139\n\
3140Return a titlecased version of S, i.e. words start with title case\n\
3141characters, all remaining cased characters have lower case.";
3142
3143static PyObject*
3144unicode_title(PyUnicodeObject *self, PyObject *args)
3145{
3146    if (!PyArg_NoArgs(args))
3147        return NULL;
3148    return fixup(self, fixtitle);
3149}
3150
3151static char capitalize__doc__[] =
3152"S.capitalize() -> unicode\n\
3153\n\
3154Return a capitalized version of S, i.e. make the first character\n\
3155have upper case.";
3156
3157static PyObject*
3158unicode_capitalize(PyUnicodeObject *self, PyObject *args)
3159{
3160    if (!PyArg_NoArgs(args))
3161        return NULL;
3162    return fixup(self, fixcapitalize);
3163}
3164
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').";

/* Disabled method implementation: split at whitespace, capitalize each
   word via fixup()/fixcapitalize, and join the words with the default
   separator of PyUnicode_Join(). */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *joined = NULL;
    int k;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entries in place */
    for (k = 0; k < PyList_GET_SIZE(words); k++) {
        PyObject *fixed = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, k),
                                fixcapitalize);
        if (fixed == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, k));
        PyList_SET_ITEM(words, k, fixed);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
3205
3206static char center__doc__[] =
3207"S.center(width) -> unicode\n\
3208\n\
3209Return S centered in a Unicode string of length width. Padding is done\n\
3210using spaces.";
3211
3212static PyObject *
3213unicode_center(PyUnicodeObject *self, PyObject *args)
3214{
3215    int marg, left;
3216    int width;
3217
3218    if (!PyArg_ParseTuple(args, "i:center", &width))
3219        return NULL;
3220
3221    if (self->length >= width) {
3222        Py_INCREF(self);
3223        return (PyObject*) self;
3224    }
3225
3226    marg = width - self->length;
3227    left = marg / 2 + (marg & width & 1);
3228
3229    return (PyObject*) pad(self, left, marg - left, ' ');
3230}
3231
3232#if 0
3233
3234/* This code should go into some future Unicode collation support
3235   module. The basic comparison should compare ordinals on a naive
3236   basis (this is what Java does and thus JPython too). */
3237
3238/* speedy UTF-16 code point order comparison */
3239/* gleaned from: */
3240/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3241
3242static short utf16Fixup[32] =
3243{
3244    0, 0, 0, 0, 0, 0, 0, 0,
3245    0, 0, 0, 0, 0, 0, 0, 0,
3246    0, 0, 0, 0, 0, 0, 0, 0,
3247    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
3248};
3249
3250static int
3251unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3252{
3253    int len1, len2;
3254
3255    Py_UNICODE *s1 = str1->str;
3256    Py_UNICODE *s2 = str2->str;
3257
3258    len1 = str1->length;
3259    len2 = str2->length;
3260
3261    while (len1 > 0 && len2 > 0) {
3262        Py_UNICODE c1, c2;
3263	long diff;
3264
3265        c1 = *s1++;
3266        c2 = *s2++;
3267	if (c1 > (1<<11) * 26)
3268	    c1 += utf16Fixup[c1>>11];
3269	if (c2 > (1<<11) * 26)
3270            c2 += utf16Fixup[c2>>11];
3271
3272        /* now c1 and c2 are in UTF-32-compatible order */
3273        diff = (long)c1 - (long)c2;
3274        if (diff)
3275            return (diff < 0) ? -1 : (diff != 0);
3276        len1--; len2--;
3277    }
3278
3279    return (len1 < len2) ? -1 : (len1 != len2);
3280}
3281
3282#else
3283
3284static int
3285unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3286{
3287    register int len1, len2;
3288
3289    Py_UNICODE *s1 = str1->str;
3290    Py_UNICODE *s2 = str2->str;
3291
3292    len1 = str1->length;
3293    len2 = str2->length;
3294
3295    while (len1 > 0 && len2 > 0) {
3296	register long diff;
3297
3298        diff = (long)*s1++ - (long)*s2++;
3299        if (diff)
3300            return (diff < 0) ? -1 : (diff != 0);
3301        len1--; len2--;
3302    }
3303
3304    return (len1 < len2) ? -1 : (len1 != len2);
3305}
3306
3307#endif
3308
3309int PyUnicode_Compare(PyObject *left,
3310		      PyObject *right)
3311{
3312    PyUnicodeObject *u = NULL, *v = NULL;
3313    int result;
3314
3315    /* Coerce the two arguments */
3316    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3317    if (u == NULL)
3318	goto onError;
3319    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3320    if (v == NULL)
3321	goto onError;
3322
3323    /* Shortcut for empty or interned objects */
3324    if (v == u) {
3325	Py_DECREF(u);
3326	Py_DECREF(v);
3327	return 0;
3328    }
3329
3330    result = unicode_compare(u, v);
3331
3332    Py_DECREF(u);
3333    Py_DECREF(v);
3334    return result;
3335
3336onError:
3337    Py_XDECREF(u);
3338    Py_XDECREF(v);
3339    return -1;
3340}
3341
3342int PyUnicode_Contains(PyObject *container,
3343		       PyObject *element)
3344{
3345    PyUnicodeObject *u = NULL, *v = NULL;
3346    int result;
3347    register const Py_UNICODE *p, *e;
3348    register Py_UNICODE ch;
3349
3350    /* Coerce the two arguments */
3351    v = (PyUnicodeObject *)PyUnicode_FromObject(element);
3352    if (v == NULL) {
3353	PyErr_SetString(PyExc_TypeError,
3354	    "'in <string>' requires character as left operand");
3355	goto onError;
3356    }
3357    u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3358    if (u == NULL) {
3359	Py_DECREF(v);
3360	goto onError;
3361    }
3362
3363    /* Check v in u */
3364    if (PyUnicode_GET_SIZE(v) != 1) {
3365	PyErr_SetString(PyExc_TypeError,
3366	    "'in <string>' requires character as left operand");
3367	goto onError;
3368    }
3369    ch = *PyUnicode_AS_UNICODE(v);
3370    p = PyUnicode_AS_UNICODE(u);
3371    e = p + PyUnicode_GET_SIZE(u);
3372    result = 0;
3373    while (p < e) {
3374	if (*p++ == ch) {
3375	    result = 1;
3376	    break;
3377	}
3378    }
3379
3380    Py_DECREF(u);
3381    Py_DECREF(v);
3382    return result;
3383
3384onError:
3385    Py_XDECREF(u);
3386    Py_XDECREF(v);
3387    return -1;
3388}
3389
3390/* Concat to string or Unicode object giving a new Unicode object. */
3391
3392PyObject *PyUnicode_Concat(PyObject *left,
3393			   PyObject *right)
3394{
3395    PyUnicodeObject *u = NULL, *v = NULL, *w;
3396
3397    /* Coerce the two arguments */
3398    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3399    if (u == NULL)
3400	goto onError;
3401    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3402    if (v == NULL)
3403	goto onError;
3404
3405    /* Shortcuts */
3406    if (v == unicode_empty) {
3407	Py_DECREF(v);
3408	return (PyObject *)u;
3409    }
3410    if (u == unicode_empty) {
3411	Py_DECREF(u);
3412	return (PyObject *)v;
3413    }
3414
3415    /* Concat the two Unicode strings */
3416    w = _PyUnicode_New(u->length + v->length);
3417    if (w == NULL)
3418	goto onError;
3419    Py_UNICODE_COPY(w->str, u->str, u->length);
3420    Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3421
3422    Py_DECREF(u);
3423    Py_DECREF(v);
3424    return (PyObject *)w;
3425
3426onError:
3427    Py_XDECREF(u);
3428    Py_XDECREF(v);
3429    return NULL;
3430}
3431
3432static char count__doc__[] =
3433"S.count(sub[, start[, end]]) -> int\n\
3434\n\
3435Return the number of occurrences of substring sub in Unicode string\n\
3436S[start:end].  Optional arguments start and end are\n\
3437interpreted as in slice notation.";
3438
3439static PyObject *
3440unicode_count(PyUnicodeObject *self, PyObject *args)
3441{
3442    PyUnicodeObject *substring;
3443    int start = 0;
3444    int end = INT_MAX;
3445    PyObject *result;
3446
3447    if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3448		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3449        return NULL;
3450
3451    substring = (PyUnicodeObject *)PyUnicode_FromObject(
3452						(PyObject *)substring);
3453    if (substring == NULL)
3454	return NULL;
3455
3456    if (start < 0)
3457        start += self->length;
3458    if (start < 0)
3459        start = 0;
3460    if (end > self->length)
3461        end = self->length;
3462    if (end < 0)
3463        end += self->length;
3464    if (end < 0)
3465        end = 0;
3466
3467    result = PyInt_FromLong((long) count(self, start, end, substring));
3468
3469    Py_DECREF(substring);
3470    return result;
3471}
3472
3473static char encode__doc__[] =
3474"S.encode([encoding[,errors]]) -> string\n\
3475\n\
3476Return an encoded string version of S. Default encoding is the current\n\
3477default string encoding. errors may be given to set a different error\n\
3478handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3479a ValueError. Other possible values are 'ignore' and 'replace'.";
3480
3481static PyObject *
3482unicode_encode(PyUnicodeObject *self, PyObject *args)
3483{
3484    char *encoding = NULL;
3485    char *errors = NULL;
3486    if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3487        return NULL;
3488    return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3489}
3490
3491static char expandtabs__doc__[] =
3492"S.expandtabs([tabsize]) -> unicode\n\
3493\n\
3494Return a copy of S where all tab characters are expanded using spaces.\n\
3495If tabsize is not given, a tab size of 8 characters is assumed.";
3496
3497static PyObject*
3498unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3499{
3500    Py_UNICODE *e;
3501    Py_UNICODE *p;
3502    Py_UNICODE *q;
3503    int i, j;
3504    PyUnicodeObject *u;
3505    int tabsize = 8;
3506
3507    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3508	return NULL;
3509
3510    /* First pass: determine size of output string */
3511    i = j = 0;
3512    e = self->str + self->length;
3513    for (p = self->str; p < e; p++)
3514        if (*p == '\t') {
3515	    if (tabsize > 0)
3516		j += tabsize - (j % tabsize);
3517	}
3518        else {
3519            j++;
3520            if (*p == '\n' || *p == '\r') {
3521                i += j;
3522                j = 0;
3523            }
3524        }
3525
3526    /* Second pass: create output string and fill it */
3527    u = _PyUnicode_New(i + j);
3528    if (!u)
3529        return NULL;
3530
3531    j = 0;
3532    q = u->str;
3533
3534    for (p = self->str; p < e; p++)
3535        if (*p == '\t') {
3536	    if (tabsize > 0) {
3537		i = tabsize - (j % tabsize);
3538		j += i;
3539		while (i--)
3540		    *q++ = ' ';
3541	    }
3542	}
3543	else {
3544            j++;
3545	    *q++ = *p;
3546            if (*p == '\n' || *p == '\r')
3547                j = 0;
3548        }
3549
3550    return (PyObject*) u;
3551}
3552
3553static char find__doc__[] =
3554"S.find(sub [,start [,end]]) -> int\n\
3555\n\
3556Return the lowest index in S where substring sub is found,\n\
3557such that sub is contained within s[start,end].  Optional\n\
3558arguments start and end are interpreted as in slice notation.\n\
3559\n\
3560Return -1 on failure.";
3561
3562static PyObject *
3563unicode_find(PyUnicodeObject *self, PyObject *args)
3564{
3565    PyUnicodeObject *substring;
3566    int start = 0;
3567    int end = INT_MAX;
3568    PyObject *result;
3569
3570    if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3571		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3572        return NULL;
3573    substring = (PyUnicodeObject *)PyUnicode_FromObject(
3574						(PyObject *)substring);
3575    if (substring == NULL)
3576	return NULL;
3577
3578    result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3579
3580    Py_DECREF(substring);
3581    return result;
3582}
3583
3584static PyObject *
3585unicode_getitem(PyUnicodeObject *self, int index)
3586{
3587    if (index < 0 || index >= self->length) {
3588        PyErr_SetString(PyExc_IndexError, "string index out of range");
3589        return NULL;
3590    }
3591
3592    return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3593}
3594
3595static long
3596unicode_hash(PyUnicodeObject *self)
3597{
3598    /* Since Unicode objects compare equal to their ASCII string
3599       counterparts, they should use the individual character values
3600       as basis for their hash value.  This is needed to assure that
3601       strings and Unicode objects behave in the same way as
3602       dictionary keys. */
3603
3604    register int len;
3605    register Py_UNICODE *p;
3606    register long x;
3607
3608    if (self->hash != -1)
3609	return self->hash;
3610    len = PyUnicode_GET_SIZE(self);
3611    p = PyUnicode_AS_UNICODE(self);
3612    x = *p << 7;
3613    while (--len >= 0)
3614	x = (1000003*x) ^ *p++;
3615    x ^= PyUnicode_GET_SIZE(self);
3616    if (x == -1)
3617	x = -2;
3618    self->hash = x;
3619    return x;
3620}
3621
3622static char index__doc__[] =
3623"S.index(sub [,start [,end]]) -> int\n\
3624\n\
3625Like S.find() but raise ValueError when the substring is not found.";
3626
3627static PyObject *
3628unicode_index(PyUnicodeObject *self, PyObject *args)
3629{
3630    int result;
3631    PyUnicodeObject *substring;
3632    int start = 0;
3633    int end = INT_MAX;
3634
3635    if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3636		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3637        return NULL;
3638
3639    substring = (PyUnicodeObject *)PyUnicode_FromObject(
3640						(PyObject *)substring);
3641    if (substring == NULL)
3642	return NULL;
3643
3644    result = findstring(self, substring, start, end, 1);
3645
3646    Py_DECREF(substring);
3647    if (result < 0) {
3648        PyErr_SetString(PyExc_ValueError, "substring not found");
3649        return NULL;
3650    }
3651    return PyInt_FromLong(result);
3652}
3653
3654static char islower__doc__[] =
3655"S.islower() -> int\n\
3656\n\
3657Return 1 if  all cased characters in S are lowercase and there is\n\
3658at least one cased character in S, 0 otherwise.";
3659
3660static PyObject*
3661unicode_islower(PyUnicodeObject *self, PyObject *args)
3662{
3663    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3664    register const Py_UNICODE *e;
3665    int cased;
3666
3667    if (!PyArg_NoArgs(args))
3668        return NULL;
3669
3670    /* Shortcut for single character strings */
3671    if (PyUnicode_GET_SIZE(self) == 1)
3672	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3673
3674    /* Special case for empty strings */
3675    if (PyString_GET_SIZE(self) == 0)
3676	return PyInt_FromLong(0);
3677
3678    e = p + PyUnicode_GET_SIZE(self);
3679    cased = 0;
3680    for (; p < e; p++) {
3681	register const Py_UNICODE ch = *p;
3682
3683	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3684	    return PyInt_FromLong(0);
3685	else if (!cased && Py_UNICODE_ISLOWER(ch))
3686	    cased = 1;
3687    }
3688    return PyInt_FromLong(cased);
3689}
3690
3691static char isupper__doc__[] =
3692"S.isupper() -> int\n\
3693\n\
3694Return 1 if  all cased characters in S are uppercase and there is\n\
3695at least one cased character in S, 0 otherwise.";
3696
3697static PyObject*
3698unicode_isupper(PyUnicodeObject *self, PyObject *args)
3699{
3700    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3701    register const Py_UNICODE *e;
3702    int cased;
3703
3704    if (!PyArg_NoArgs(args))
3705        return NULL;
3706
3707    /* Shortcut for single character strings */
3708    if (PyUnicode_GET_SIZE(self) == 1)
3709	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3710
3711    /* Special case for empty strings */
3712    if (PyString_GET_SIZE(self) == 0)
3713	return PyInt_FromLong(0);
3714
3715    e = p + PyUnicode_GET_SIZE(self);
3716    cased = 0;
3717    for (; p < e; p++) {
3718	register const Py_UNICODE ch = *p;
3719
3720	if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3721	    return PyInt_FromLong(0);
3722	else if (!cased && Py_UNICODE_ISUPPER(ch))
3723	    cased = 1;
3724    }
3725    return PyInt_FromLong(cased);
3726}
3727
3728static char istitle__doc__[] =
3729"S.istitle() -> int\n\
3730\n\
3731Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3732may only follow uncased characters and lowercase characters only cased\n\
3733ones. Return 0 otherwise.";
3734
3735static PyObject*
3736unicode_istitle(PyUnicodeObject *self, PyObject *args)
3737{
3738    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3739    register const Py_UNICODE *e;
3740    int cased, previous_is_cased;
3741
3742    if (!PyArg_NoArgs(args))
3743        return NULL;
3744
3745    /* Shortcut for single character strings */
3746    if (PyUnicode_GET_SIZE(self) == 1)
3747	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3748			      (Py_UNICODE_ISUPPER(*p) != 0));
3749
3750    /* Special case for empty strings */
3751    if (PyString_GET_SIZE(self) == 0)
3752	return PyInt_FromLong(0);
3753
3754    e = p + PyUnicode_GET_SIZE(self);
3755    cased = 0;
3756    previous_is_cased = 0;
3757    for (; p < e; p++) {
3758	register const Py_UNICODE ch = *p;
3759
3760	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3761	    if (previous_is_cased)
3762		return PyInt_FromLong(0);
3763	    previous_is_cased = 1;
3764	    cased = 1;
3765	}
3766	else if (Py_UNICODE_ISLOWER(ch)) {
3767	    if (!previous_is_cased)
3768		return PyInt_FromLong(0);
3769	    previous_is_cased = 1;
3770	    cased = 1;
3771	}
3772	else
3773	    previous_is_cased = 0;
3774    }
3775    return PyInt_FromLong(cased);
3776}
3777
3778static char isspace__doc__[] =
3779"S.isspace() -> int\n\
3780\n\
3781Return 1 if there are only whitespace characters in S,\n\
37820 otherwise.";
3783
3784static PyObject*
3785unicode_isspace(PyUnicodeObject *self, PyObject *args)
3786{
3787    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3788    register const Py_UNICODE *e;
3789
3790    if (!PyArg_NoArgs(args))
3791        return NULL;
3792
3793    /* Shortcut for single character strings */
3794    if (PyUnicode_GET_SIZE(self) == 1 &&
3795	Py_UNICODE_ISSPACE(*p))
3796	return PyInt_FromLong(1);
3797
3798    /* Special case for empty strings */
3799    if (PyString_GET_SIZE(self) == 0)
3800	return PyInt_FromLong(0);
3801
3802    e = p + PyUnicode_GET_SIZE(self);
3803    for (; p < e; p++) {
3804	if (!Py_UNICODE_ISSPACE(*p))
3805	    return PyInt_FromLong(0);
3806    }
3807    return PyInt_FromLong(1);
3808}
3809
3810static char isalpha__doc__[] =
3811"S.isalpha() -> int\n\
3812\n\
3813Return 1 if  all characters in S are alphabetic\n\
3814and there is at least one character in S, 0 otherwise.";
3815
3816static PyObject*
3817unicode_isalpha(PyUnicodeObject *self, PyObject *args)
3818{
3819    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3820    register const Py_UNICODE *e;
3821
3822    if (!PyArg_NoArgs(args))
3823        return NULL;
3824
3825    /* Shortcut for single character strings */
3826    if (PyUnicode_GET_SIZE(self) == 1 &&
3827	Py_UNICODE_ISALPHA(*p))
3828	return PyInt_FromLong(1);
3829
3830    /* Special case for empty strings */
3831    if (PyString_GET_SIZE(self) == 0)
3832	return PyInt_FromLong(0);
3833
3834    e = p + PyUnicode_GET_SIZE(self);
3835    for (; p < e; p++) {
3836	if (!Py_UNICODE_ISALPHA(*p))
3837	    return PyInt_FromLong(0);
3838    }
3839    return PyInt_FromLong(1);
3840}
3841
3842static char isalnum__doc__[] =
3843"S.isalnum() -> int\n\
3844\n\
3845Return 1 if  all characters in S are alphanumeric\n\
3846and there is at least one character in S, 0 otherwise.";
3847
3848static PyObject*
3849unicode_isalnum(PyUnicodeObject *self, PyObject *args)
3850{
3851    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3852    register const Py_UNICODE *e;
3853
3854    if (!PyArg_NoArgs(args))
3855        return NULL;
3856
3857    /* Shortcut for single character strings */
3858    if (PyUnicode_GET_SIZE(self) == 1 &&
3859	Py_UNICODE_ISALNUM(*p))
3860	return PyInt_FromLong(1);
3861
3862    /* Special case for empty strings */
3863    if (PyString_GET_SIZE(self) == 0)
3864	return PyInt_FromLong(0);
3865
3866    e = p + PyUnicode_GET_SIZE(self);
3867    for (; p < e; p++) {
3868	if (!Py_UNICODE_ISALNUM(*p))
3869	    return PyInt_FromLong(0);
3870    }
3871    return PyInt_FromLong(1);
3872}
3873
3874static char isdecimal__doc__[] =
3875"S.isdecimal() -> int\n\
3876\n\
3877Return 1 if there are only decimal characters in S,\n\
38780 otherwise.";
3879
3880static PyObject*
3881unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3882{
3883    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3884    register const Py_UNICODE *e;
3885
3886    if (!PyArg_NoArgs(args))
3887        return NULL;
3888
3889    /* Shortcut for single character strings */
3890    if (PyUnicode_GET_SIZE(self) == 1 &&
3891	Py_UNICODE_ISDECIMAL(*p))
3892	return PyInt_FromLong(1);
3893
3894    /* Special case for empty strings */
3895    if (PyString_GET_SIZE(self) == 0)
3896	return PyInt_FromLong(0);
3897
3898    e = p + PyUnicode_GET_SIZE(self);
3899    for (; p < e; p++) {
3900	if (!Py_UNICODE_ISDECIMAL(*p))
3901	    return PyInt_FromLong(0);
3902    }
3903    return PyInt_FromLong(1);
3904}
3905
3906static char isdigit__doc__[] =
3907"S.isdigit() -> int\n\
3908\n\
3909Return 1 if there are only digit characters in S,\n\
39100 otherwise.";
3911
3912static PyObject*
3913unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3914{
3915    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3916    register const Py_UNICODE *e;
3917
3918    if (!PyArg_NoArgs(args))
3919        return NULL;
3920
3921    /* Shortcut for single character strings */
3922    if (PyUnicode_GET_SIZE(self) == 1 &&
3923	Py_UNICODE_ISDIGIT(*p))
3924	return PyInt_FromLong(1);
3925
3926    /* Special case for empty strings */
3927    if (PyString_GET_SIZE(self) == 0)
3928	return PyInt_FromLong(0);
3929
3930    e = p + PyUnicode_GET_SIZE(self);
3931    for (; p < e; p++) {
3932	if (!Py_UNICODE_ISDIGIT(*p))
3933	    return PyInt_FromLong(0);
3934    }
3935    return PyInt_FromLong(1);
3936}
3937
3938static char isnumeric__doc__[] =
3939"S.isnumeric() -> int\n\
3940\n\
3941Return 1 if there are only numeric characters in S,\n\
39420 otherwise.";
3943
3944static PyObject*
3945unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3946{
3947    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3948    register const Py_UNICODE *e;
3949
3950    if (!PyArg_NoArgs(args))
3951        return NULL;
3952
3953    /* Shortcut for single character strings */
3954    if (PyUnicode_GET_SIZE(self) == 1 &&
3955	Py_UNICODE_ISNUMERIC(*p))
3956	return PyInt_FromLong(1);
3957
3958    /* Special case for empty strings */
3959    if (PyString_GET_SIZE(self) == 0)
3960	return PyInt_FromLong(0);
3961
3962    e = p + PyUnicode_GET_SIZE(self);
3963    for (; p < e; p++) {
3964	if (!Py_UNICODE_ISNUMERIC(*p))
3965	    return PyInt_FromLong(0);
3966    }
3967    return PyInt_FromLong(1);
3968}
3969
3970static char join__doc__[] =
3971"S.join(sequence) -> unicode\n\
3972\n\
3973Return a string which is the concatenation of the strings in the\n\
3974sequence.  The separator between elements is S.";
3975
3976static PyObject*
3977unicode_join(PyUnicodeObject *self, PyObject *args)
3978{
3979    PyObject *data;
3980    if (!PyArg_ParseTuple(args, "O:join", &data))
3981        return NULL;
3982
3983    return PyUnicode_Join((PyObject *)self, data);
3984}
3985
3986static int
3987unicode_length(PyUnicodeObject *self)
3988{
3989    return self->length;
3990}
3991
3992static char ljust__doc__[] =
3993"S.ljust(width) -> unicode\n\
3994\n\
3995Return S left justified in a Unicode string of length width. Padding is\n\
3996done using spaces.";
3997
3998static PyObject *
3999unicode_ljust(PyUnicodeObject *self, PyObject *args)
4000{
4001    int width;
4002    if (!PyArg_ParseTuple(args, "i:ljust", &width))
4003        return NULL;
4004
4005    if (self->length >= width) {
4006        Py_INCREF(self);
4007        return (PyObject*) self;
4008    }
4009
4010    return (PyObject*) pad(self, 0, width - self->length, ' ');
4011}
4012
4013static char lower__doc__[] =
4014"S.lower() -> unicode\n\
4015\n\
4016Return a copy of the string S converted to lowercase.";
4017
4018static PyObject*
4019unicode_lower(PyUnicodeObject *self, PyObject *args)
4020{
4021    if (!PyArg_NoArgs(args))
4022        return NULL;
4023    return fixup(self, fixlower);
4024}
4025
4026static char lstrip__doc__[] =
4027"S.lstrip() -> unicode\n\
4028\n\
4029Return a copy of the string S with leading whitespace removed.";
4030
4031static PyObject *
4032unicode_lstrip(PyUnicodeObject *self, PyObject *args)
4033{
4034    if (!PyArg_NoArgs(args))
4035        return NULL;
4036    return strip(self, 1, 0);
4037}
4038
4039static PyObject*
4040unicode_repeat(PyUnicodeObject *str, int len)
4041{
4042    PyUnicodeObject *u;
4043    Py_UNICODE *p;
4044    int nchars;
4045    size_t nbytes;
4046
4047    if (len < 0)
4048        len = 0;
4049
4050    if (len == 1) {
4051        /* no repeat, return original string */
4052        Py_INCREF(str);
4053        return (PyObject*) str;
4054    }
4055
4056    /* ensure # of chars needed doesn't overflow int and # of bytes
4057     * needed doesn't overflow size_t
4058     */
4059    nchars = len * str->length;
4060    if (len && nchars / len != str->length) {
4061        PyErr_SetString(PyExc_OverflowError,
4062                        "repeated string is too long");
4063        return NULL;
4064    }
4065    nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4066    if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4067        PyErr_SetString(PyExc_OverflowError,
4068                        "repeated string is too long");
4069        return NULL;
4070    }
4071    u = _PyUnicode_New(nchars);
4072    if (!u)
4073        return NULL;
4074
4075    p = u->str;
4076
4077    while (len-- > 0) {
4078        Py_UNICODE_COPY(p, str->str, str->length);
4079        p += str->length;
4080    }
4081
4082    return (PyObject*) u;
4083}
4084
4085PyObject *PyUnicode_Replace(PyObject *obj,
4086			    PyObject *subobj,
4087			    PyObject *replobj,
4088			    int maxcount)
4089{
4090    PyObject *self;
4091    PyObject *str1;
4092    PyObject *str2;
4093    PyObject *result;
4094
4095    self = PyUnicode_FromObject(obj);
4096    if (self == NULL)
4097	return NULL;
4098    str1 = PyUnicode_FromObject(subobj);
4099    if (str1 == NULL) {
4100	Py_DECREF(self);
4101	return NULL;
4102    }
4103    str2 = PyUnicode_FromObject(replobj);
4104    if (str2 == NULL) {
4105	Py_DECREF(self);
4106	Py_DECREF(str1);
4107	return NULL;
4108    }
4109    result = replace((PyUnicodeObject *)self,
4110		     (PyUnicodeObject *)str1,
4111		     (PyUnicodeObject *)str2,
4112		     maxcount);
4113    Py_DECREF(self);
4114    Py_DECREF(str1);
4115    Py_DECREF(str2);
4116    return result;
4117}
4118
4119static char replace__doc__[] =
4120"S.replace (old, new[, maxsplit]) -> unicode\n\
4121\n\
4122Return a copy of S with all occurrences of substring\n\
4123old replaced by new.  If the optional argument maxsplit is\n\
4124given, only the first maxsplit occurrences are replaced.";
4125
4126static PyObject*
4127unicode_replace(PyUnicodeObject *self, PyObject *args)
4128{
4129    PyUnicodeObject *str1;
4130    PyUnicodeObject *str2;
4131    int maxcount = -1;
4132    PyObject *result;
4133
4134    if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4135        return NULL;
4136    str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4137    if (str1 == NULL)
4138	return NULL;
4139    str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4140    if (str2 == NULL)
4141	return NULL;
4142
4143    result = replace(self, str1, str2, maxcount);
4144
4145    Py_DECREF(str1);
4146    Py_DECREF(str2);
4147    return result;
4148}
4149
4150static
4151PyObject *unicode_repr(PyObject *unicode)
4152{
4153    return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4154				PyUnicode_GET_SIZE(unicode),
4155				1);
4156}
4157
4158static char rfind__doc__[] =
4159"S.rfind(sub [,start [,end]]) -> int\n\
4160\n\
4161Return the highest index in S where substring sub is found,\n\
4162such that sub is contained within s[start,end].  Optional\n\
4163arguments start and end are interpreted as in slice notation.\n\
4164\n\
4165Return -1 on failure.";
4166
4167static PyObject *
4168unicode_rfind(PyUnicodeObject *self, PyObject *args)
4169{
4170    PyUnicodeObject *substring;
4171    int start = 0;
4172    int end = INT_MAX;
4173    PyObject *result;
4174
4175    if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4176		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4177        return NULL;
4178    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4179						(PyObject *)substring);
4180    if (substring == NULL)
4181	return NULL;
4182
4183    result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4184
4185    Py_DECREF(substring);
4186    return result;
4187}
4188
4189static char rindex__doc__[] =
4190"S.rindex(sub [,start [,end]]) -> int\n\
4191\n\
4192Like S.rfind() but raise ValueError when the substring is not found.";
4193
4194static PyObject *
4195unicode_rindex(PyUnicodeObject *self, PyObject *args)
4196{
4197    int result;
4198    PyUnicodeObject *substring;
4199    int start = 0;
4200    int end = INT_MAX;
4201
4202    if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4203		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4204        return NULL;
4205    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4206						(PyObject *)substring);
4207    if (substring == NULL)
4208	return NULL;
4209
4210    result = findstring(self, substring, start, end, -1);
4211
4212    Py_DECREF(substring);
4213    if (result < 0) {
4214        PyErr_SetString(PyExc_ValueError, "substring not found");
4215        return NULL;
4216    }
4217    return PyInt_FromLong(result);
4218}
4219
4220static char rjust__doc__[] =
4221"S.rjust(width) -> unicode\n\
4222\n\
4223Return S right justified in a Unicode string of length width. Padding is\n\
4224done using spaces.";
4225
4226static PyObject *
4227unicode_rjust(PyUnicodeObject *self, PyObject *args)
4228{
4229    int width;
4230    if (!PyArg_ParseTuple(args, "i:rjust", &width))
4231        return NULL;
4232
4233    if (self->length >= width) {
4234        Py_INCREF(self);
4235        return (PyObject*) self;
4236    }
4237
4238    return (PyObject*) pad(self, width - self->length, 0, ' ');
4239}
4240
4241static char rstrip__doc__[] =
4242"S.rstrip() -> unicode\n\
4243\n\
4244Return a copy of the string S with trailing whitespace removed.";
4245
4246static PyObject *
4247unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4248{
4249    if (!PyArg_NoArgs(args))
4250        return NULL;
4251    return strip(self, 0, 1);
4252}
4253
4254static PyObject*
4255unicode_slice(PyUnicodeObject *self, int start, int end)
4256{
4257    /* standard clamping */
4258    if (start < 0)
4259        start = 0;
4260    if (end < 0)
4261        end = 0;
4262    if (end > self->length)
4263        end = self->length;
4264    if (start == 0 && end == self->length) {
4265        /* full slice, return original string */
4266        Py_INCREF(self);
4267        return (PyObject*) self;
4268    }
4269    if (start > end)
4270        start = end;
4271    /* copy slice */
4272    return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4273					     end - start);
4274}
4275
4276PyObject *PyUnicode_Split(PyObject *s,
4277			  PyObject *sep,
4278			  int maxsplit)
4279{
4280    PyObject *result;
4281
4282    s = PyUnicode_FromObject(s);
4283    if (s == NULL)
4284	return NULL;
4285    if (sep != NULL) {
4286	sep = PyUnicode_FromObject(sep);
4287	if (sep == NULL) {
4288	    Py_DECREF(s);
4289	    return NULL;
4290	}
4291    }
4292
4293    result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4294
4295    Py_DECREF(s);
4296    Py_XDECREF(sep);
4297    return result;
4298}
4299
4300static char split__doc__[] =
4301"S.split([sep [,maxsplit]]) -> list of strings\n\
4302\n\
4303Return a list of the words in S, using sep as the\n\
4304delimiter string.  If maxsplit is given, at most maxsplit\n\
4305splits are done. If sep is not specified, any whitespace string\n\
4306is a separator.";
4307
4308static PyObject*
4309unicode_split(PyUnicodeObject *self, PyObject *args)
4310{
4311    PyObject *substring = Py_None;
4312    int maxcount = -1;
4313
4314    if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4315        return NULL;
4316
4317    if (substring == Py_None)
4318	return split(self, NULL, maxcount);
4319    else if (PyUnicode_Check(substring))
4320	return split(self, (PyUnicodeObject *)substring, maxcount);
4321    else
4322	return PyUnicode_Split((PyObject *)self, substring, maxcount);
4323}
4324
4325static char splitlines__doc__[] =
4326"S.splitlines([keepends]]) -> list of strings\n\
4327\n\
4328Return a list of the lines in S, breaking at line boundaries.\n\
4329Line breaks are not included in the resulting list unless keepends\n\
4330is given and true.";
4331
4332static PyObject*
4333unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4334{
4335    int keepends = 0;
4336
4337    if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
4338        return NULL;
4339
4340    return PyUnicode_Splitlines((PyObject *)self, keepends);
4341}
4342
4343static
4344PyObject *unicode_str(PyUnicodeObject *self)
4345{
4346    return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
4347}
4348
4349static char strip__doc__[] =
4350"S.strip() -> unicode\n\
4351\n\
4352Return a copy of S with leading and trailing whitespace removed.";
4353
4354static PyObject *
4355unicode_strip(PyUnicodeObject *self, PyObject *args)
4356{
4357    if (!PyArg_NoArgs(args))
4358        return NULL;
4359    return strip(self, 1, 1);
4360}
4361
4362static char swapcase__doc__[] =
4363"S.swapcase() -> unicode\n\
4364\n\
4365Return a copy of S with uppercase characters converted to lowercase\n\
4366and vice versa.";
4367
4368static PyObject*
4369unicode_swapcase(PyUnicodeObject *self, PyObject *args)
4370{
4371    if (!PyArg_NoArgs(args))
4372        return NULL;
4373    return fixup(self, fixswapcase);
4374}
4375
4376static char translate__doc__[] =
4377"S.translate(table) -> unicode\n\
4378\n\
4379Return a copy of the string S, where all characters have been mapped\n\
4380through the given translation table, which must be a mapping of\n\
4381Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4382are left untouched. Characters mapped to None are deleted.";
4383
4384static PyObject*
4385unicode_translate(PyUnicodeObject *self, PyObject *args)
4386{
4387    PyObject *table;
4388
4389    if (!PyArg_ParseTuple(args, "O:translate", &table))
4390	return NULL;
4391    return PyUnicode_TranslateCharmap(self->str,
4392				      self->length,
4393				      table,
4394				      "ignore");
4395}
4396
4397static char upper__doc__[] =
4398"S.upper() -> unicode\n\
4399\n\
4400Return a copy of S converted to uppercase.";
4401
4402static PyObject*
4403unicode_upper(PyUnicodeObject *self, PyObject *args)
4404{
4405    if (!PyArg_NoArgs(args))
4406        return NULL;
4407    return fixup(self, fixupper);
4408}
4409
4410#if 0
4411static char zfill__doc__[] =
4412"S.zfill(width) -> unicode\n\
4413\n\
4414Pad a numeric string x with zeros on the left, to fill a field\n\
4415of the specified width. The string x is never truncated.";
4416
4417static PyObject *
4418unicode_zfill(PyUnicodeObject *self, PyObject *args)
4419{
4420    int fill;
4421    PyUnicodeObject *u;
4422
4423    int width;
4424    if (!PyArg_ParseTuple(args, "i:zfill", &width))
4425        return NULL;
4426
4427    if (self->length >= width) {
4428        Py_INCREF(self);
4429        return (PyObject*) self;
4430    }
4431
4432    fill = width - self->length;
4433
4434    u = pad(self, fill, 0, '0');
4435
4436    if (u->str[fill] == '+' || u->str[fill] == '-') {
4437        /* move sign to beginning of string */
4438        u->str[0] = u->str[fill];
4439        u->str[fill] = '0';
4440    }
4441
4442    return (PyObject*) u;
4443}
4444#endif
4445
4446#if 0
4447static PyObject*
4448unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
4449{
4450    if (!PyArg_NoArgs(args))
4451        return NULL;
4452    return PyInt_FromLong(unicode_freelist_size);
4453}
4454#endif
4455
4456static char startswith__doc__[] =
4457"S.startswith(prefix[, start[, end]]) -> int\n\
4458\n\
4459Return 1 if S starts with the specified prefix, otherwise return 0.  With\n\
4460optional start, test S beginning at that position.  With optional end, stop\n\
4461comparing S at that position.";
4462
4463static PyObject *
4464unicode_startswith(PyUnicodeObject *self,
4465		   PyObject *args)
4466{
4467    PyUnicodeObject *substring;
4468    int start = 0;
4469    int end = INT_MAX;
4470    PyObject *result;
4471
4472    if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4473		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4474	return NULL;
4475    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4476						(PyObject *)substring);
4477    if (substring == NULL)
4478	return NULL;
4479
4480    result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4481
4482    Py_DECREF(substring);
4483    return result;
4484}
4485
4486
4487static char endswith__doc__[] =
4488"S.endswith(suffix[, start[, end]]) -> int\n\
4489\n\
4490Return 1 if S ends with the specified suffix, otherwise return 0.  With\n\
4491optional start, test S beginning at that position.  With optional end, stop\n\
4492comparing S at that position.";
4493
4494static PyObject *
4495unicode_endswith(PyUnicodeObject *self,
4496		 PyObject *args)
4497{
4498    PyUnicodeObject *substring;
4499    int start = 0;
4500    int end = INT_MAX;
4501    PyObject *result;
4502
4503    if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4504		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4505	return NULL;
4506    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4507						(PyObject *)substring);
4508    if (substring == NULL)
4509	return NULL;
4510
4511    result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4512
4513    Py_DECREF(substring);
4514    return result;
4515}
4516
4517
4518static PyMethodDef unicode_methods[] = {
4519
4520    /* Order is according to common usage: often used methods should
4521       appear first, since lookup is done sequentially. */
4522
4523    {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
4524    {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
4525    {"split", (PyCFunction) unicode_split, 1, split__doc__},
4526    {"join", (PyCFunction) unicode_join, 1, join__doc__},
4527    {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
4528    {"title", (PyCFunction) unicode_title, 0, title__doc__},
4529    {"center", (PyCFunction) unicode_center, 1, center__doc__},
4530    {"count", (PyCFunction) unicode_count, 1, count__doc__},
4531    {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
4532    {"find", (PyCFunction) unicode_find, 1, find__doc__},
4533    {"index", (PyCFunction) unicode_index, 1, index__doc__},
4534    {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
4535    {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
4536    {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
4537/*  {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
4538    {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
4539    {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
4540    {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
4541    {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
4542    {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
4543    {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
4544    {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
4545    {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
4546    {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
4547    {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
4548    {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
4549    {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
4550    {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
4551    {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
4552    {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
4553    {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
4554    {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
4555    {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
4556    {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
4557    {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
4558#if 0
4559    {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
4560    {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
4561#endif
4562
4563#if 0
4564    /* This one is just used for debugging the implementation. */
4565    {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
4566#endif
4567
4568    {NULL, NULL}
4569};
4570
4571static PyObject *
4572unicode_getattr(PyUnicodeObject *self, char *name)
4573{
4574    return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4575}
4576
4577static PySequenceMethods unicode_as_sequence = {
4578    (inquiry) unicode_length, 		/* sq_length */
4579    (binaryfunc) PyUnicode_Concat, 	/* sq_concat */
4580    (intargfunc) unicode_repeat, 	/* sq_repeat */
4581    (intargfunc) unicode_getitem, 	/* sq_item */
4582    (intintargfunc) unicode_slice, 	/* sq_slice */
4583    0, 					/* sq_ass_item */
4584    0, 					/* sq_ass_slice */
4585    (objobjproc)PyUnicode_Contains, 	/*sq_contains*/
4586};
4587
4588static int
4589unicode_buffer_getreadbuf(PyUnicodeObject *self,
4590			  int index,
4591			  const void **ptr)
4592{
4593    if (index != 0) {
4594        PyErr_SetString(PyExc_SystemError,
4595			"accessing non-existent unicode segment");
4596        return -1;
4597    }
4598    *ptr = (void *) self->str;
4599    return PyUnicode_GET_DATA_SIZE(self);
4600}
4601
4602static int
4603unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4604			   const void **ptr)
4605{
4606    PyErr_SetString(PyExc_TypeError,
4607		    "cannot use unicode as modifyable buffer");
4608    return -1;
4609}
4610
4611static int
4612unicode_buffer_getsegcount(PyUnicodeObject *self,
4613			   int *lenp)
4614{
4615    if (lenp)
4616        *lenp = PyUnicode_GET_DATA_SIZE(self);
4617    return 1;
4618}
4619
4620static int
4621unicode_buffer_getcharbuf(PyUnicodeObject *self,
4622			  int index,
4623			  const void **ptr)
4624{
4625    PyObject *str;
4626
4627    if (index != 0) {
4628        PyErr_SetString(PyExc_SystemError,
4629			"accessing non-existent unicode segment");
4630        return -1;
4631    }
4632    str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
4633    if (str == NULL)
4634	return -1;
4635    *ptr = (void *) PyString_AS_STRING(str);
4636    return PyString_GET_SIZE(str);
4637}
4638
4639/* Helpers for PyUnicode_Format() */
4640
4641static PyObject *
4642getnextarg(PyObject *args, int arglen, int *p_argidx)
4643{
4644    int argidx = *p_argidx;
4645    if (argidx < arglen) {
4646	(*p_argidx)++;
4647	if (arglen < 0)
4648	    return args;
4649	else
4650	    return PyTuple_GetItem(args, argidx);
4651    }
4652    PyErr_SetString(PyExc_TypeError,
4653		    "not enough arguments for format string");
4654    return NULL;
4655}
4656
/* Conversion flag bits collected while parsing a format specifier. */
#define F_LJUST 0x01    /* '-' flag */
#define F_SIGN  0x02    /* '+' flag */
#define F_BLANK 0x04    /* ' ' flag */
#define F_ALT   0x08    /* '#' flag */
#define F_ZERO  0x10    /* '0' flag */
4662
4663static
4664int usprintf(register Py_UNICODE *buffer, char *format, ...)
4665{
4666    register int i;
4667    int len;
4668    va_list va;
4669    char *charbuffer;
4670    va_start(va, format);
4671
4672    /* First, format the string as char array, then expand to Py_UNICODE
4673       array. */
4674    charbuffer = (char *)buffer;
4675    len = vsprintf(charbuffer, format, va);
4676    for (i = len - 1; i >= 0; i--)
4677	buffer[i] = (Py_UNICODE) charbuffer[i];
4678
4679    va_end(va);
4680    return len;
4681}
4682
4683static int
4684formatfloat(Py_UNICODE *buf,
4685	    size_t buflen,
4686	    int flags,
4687	    int prec,
4688	    int type,
4689	    PyObject *v)
4690{
4691    /* fmt = '%#.' + `prec` + `type`
4692       worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
4693    char fmt[20];
4694    double x;
4695
4696    x = PyFloat_AsDouble(v);
4697    if (x == -1.0 && PyErr_Occurred())
4698	return -1;
4699    if (prec < 0)
4700	prec = 6;
4701    if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4702	type = 'g';
4703    sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
4704    /* worst case length calc to ensure no buffer overrun:
4705         fmt = %#.<prec>g
4706         buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4707            for any double rep.)
4708         len = 1 + prec + 1 + 2 + 5 = 9 + prec
4709       If prec=0 the effective precision is 1 (the leading digit is
4710       always given), therefore increase by one to 10+prec. */
4711    if (buflen <= (size_t)10 + (size_t)prec) {
4712	PyErr_SetString(PyExc_OverflowError,
4713	    "formatted float is too long (precision too long?)");
4714	return -1;
4715    }
4716    return usprintf(buf, fmt, x);
4717}
4718
4719static PyObject*
4720formatlong(PyObject *val, int flags, int prec, int type)
4721{
4722	char *buf;
4723	int i, len;
4724	PyObject *str; /* temporary string object. */
4725	PyUnicodeObject *result;
4726
4727	str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
4728	if (!str)
4729		return NULL;
4730	result = _PyUnicode_New(len);
4731	for (i = 0; i < len; i++)
4732		result->str[i] = buf[i];
4733	result->str[len] = 0;
4734	Py_DECREF(str);
4735	return (PyObject*)result;
4736}
4737
4738static int
4739formatint(Py_UNICODE *buf,
4740	  size_t buflen,
4741	  int flags,
4742	  int prec,
4743	  int type,
4744	  PyObject *v)
4745{
4746    /* fmt = '%#.' + `prec` + 'l' + `type`
4747       worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
4748       + 1 + 1 = 24*/
4749    char fmt[64]; /* plenty big enough! */
4750    long x;
4751    int use_native_c_format = 1;
4752
4753    x = PyInt_AsLong(v);
4754    if (x == -1 && PyErr_Occurred())
4755	return -1;
4756    if (prec < 0)
4757	prec = 1;
4758    /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4759       worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4760    if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4761        PyErr_SetString(PyExc_OverflowError,
4762            "formatted integer is too long (precision too long?)");
4763        return -1;
4764    }
4765    /* When converting 0 under %#x or %#X, C leaves off the base marker,
4766     * but we want it (for consistency with other %#x conversions, and
4767     * for consistency with Python's hex() function).
4768     * BUG 28-Apr-2001 tim:  At least two platform Cs (Metrowerks &
4769     * Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
4770     * So add it only if the platform doesn't already.
4771     */
4772    if (x == 0 && (flags & F_ALT) && (type == 'x' || type == 'X')) {
4773        /* Only way to know what the platform does is to try it. */
4774        sprintf(fmt, type == 'x' ? "%#x" : "%#X", 0);
4775        if (fmt[1] != (char)type) {
4776            /* Supply our own leading 0x/0X -- needed under std C */
4777            use_native_c_format = 0;
4778            sprintf(fmt, "0%c%%#.%dl%c", type, prec, type);
4779        }
4780    }
4781    if (use_native_c_format)
4782         sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4783    return usprintf(buf, fmt, x);
4784}
4785
4786static int
4787formatchar(Py_UNICODE *buf,
4788           size_t buflen,
4789           PyObject *v)
4790{
4791    /* presume that the buffer is at least 2 characters long */
4792    if (PyUnicode_Check(v)) {
4793	if (PyUnicode_GET_SIZE(v) != 1)
4794	    goto onError;
4795	buf[0] = PyUnicode_AS_UNICODE(v)[0];
4796    }
4797
4798    else if (PyString_Check(v)) {
4799	if (PyString_GET_SIZE(v) != 1)
4800	    goto onError;
4801	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4802    }
4803
4804    else {
4805	/* Integer input truncated to a character */
4806        long x;
4807	x = PyInt_AsLong(v);
4808	if (x == -1 && PyErr_Occurred())
4809	    goto onError;
4810	buf[0] = (char) x;
4811    }
4812    buf[1] = '\0';
4813    return 1;
4814
4815 onError:
4816    PyErr_SetString(PyExc_TypeError,
4817		    "%c requires int or char");
4818    return -1;
4819}
4820
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the number of elements in the scratch buffer used
   for formatting floats, ints and chars.  XXX The value is a magic
   number.  Every formatting routine checks its bounds so the buffer
   cannot overflow, but allocating a buffer of the right size for each
   conversion might be the better design.  The fixed size is
   sufficient for now.
*/
#define FORMATBUFLEN ((size_t)120)
4830
4831PyObject *PyUnicode_Format(PyObject *format,
4832			   PyObject *args)
4833{
4834    Py_UNICODE *fmt, *res;
4835    int fmtcnt, rescnt, reslen, arglen, argidx;
4836    int args_owned = 0;
4837    PyUnicodeObject *result = NULL;
4838    PyObject *dict = NULL;
4839    PyObject *uformat;
4840
4841    if (format == NULL || args == NULL) {
4842	PyErr_BadInternalCall();
4843	return NULL;
4844    }
4845    uformat = PyUnicode_FromObject(format);
4846    if (uformat == NULL)
4847	return NULL;
4848    fmt = PyUnicode_AS_UNICODE(uformat);
4849    fmtcnt = PyUnicode_GET_SIZE(uformat);
4850
4851    reslen = rescnt = fmtcnt + 100;
4852    result = _PyUnicode_New(reslen);
4853    if (result == NULL)
4854	goto onError;
4855    res = PyUnicode_AS_UNICODE(result);
4856
4857    if (PyTuple_Check(args)) {
4858	arglen = PyTuple_Size(args);
4859	argidx = 0;
4860    }
4861    else {
4862	arglen = -1;
4863	argidx = -2;
4864    }
4865    if (args->ob_type->tp_as_mapping)
4866	dict = args;
4867
4868    while (--fmtcnt >= 0) {
4869	if (*fmt != '%') {
4870	    if (--rescnt < 0) {
4871		rescnt = fmtcnt + 100;
4872		reslen += rescnt;
4873		if (_PyUnicode_Resize(&result, reslen) < 0)
4874		    return NULL;
4875		res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4876		--rescnt;
4877	    }
4878	    *res++ = *fmt++;
4879	}
4880	else {
4881	    /* Got a format specifier */
4882	    int flags = 0;
4883	    int width = -1;
4884	    int prec = -1;
4885	    Py_UNICODE c = '\0';
4886	    Py_UNICODE fill;
4887	    PyObject *v = NULL;
4888	    PyObject *temp = NULL;
4889	    Py_UNICODE *pbuf;
4890	    Py_UNICODE sign;
4891	    int len;
4892	    Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
4893
4894	    fmt++;
4895	    if (*fmt == '(') {
4896		Py_UNICODE *keystart;
4897		int keylen;
4898		PyObject *key;
4899		int pcount = 1;
4900
4901		if (dict == NULL) {
4902		    PyErr_SetString(PyExc_TypeError,
4903				    "format requires a mapping");
4904		    goto onError;
4905		}
4906		++fmt;
4907		--fmtcnt;
4908		keystart = fmt;
4909		/* Skip over balanced parentheses */
4910		while (pcount > 0 && --fmtcnt >= 0) {
4911		    if (*fmt == ')')
4912			--pcount;
4913		    else if (*fmt == '(')
4914			++pcount;
4915		    fmt++;
4916		}
4917		keylen = fmt - keystart - 1;
4918		if (fmtcnt < 0 || pcount > 0) {
4919		    PyErr_SetString(PyExc_ValueError,
4920				    "incomplete format key");
4921		    goto onError;
4922		}
4923		/* keys are converted to strings using UTF-8 and
4924		   then looked up since Python uses strings to hold
4925		   variables names etc. in its namespaces and we
4926		   wouldn't want to break common idioms. */
4927		key = PyUnicode_EncodeUTF8(keystart,
4928					   keylen,
4929					   NULL);
4930		if (key == NULL)
4931		    goto onError;
4932		if (args_owned) {
4933		    Py_DECREF(args);
4934		    args_owned = 0;
4935		}
4936		args = PyObject_GetItem(dict, key);
4937		Py_DECREF(key);
4938		if (args == NULL) {
4939		    goto onError;
4940		}
4941		args_owned = 1;
4942		arglen = -1;
4943		argidx = -2;
4944	    }
4945	    while (--fmtcnt >= 0) {
4946		switch (c = *fmt++) {
4947		case '-': flags |= F_LJUST; continue;
4948		case '+': flags |= F_SIGN; continue;
4949		case ' ': flags |= F_BLANK; continue;
4950		case '#': flags |= F_ALT; continue;
4951		case '0': flags |= F_ZERO; continue;
4952		}
4953		break;
4954	    }
4955	    if (c == '*') {
4956		v = getnextarg(args, arglen, &argidx);
4957		if (v == NULL)
4958		    goto onError;
4959		if (!PyInt_Check(v)) {
4960		    PyErr_SetString(PyExc_TypeError,
4961				    "* wants int");
4962		    goto onError;
4963		}
4964		width = PyInt_AsLong(v);
4965		if (width < 0) {
4966		    flags |= F_LJUST;
4967		    width = -width;
4968		}
4969		if (--fmtcnt >= 0)
4970		    c = *fmt++;
4971	    }
4972	    else if (c >= '0' && c <= '9') {
4973		width = c - '0';
4974		while (--fmtcnt >= 0) {
4975		    c = *fmt++;
4976		    if (c < '0' || c > '9')
4977			break;
4978		    if ((width*10) / 10 != width) {
4979			PyErr_SetString(PyExc_ValueError,
4980					"width too big");
4981			goto onError;
4982		    }
4983		    width = width*10 + (c - '0');
4984		}
4985	    }
4986	    if (c == '.') {
4987		prec = 0;
4988		if (--fmtcnt >= 0)
4989		    c = *fmt++;
4990		if (c == '*') {
4991		    v = getnextarg(args, arglen, &argidx);
4992		    if (v == NULL)
4993			goto onError;
4994		    if (!PyInt_Check(v)) {
4995			PyErr_SetString(PyExc_TypeError,
4996					"* wants int");
4997			goto onError;
4998		    }
4999		    prec = PyInt_AsLong(v);
5000		    if (prec < 0)
5001			prec = 0;
5002		    if (--fmtcnt >= 0)
5003			c = *fmt++;
5004		}
5005		else if (c >= '0' && c <= '9') {
5006		    prec = c - '0';
5007		    while (--fmtcnt >= 0) {
5008			c = Py_CHARMASK(*fmt++);
5009			if (c < '0' || c > '9')
5010			    break;
5011			if ((prec*10) / 10 != prec) {
5012			    PyErr_SetString(PyExc_ValueError,
5013					    "prec too big");
5014			    goto onError;
5015			}
5016			prec = prec*10 + (c - '0');
5017		    }
5018		}
5019	    } /* prec */
5020	    if (fmtcnt >= 0) {
5021		if (c == 'h' || c == 'l' || c == 'L') {
5022		    if (--fmtcnt >= 0)
5023			c = *fmt++;
5024		}
5025	    }
5026	    if (fmtcnt < 0) {
5027		PyErr_SetString(PyExc_ValueError,
5028				"incomplete format");
5029		goto onError;
5030	    }
5031	    if (c != '%') {
5032		v = getnextarg(args, arglen, &argidx);
5033		if (v == NULL)
5034		    goto onError;
5035	    }
5036	    sign = 0;
5037	    fill = ' ';
5038	    switch (c) {
5039
5040	    case '%':
5041		pbuf = formatbuf;
5042		/* presume that buffer length is at least 1 */
5043		pbuf[0] = '%';
5044		len = 1;
5045		break;
5046
5047	    case 's':
5048	    case 'r':
5049		if (PyUnicode_Check(v) && c == 's') {
5050		    temp = v;
5051		    Py_INCREF(temp);
5052		}
5053		else {
5054		    PyObject *unicode;
5055		    if (c == 's')
5056			temp = PyObject_Str(v);
5057		    else
5058			temp = PyObject_Repr(v);
5059		    if (temp == NULL)
5060			goto onError;
5061		    if (!PyString_Check(temp)) {
5062			/* XXX Note: this should never happen, since
5063   			       PyObject_Repr() and PyObject_Str() assure
5064			       this */
5065			Py_DECREF(temp);
5066			PyErr_SetString(PyExc_TypeError,
5067					"%s argument has non-string str()");
5068			goto onError;
5069		    }
5070		    unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
5071						   PyString_GET_SIZE(temp),
5072					       NULL,
5073						   "strict");
5074		    Py_DECREF(temp);
5075		    temp = unicode;
5076		    if (temp == NULL)
5077			goto onError;
5078		}
5079		pbuf = PyUnicode_AS_UNICODE(temp);
5080		len = PyUnicode_GET_SIZE(temp);
5081		if (prec >= 0 && len > prec)
5082		    len = prec;
5083		break;
5084
5085	    case 'i':
5086	    case 'd':
5087	    case 'u':
5088	    case 'o':
5089	    case 'x':
5090	    case 'X':
5091		if (c == 'i')
5092		    c = 'd';
5093		if (PyLong_Check(v)) {
5094		    temp = formatlong(v, flags, prec, c);
5095		    if (!temp)
5096			goto onError;
5097		    pbuf = PyUnicode_AS_UNICODE(temp);
5098		    len = PyUnicode_GET_SIZE(temp);
5099		    /* unbounded ints can always produce
5100		       a sign character! */
5101		    sign = 1;
5102		}
5103		else {
5104		    pbuf = formatbuf;
5105		    len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5106				    flags, prec, c, v);
5107		    if (len < 0)
5108			goto onError;
5109		    /* only d conversion is signed */
5110		    sign = c == 'd';
5111		}
5112		if (flags & F_ZERO)
5113		    fill = '0';
5114		break;
5115
5116	    case 'e':
5117	    case 'E':
5118	    case 'f':
5119	    case 'g':
5120	    case 'G':
5121		pbuf = formatbuf;
5122		len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5123			flags, prec, c, v);
5124		if (len < 0)
5125		    goto onError;
5126		sign = 1;
5127		if (flags & F_ZERO)
5128		    fill = '0';
5129		break;
5130
5131	    case 'c':
5132		pbuf = formatbuf;
5133		len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
5134		if (len < 0)
5135		    goto onError;
5136		break;
5137
5138	    default:
5139		PyErr_Format(PyExc_ValueError,
5140			     "unsupported format character '%c' (0x%x) "
5141			     "at index %i",
5142			     (31<=c && c<=126) ? c : '?',
5143                             c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
5144		goto onError;
5145	    }
5146	    if (sign) {
5147		if (*pbuf == '-' || *pbuf == '+') {
5148		    sign = *pbuf++;
5149		    len--;
5150		}
5151		else if (flags & F_SIGN)
5152		    sign = '+';
5153		else if (flags & F_BLANK)
5154		    sign = ' ';
5155		else
5156		    sign = 0;
5157	    }
5158	    if (width < len)
5159		width = len;
5160	    if (rescnt < width + (sign != 0)) {
5161		reslen -= rescnt;
5162		rescnt = width + fmtcnt + 100;
5163		reslen += rescnt;
5164		if (_PyUnicode_Resize(&result, reslen) < 0)
5165		    return NULL;
5166		res = PyUnicode_AS_UNICODE(result)
5167		    + reslen - rescnt;
5168	    }
5169	    if (sign) {
5170		if (fill != ' ')
5171		    *res++ = sign;
5172		rescnt--;
5173		if (width > len)
5174		    width--;
5175	    }
5176	    if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5177		assert(pbuf[0] == '0');
5178		assert(pbuf[1] == c);
5179		if (fill != ' ') {
5180		    *res++ = *pbuf++;
5181		    *res++ = *pbuf++;
5182		}
5183		rescnt -= 2;
5184		width -= 2;
5185		if (width < 0)
5186		    width = 0;
5187		len -= 2;
5188	    }
5189	    if (width > len && !(flags & F_LJUST)) {
5190		do {
5191		    --rescnt;
5192		    *res++ = fill;
5193		} while (--width > len);
5194	    }
5195	    if (fill == ' ') {
5196		if (sign)
5197		    *res++ = sign;
5198		if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5199		    assert(pbuf[0] == '0');
5200		    assert(pbuf[1] == c);
5201		    *res++ = *pbuf++;
5202		    *res++ = *pbuf++;
5203		}
5204	    }
5205	    Py_UNICODE_COPY(res, pbuf, len);
5206	    res += len;
5207	    rescnt -= len;
5208	    while (--width >= len) {
5209		--rescnt;
5210		*res++ = ' ';
5211	    }
5212	    if (dict && (argidx < arglen) && c != '%') {
5213		PyErr_SetString(PyExc_TypeError,
5214				"not all arguments converted");
5215		goto onError;
5216	    }
5217	    Py_XDECREF(temp);
5218	} /* '%' */
5219    } /* until end */
5220    if (argidx < arglen && !dict) {
5221	PyErr_SetString(PyExc_TypeError,
5222			"not all arguments converted");
5223	goto onError;
5224    }
5225
5226    if (args_owned) {
5227	Py_DECREF(args);
5228    }
5229    Py_DECREF(uformat);
5230    if (_PyUnicode_Resize(&result, reslen - rescnt))
5231	goto onError;
5232    return (PyObject *)result;
5233
5234 onError:
5235    Py_XDECREF(result);
5236    Py_DECREF(uformat);
5237    if (args_owned) {
5238	Py_DECREF(args);
5239    }
5240    return NULL;
5241}
5242
5243static PyBufferProcs unicode_as_buffer = {
5244    (getreadbufferproc) unicode_buffer_getreadbuf,
5245    (getwritebufferproc) unicode_buffer_getwritebuf,
5246    (getsegcountproc) unicode_buffer_getsegcount,
5247    (getcharbufferproc) unicode_buffer_getcharbuf,
5248};
5249
5250PyTypeObject PyUnicode_Type = {
5251    PyObject_HEAD_INIT(&PyType_Type)
5252    0, 					/* ob_size */
5253    "unicode", 				/* tp_name */
5254    sizeof(PyUnicodeObject), 		/* tp_size */
5255    0, 					/* tp_itemsize */
5256    /* Slots */
5257    (destructor)_PyUnicode_Free, 	/* tp_dealloc */
5258    0, 					/* tp_print */
5259    (getattrfunc)unicode_getattr, 	/* tp_getattr */
5260    0, 					/* tp_setattr */
5261    (cmpfunc) unicode_compare, 		/* tp_compare */
5262    (reprfunc) unicode_repr, 		/* tp_repr */
5263    0, 					/* tp_as_number */
5264    &unicode_as_sequence, 		/* tp_as_sequence */
5265    0, 					/* tp_as_mapping */
5266    (hashfunc) unicode_hash, 		/* tp_hash*/
5267    0, 					/* tp_call*/
5268    (reprfunc) unicode_str,	 	/* tp_str */
5269    (getattrofunc) NULL, 		/* tp_getattro */
5270    (setattrofunc) NULL, 		/* tp_setattro */
5271    &unicode_as_buffer,			/* tp_as_buffer */
5272    Py_TPFLAGS_DEFAULT,			/* tp_flags */
5273};
5274
5275/* Initialize the Unicode implementation */
5276
5277void _PyUnicode_Init(void)
5278{
5279    int i;
5280
5281    /* Doublecheck the configuration... */
5282    if (sizeof(Py_UNICODE) != 2)
5283        Py_FatalError("Unicode configuration error: "
5284		      "sizeof(Py_UNICODE) != 2 bytes");
5285
5286    /* Init the implementation */
5287    unicode_freelist = NULL;
5288    unicode_freelist_size = 0;
5289    unicode_empty = _PyUnicode_New(0);
5290    strcpy(unicode_default_encoding, "ascii");
5291    for (i = 0; i < 256; i++)
5292	unicode_latin1[i] = NULL;
5293}
5294
5295/* Finalize the Unicode implementation */
5296
5297void
5298_PyUnicode_Fini(void)
5299{
5300    PyUnicodeObject *u;
5301    int i;
5302
5303    Py_XDECREF(unicode_empty);
5304    unicode_empty = NULL;
5305
5306    for (i = 0; i < 256; i++) {
5307	if (unicode_latin1[i]) {
5308	    Py_DECREF(unicode_latin1[i]);
5309	    unicode_latin1[i] = NULL;
5310	}
5311    }
5312
5313    for (u = unicode_freelist; u != NULL;) {
5314	PyUnicodeObject *v = u;
5315	u = *(PyUnicodeObject **)u;
5316	if (v->str)
5317	    PyMem_DEL(v->str);
5318	Py_XDECREF(v->defenc);
5319	PyObject_DEL(v);
5320    }
5321    unicode_freelist = NULL;
5322    unicode_freelist_size = 0;
5323}
5324