unicodeobject.c revision 2cfe36828342e16cd274b968736a01aed5c49557
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5Unicode Integration Proposal (see file Misc/unicode.txt).
6
7Copyright (c) Corporation for National Research Initiatives.
8
9--------------------------------------------------------------------
10The original string type implementation is:
11
12    Copyright (c) 1999 by Secret Labs AB
13    Copyright (c) 1999 by Fredrik Lundh
14
15By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
38
39#include "Python.h"
40
41#include "unicodeobject.h"
42#include "ucnhash.h"
43
44#ifdef MS_WIN32
45#include <windows.h>
46#endif
47
48/* Limit for the Unicode object free list */
49
50#define MAX_UNICODE_FREELIST_SIZE       1024
51
/* Limit for the Unicode object free list keep-alive optimization.
53
54   The implementation will keep allocated Unicode memory intact for
55   all objects on the free list having a size less than this
56   limit. This reduces malloc() overhead for small Unicode objects.
57
58   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
59   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
60   malloc()-overhead) bytes of unused garbage.
61
62   Setting the limit to 0 effectively turns the feature off.
63
64   Note: This is an experimental feature ! If you get core dumps when
65   using Unicode objects, turn this feature off.
66
67*/
68
69#define KEEPALIVE_SIZE_LIMIT       9
70
71/* Endianness switches; defaults to little endian */
72
73#ifdef WORDS_BIGENDIAN
74# define BYTEORDER_IS_BIG_ENDIAN
75#else
76# define BYTEORDER_IS_LITTLE_ENDIAN
77#endif
78
79/* --- Globals ------------------------------------------------------------
80
81   The globals are initialized by the _PyUnicode_Init() API and should
82   not be used before calling that API.
83
84*/
85
86/* Free list for Unicode objects */
87static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
89
90/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94   shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
97/* Default encoding to use and assume when NULL is passed as encoding
98   parameter; it is initialized by _PyUnicode_Init().
99
100   Always use the PyUnicode_SetDefaultEncoding() and
101   PyUnicode_GetDefaultEncoding() APIs to access this global.
102
103*/
104static char unicode_default_encoding[100];
105
106/* --- Unicode Object ----------------------------------------------------- */
107
108static
109int unicode_resize(register PyUnicodeObject *unicode,
110                      int length)
111{
112    void *oldstr;
113
114    /* Shortcut if there's nothing much to do. */
115    if (unicode->length == length)
116	goto reset;
117
118    /* Resizing shared object (unicode_empty or single character
119       objects) in-place is not allowed. Use PyUnicode_Resize()
120       instead ! */
121    if (unicode == unicode_empty ||
122	(unicode->length == 1 &&
123	 unicode->str[0] < 256 &&
124	 unicode_latin1[unicode->str[0]] == unicode)) {
125        PyErr_SetString(PyExc_SystemError,
126                        "can't resize shared unicode objects");
127        return -1;
128    }
129
130    /* We allocate one more byte to make sure the string is
131       Ux0000 terminated -- XXX is this needed ? */
132    oldstr = unicode->str;
133    PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
134    if (!unicode->str) {
135	unicode->str = oldstr;
136        PyErr_NoMemory();
137        return -1;
138    }
139    unicode->str[length] = 0;
140    unicode->length = length;
141
142 reset:
143    /* Reset the object caches */
144    if (unicode->defenc) {
145        Py_DECREF(unicode->defenc);
146        unicode->defenc = NULL;
147    }
148    unicode->hash = -1;
149
150    return 0;
151}
152
/* We allocate one more Py_UNICODE element than requested to make
   sure the string is Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could be enhanced further by ensuring that the
       free list never shrinks below a size of 1.

*/
160
161static
162PyUnicodeObject *_PyUnicode_New(int length)
163{
164    register PyUnicodeObject *unicode;
165
166    /* Optimization for empty strings */
167    if (length == 0 && unicode_empty != NULL) {
168        Py_INCREF(unicode_empty);
169        return unicode_empty;
170    }
171
172    /* Unicode freelist & memory allocation */
173    if (unicode_freelist) {
174        unicode = unicode_freelist;
175        unicode_freelist = *(PyUnicodeObject **)unicode;
176        unicode_freelist_size--;
177	if (unicode->str) {
178	    /* Keep-Alive optimization: we only upsize the buffer,
179	       never downsize it. */
180	    if ((unicode->length < length) &&
181		unicode_resize(unicode, length)) {
182		PyMem_DEL(unicode->str);
183		goto onError;
184	    }
185	}
186      else {
187	    unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
188      }
189      PyObject_INIT(unicode, &PyUnicode_Type);
190    }
191    else {
192        unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
193        if (unicode == NULL)
194            return NULL;
195	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
196    }
197
198    if (!unicode->str) {
199	PyErr_NoMemory();
200	goto onError;
201    }
202    unicode->str[length] = 0;
203    unicode->length = length;
204    unicode->hash = -1;
205    unicode->defenc = NULL;
206    return unicode;
207
208 onError:
209    _Py_ForgetReference((PyObject *)unicode);
210    PyObject_DEL(unicode);
211    return NULL;
212}
213
214static
215void _PyUnicode_Free(register PyUnicodeObject *unicode)
216{
217    if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
218        /* Keep-Alive optimization */
219	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
220	    PyMem_DEL(unicode->str);
221	    unicode->str = NULL;
222	    unicode->length = 0;
223	}
224	if (unicode->defenc) {
225	    Py_DECREF(unicode->defenc);
226	    unicode->defenc = NULL;
227	}
228	/* Add to free list */
229        *(PyUnicodeObject **)unicode = unicode_freelist;
230        unicode_freelist = unicode;
231        unicode_freelist_size++;
232    }
233    else {
234	PyMem_DEL(unicode->str);
235	Py_XDECREF(unicode->defenc);
236	PyObject_DEL(unicode);
237    }
238}
239
240int PyUnicode_Resize(PyObject **unicode,
241		     int length)
242{
243    register PyUnicodeObject *v;
244
245    /* Argument checks */
246    if (unicode == NULL) {
247	PyErr_BadInternalCall();
248	return -1;
249    }
250    v = (PyUnicodeObject *)*unicode;
251    if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
252	PyErr_BadInternalCall();
253	return -1;
254    }
255
256    /* Resizing unicode_empty and single character objects is not
257       possible since these are being shared. We simply return a fresh
258       copy with the same Unicode content. */
259    if (v->length != length &&
260	(v == unicode_empty || v->length == 1)) {
261	PyUnicodeObject *w = _PyUnicode_New(length);
262	if (w == NULL)
263	    return -1;
264	Py_UNICODE_COPY(w->str, v->str,
265			length < v->length ? length : v->length);
266	*unicode = (PyObject *)w;
267	return 0;
268    }
269
270    /* Note that we don't have to modify *unicode for unshared Unicode
271       objects, since we can modify them in-place. */
272    return unicode_resize(v, length);
273}
274
/* Internal API for use in unicodeobject.c only !

   Thin wrapper around PyUnicode_Resize() that casts the address of an
   object pointer variable (typically a PyUnicodeObject *) to the
   PyObject ** that PyUnicode_Resize() takes. */
#define _PyUnicode_Resize(unicodevar, length) \
        PyUnicode_Resize(((PyObject **)(unicodevar)), length)
278
279PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
280				int size)
281{
282    PyUnicodeObject *unicode;
283
284    /* If the Unicode data is known at construction time, we can apply
285       some optimizations which share commonly used objects. */
286    if (u != NULL) {
287
288	/* Optimization for empty strings */
289	if (size == 0 && unicode_empty != NULL) {
290	    Py_INCREF(unicode_empty);
291	    return (PyObject *)unicode_empty;
292	}
293
294	/* Single character Unicode objects in the Latin-1 range are
295	   shared when using this constructor */
296	if (size == 1 && *u < 256) {
297	    unicode = unicode_latin1[*u];
298	    if (!unicode) {
299		unicode = _PyUnicode_New(1);
300		unicode->str[0] = *u;
301		if (!unicode)
302		    return NULL;
303		unicode_latin1[*u] = unicode;
304	    }
305	    Py_INCREF(unicode);
306	    return (PyObject *)unicode;
307	}
308    }
309
310    unicode = _PyUnicode_New(size);
311    if (!unicode)
312        return NULL;
313
314    /* Copy the Unicode data into the new object */
315    if (u != NULL)
316	Py_UNICODE_COPY(unicode->str, u, size);
317
318    return (PyObject *)unicode;
319}
320
321#ifdef HAVE_WCHAR_H
322
323PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
324				 int size)
325{
326    PyUnicodeObject *unicode;
327
328    if (w == NULL) {
329	PyErr_BadInternalCall();
330	return NULL;
331    }
332
333    unicode = _PyUnicode_New(size);
334    if (!unicode)
335        return NULL;
336
337    /* Copy the wchar_t data into the new object */
338#ifdef HAVE_USABLE_WCHAR_T
339    memcpy(unicode->str, w, size * sizeof(wchar_t));
340#else
341    {
342	register Py_UNICODE *u;
343	register int i;
344	u = PyUnicode_AS_UNICODE(unicode);
345	for (i = size; i >= 0; i--)
346	    *u++ = *w++;
347    }
348#endif
349
350    return (PyObject *)unicode;
351}
352
353int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
354			 register wchar_t *w,
355			 int size)
356{
357    if (unicode == NULL) {
358	PyErr_BadInternalCall();
359	return -1;
360    }
361    if (size > PyUnicode_GET_SIZE(unicode))
362	size = PyUnicode_GET_SIZE(unicode);
363#ifdef HAVE_USABLE_WCHAR_T
364    memcpy(w, unicode->str, size * sizeof(wchar_t));
365#else
366    {
367	register Py_UNICODE *u;
368	register int i;
369	u = PyUnicode_AS_UNICODE(unicode);
370	for (i = size; i >= 0; i--)
371	    *w++ = *u++;
372    }
373#endif
374
375    return size;
376}
377
378#endif
379
380PyObject *PyUnicode_FromObject(register PyObject *obj)
381{
382    return PyUnicode_FromEncodedObject(obj, NULL, "strict");
383}
384
385PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
386				      const char *encoding,
387				      const char *errors)
388{
389    const char *s;
390    int len;
391    int owned = 0;
392    PyObject *v;
393
394    if (obj == NULL) {
395	PyErr_BadInternalCall();
396	return NULL;
397    }
398
399    /* Coerce object */
400    if (PyInstance_Check(obj)) {
401	PyObject *func;
402	func = PyObject_GetAttrString(obj, "__str__");
403	if (func == NULL) {
404	    PyErr_SetString(PyExc_TypeError,
405		  "coercing to Unicode: instance doesn't define __str__");
406	    return NULL;
407	}
408	obj = PyEval_CallObject(func, NULL);
409	Py_DECREF(func);
410	if (obj == NULL)
411	    return NULL;
412	owned = 1;
413    }
414    if (PyUnicode_Check(obj)) {
415	Py_INCREF(obj);
416	v = obj;
417	if (encoding) {
418	    PyErr_SetString(PyExc_TypeError,
419			    "decoding Unicode is not supported");
420	    return NULL;
421	}
422	goto done;
423    }
424    else if (PyString_Check(obj)) {
425	s = PyString_AS_STRING(obj);
426	len = PyString_GET_SIZE(obj);
427    }
428    else if (PyObject_AsCharBuffer(obj, &s, &len)) {
429	/* Overwrite the error message with something more useful in
430	   case of a TypeError. */
431	if (PyErr_ExceptionMatches(PyExc_TypeError))
432	    PyErr_Format(PyExc_TypeError,
433			 "coercing to Unicode: need string or buffer, "
434			 "%.80s found",
435			 obj->ob_type->tp_name);
436	goto onError;
437    }
438
439    /* Convert to Unicode */
440    if (len == 0) {
441	Py_INCREF(unicode_empty);
442	v = (PyObject *)unicode_empty;
443    }
444    else
445	v = PyUnicode_Decode(s, len, encoding, errors);
446
447 done:
448    if (owned) {
449	Py_DECREF(obj);
450    }
451    return v;
452
453 onError:
454    if (owned) {
455	Py_DECREF(obj);
456    }
457    return NULL;
458}
459
460PyObject *PyUnicode_Decode(const char *s,
461			   int size,
462			   const char *encoding,
463			   const char *errors)
464{
465    PyObject *buffer = NULL, *unicode;
466
467    if (encoding == NULL)
468	encoding = PyUnicode_GetDefaultEncoding();
469
470    /* Shortcuts for common default encodings */
471    if (strcmp(encoding, "utf-8") == 0)
472        return PyUnicode_DecodeUTF8(s, size, errors);
473    else if (strcmp(encoding, "latin-1") == 0)
474        return PyUnicode_DecodeLatin1(s, size, errors);
475    else if (strcmp(encoding, "ascii") == 0)
476        return PyUnicode_DecodeASCII(s, size, errors);
477
478    /* Decode via the codec registry */
479    buffer = PyBuffer_FromMemory((void *)s, size);
480    if (buffer == NULL)
481        goto onError;
482    unicode = PyCodec_Decode(buffer, encoding, errors);
483    if (unicode == NULL)
484        goto onError;
485    if (!PyUnicode_Check(unicode)) {
486        PyErr_Format(PyExc_TypeError,
487                     "decoder did not return an unicode object (type=%.400s)",
488                     unicode->ob_type->tp_name);
489        Py_DECREF(unicode);
490        goto onError;
491    }
492    Py_DECREF(buffer);
493    return unicode;
494
495 onError:
496    Py_XDECREF(buffer);
497    return NULL;
498}
499
500PyObject *PyUnicode_Encode(const Py_UNICODE *s,
501			   int size,
502			   const char *encoding,
503			   const char *errors)
504{
505    PyObject *v, *unicode;
506
507    unicode = PyUnicode_FromUnicode(s, size);
508    if (unicode == NULL)
509	return NULL;
510    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
511    Py_DECREF(unicode);
512    return v;
513}
514
515PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
516                                    const char *encoding,
517                                    const char *errors)
518{
519    PyObject *v;
520
521    if (!PyUnicode_Check(unicode)) {
522        PyErr_BadArgument();
523        goto onError;
524    }
525
526    if (encoding == NULL)
527	encoding = PyUnicode_GetDefaultEncoding();
528
529    /* Shortcuts for common default encodings */
530    if (errors == NULL) {
531	if (strcmp(encoding, "utf-8") == 0)
532        return PyUnicode_AsUTF8String(unicode);
533	else if (strcmp(encoding, "latin-1") == 0)
534	    return PyUnicode_AsLatin1String(unicode);
535	else if (strcmp(encoding, "ascii") == 0)
536	    return PyUnicode_AsASCIIString(unicode);
537    }
538
539    /* Encode via the codec registry */
540    v = PyCodec_Encode(unicode, encoding, errors);
541    if (v == NULL)
542        goto onError;
543    /* XXX Should we really enforce this ? */
544    if (!PyString_Check(v)) {
545        PyErr_Format(PyExc_TypeError,
546                     "encoder did not return a string object (type=%.400s)",
547                     v->ob_type->tp_name);
548        Py_DECREF(v);
549        goto onError;
550    }
551    return v;
552
553 onError:
554    return NULL;
555}
556
557/* Return a Python string holding the default encoded value of the
558   Unicode object.
559
560   The resulting string is cached in the Unicode object for subsequent
561   usage by this function. The cached version is needed to implement
562   the character buffer interface and will live (at least) as long as
563   the Unicode object itself.
564
565   The refcount of the string is *not* incremented.
566
567   *** Exported for internal use by the interpreter only !!! ***
568
569*/
570
571PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
572					    const char *errors)
573{
574    PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
575
576    if (v)
577        return v;
578    v = PyUnicode_AsEncodedString(unicode, NULL, errors);
579    if (v && errors == NULL)
580        ((PyUnicodeObject *)unicode)->defenc = v;
581    return v;
582}
583
584Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
585{
586    if (!PyUnicode_Check(unicode)) {
587        PyErr_BadArgument();
588        goto onError;
589    }
590    return PyUnicode_AS_UNICODE(unicode);
591
592 onError:
593    return NULL;
594}
595
596int PyUnicode_GetSize(PyObject *unicode)
597{
598    if (!PyUnicode_Check(unicode)) {
599        PyErr_BadArgument();
600        goto onError;
601    }
602    return PyUnicode_GET_SIZE(unicode);
603
604 onError:
605    return -1;
606}
607
608const char *PyUnicode_GetDefaultEncoding(void)
609{
610    return unicode_default_encoding;
611}
612
613int PyUnicode_SetDefaultEncoding(const char *encoding)
614{
615    PyObject *v;
616
617    /* Make sure the encoding is valid. As side effect, this also
618       loads the encoding into the codec registry cache. */
619    v = _PyCodec_Lookup(encoding);
620    if (v == NULL)
621	goto onError;
622    Py_DECREF(v);
623    strncpy(unicode_default_encoding,
624	    encoding,
625	    sizeof(unicode_default_encoding));
626    return 0;
627
628 onError:
629    return -1;
630}
631
632/* --- UTF-8 Codec -------------------------------------------------------- */
633
/* Map a UTF-8 encoded prefix (first) byte to the length of the
   sequence it introduces. Zero means illegal prefix. See RFC 2279
   for details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: one-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

    /* 0x80 - 0xBF: not valid as a prefix byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0xC0 - 0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

    /* 0xE0 - 0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

    /* 0xF0 - 0xF7: four bytes; 0xF8 - 0xFB: five bytes;
       0xFC - 0xFD: six bytes; 0xFE - 0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4,
    5, 5, 5, 5,
    6, 6,
    0, 0
};
655
656static
657int utf8_decoding_error(const char **source,
658                        Py_UNICODE **dest,
659                        const char *errors,
660                        const char *details)
661{
662    if ((errors == NULL) ||
663        (strcmp(errors,"strict") == 0)) {
664        PyErr_Format(PyExc_UnicodeError,
665                     "UTF-8 decoding error: %.400s",
666                     details);
667        return -1;
668    }
669    else if (strcmp(errors,"ignore") == 0) {
670        (*source)++;
671        return 0;
672    }
673    else if (strcmp(errors,"replace") == 0) {
674        (*source)++;
675        **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
676        (*dest)++;
677        return 0;
678    }
679    else {
680        PyErr_Format(PyExc_ValueError,
681                     "UTF-8 decoding error; unknown error handling code: %.400s",
682                     errors);
683        return -1;
684    }
685}
686
687PyObject *PyUnicode_DecodeUTF8(const char *s,
688			       int size,
689			       const char *errors)
690{
691    int n;
692    const char *e;
693    PyUnicodeObject *unicode;
694    Py_UNICODE *p;
695    const char *errmsg = "";
696
697    /* Note: size will always be longer than the resulting Unicode
698       character count */
699    unicode = _PyUnicode_New(size);
700    if (!unicode)
701        return NULL;
702    if (size == 0)
703        return (PyObject *)unicode;
704
705    /* Unpack UTF-8 encoded data */
706    p = unicode->str;
707    e = s + size;
708
709    while (s < e) {
710        Py_UCS4 ch = (unsigned char)*s;
711
712        if (ch < 0x80) {
713            *p++ = (Py_UNICODE)ch;
714            s++;
715            continue;
716        }
717
718        n = utf8_code_length[ch];
719
720        if (s + n > e) {
721	    errmsg = "unexpected end of data";
722	    goto utf8Error;
723	}
724
725        switch (n) {
726
727        case 0:
728            errmsg = "unexpected code byte";
729	    goto utf8Error;
730
731        case 1:
732            errmsg = "internal error";
733	    goto utf8Error;
734
735        case 2:
736            if ((s[1] & 0xc0) != 0x80) {
737                errmsg = "invalid data";
738		goto utf8Error;
739	    }
740            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
741            if (ch < 0x80) {
742                errmsg = "illegal encoding";
743		goto utf8Error;
744	    }
745	    else
746		*p++ = (Py_UNICODE)ch;
747            break;
748
749        case 3:
750            if ((s[1] & 0xc0) != 0x80 ||
751                (s[2] & 0xc0) != 0x80) {
752                errmsg = "invalid data";
753		goto utf8Error;
754	    }
755            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
756            if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
757                errmsg = "illegal encoding";
758		goto utf8Error;
759	    }
760	    else
761				*p++ = (Py_UNICODE)ch;
762            break;
763
764        case 4:
765            if ((s[1] & 0xc0) != 0x80 ||
766                (s[2] & 0xc0) != 0x80 ||
767                (s[3] & 0xc0) != 0x80) {
768                errmsg = "invalid data";
769		goto utf8Error;
770	    }
771            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
772                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
773            /* validate and convert to UTF-16 */
774            if ((ch < 0x10000) ||   /* minimum value allowed for 4
775                                       byte encoding */
776                (ch > 0x10ffff)) {  /* maximum value allowed for
777                                       UTF-16 */
778                errmsg = "illegal encoding";
779		goto utf8Error;
780	    }
781            /*  compute and append the two surrogates: */
782
783            /*  translate from 10000..10FFFF to 0..FFFF */
784            ch -= 0x10000;
785
786            /*  high surrogate = top 10 bits added to D800 */
787            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
788
789            /*  low surrogate = bottom 10 bits added to DC00 */
790            *p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
791            break;
792
793        default:
794            /* Other sizes are only needed for UCS-4 */
795            errmsg = "unsupported Unicode code range";
796	    goto utf8Error;
797        }
798        s += n;
799	continue;
800
801    utf8Error:
802      if (utf8_decoding_error(&s, &p, errors, errmsg))
803          goto onError;
804    }
805
806    /* Adjust length */
807    if (_PyUnicode_Resize(&unicode, p - unicode->str))
808        goto onError;
809
810    return (PyObject *)unicode;
811
812onError:
813    Py_DECREF(unicode);
814    return NULL;
815}
816
817/* Not used anymore, now that the encoder supports UTF-16
818   surrogates. */
819#if 0
820static
821int utf8_encoding_error(const Py_UNICODE **source,
822			char **dest,
823			const char *errors,
824			const char *details)
825{
826    if ((errors == NULL) ||
827	(strcmp(errors,"strict") == 0)) {
828	PyErr_Format(PyExc_UnicodeError,
829		     "UTF-8 encoding error: %.400s",
830		     details);
831	return -1;
832    }
833    else if (strcmp(errors,"ignore") == 0) {
834	return 0;
835    }
836    else if (strcmp(errors,"replace") == 0) {
837	**dest = '?';
838	(*dest)++;
839	return 0;
840    }
841    else {
842	PyErr_Format(PyExc_ValueError,
843		     "UTF-8 encoding error; "
844		     "unknown error handling code: %.400s",
845		     errors);
846	return -1;
847    }
848}
849#endif
850
851PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
852			       int size,
853			       const char *errors)
854{
855    PyObject *v;
856    char *p;
857    char *q;
858    Py_UCS4 ch2;
859    unsigned int cbAllocated = 3 * size;
860    unsigned int cbWritten = 0;
861    int i = 0;
862
863    v = PyString_FromStringAndSize(NULL, cbAllocated);
864    if (v == NULL)
865        return NULL;
866    if (size == 0)
867        return v;
868
869    p = q = PyString_AS_STRING(v);
870    while (i < size) {
871        Py_UCS4 ch = s[i++];
872        if (ch < 0x80) {
873            *p++ = (char) ch;
874            cbWritten++;
875        }
876        else if (ch < 0x0800) {
877            *p++ = 0xc0 | (ch >> 6);
878            *p++ = 0x80 | (ch & 0x3f);
879            cbWritten += 2;
880        }
881        else {
882            /* Check for high surrogate */
883            if (0xD800 <= ch && ch <= 0xDBFF) {
884                if (i != size) {
885                    ch2 = s[i];
886                    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
887
888                        if (cbWritten >= (cbAllocated - 4)) {
889			    /* Provide enough room for some more
890			       surrogates */
891			    cbAllocated += 4*10;
892                            if (_PyString_Resize(&v, cbAllocated))
893				goto onError;
894                        }
895
896                        /* combine the two values */
897                        ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
898
899                        *p++ = (char)((ch >> 18) | 0xf0);
900                        *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
901                        i++;
902                        cbWritten += 4;
903                    }
904                }
905            }
906            else {
907                *p++ = (char)(0xe0 | (ch >> 12));
908                cbWritten += 3;
909            }
910            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
911            *p++ = (char)(0x80 | (ch & 0x3f));
912        }
913    }
914    *p = '\0';
915    if (_PyString_Resize(&v, p - q))
916	goto onError;
917    return v;
918
919 onError:
920    Py_DECREF(v);
921    return NULL;
922}
923
924PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
925{
926    if (!PyUnicode_Check(unicode)) {
927        PyErr_BadArgument();
928        return NULL;
929    }
930    return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
931				PyUnicode_GET_SIZE(unicode),
932				NULL);
933}
934
935/* --- UTF-16 Codec ------------------------------------------------------- */
936
937static
938int utf16_decoding_error(const Py_UNICODE **source,
939			 Py_UNICODE **dest,
940			 const char *errors,
941			 const char *details)
942{
943    if ((errors == NULL) ||
944        (strcmp(errors,"strict") == 0)) {
945        PyErr_Format(PyExc_UnicodeError,
946                     "UTF-16 decoding error: %.400s",
947                     details);
948        return -1;
949    }
950    else if (strcmp(errors,"ignore") == 0) {
951        return 0;
952    }
953    else if (strcmp(errors,"replace") == 0) {
954	if (dest) {
955	    **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
956	    (*dest)++;
957	}
958        return 0;
959    }
960    else {
961        PyErr_Format(PyExc_ValueError,
962                     "UTF-16 decoding error; "
963		     "unknown error handling code: %.400s",
964                     errors);
965        return -1;
966    }
967}
968
969PyObject *PyUnicode_DecodeUTF16(const char *s,
970				int size,
971				const char *errors,
972				int *byteorder)
973{
974    PyUnicodeObject *unicode;
975    Py_UNICODE *p;
976    const Py_UNICODE *q, *e;
977    int bo = 0;
978    const char *errmsg = "";
979
980    /* size should be an even number */
981    if (size % sizeof(Py_UNICODE) != 0) {
982	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
983	    return NULL;
984	/* The remaining input chars are ignored if we fall through
985           here... */
986    }
987
988    /* Note: size will always be longer than the resulting Unicode
989       character count */
990    unicode = _PyUnicode_New(size);
991    if (!unicode)
992        return NULL;
993    if (size == 0)
994        return (PyObject *)unicode;
995
996    /* Unpack UTF-16 encoded data */
997    p = unicode->str;
998    q = (Py_UNICODE *)s;
999    e = q + (size / sizeof(Py_UNICODE));
1000
1001    if (byteorder)
1002	bo = *byteorder;
1003
1004    while (q < e) {
1005	register Py_UNICODE ch = *q++;
1006
1007	/* Check for BOM marks (U+FEFF) in the input and adjust
1008	   current byte order setting accordingly. Swap input
1009	   bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
1010	   !) */
1011#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1012	if (ch == 0xFEFF) {
1013	    bo = -1;
1014	    continue;
1015	} else if (ch == 0xFFFE) {
1016	    bo = 1;
1017	    continue;
1018	}
1019	if (bo == 1)
1020	    ch = (ch >> 8) | (ch << 8);
1021#else
1022	if (ch == 0xFEFF) {
1023	    bo = 1;
1024	    continue;
1025	} else if (ch == 0xFFFE) {
1026	    bo = -1;
1027	    continue;
1028	}
1029	if (bo == -1)
1030	    ch = (ch >> 8) | (ch << 8);
1031#endif
1032	if (ch < 0xD800 || ch > 0xDFFF) {
1033	    *p++ = ch;
1034	    continue;
1035	}
1036
1037	/* UTF-16 code pair: */
1038	if (q >= e) {
1039	    errmsg = "unexpected end of data";
1040	    goto utf16Error;
1041	}
1042	if (0xDC00 <= *q && *q <= 0xDFFF) {
1043	    q++;
1044	    if (0xD800 <= *q && *q <= 0xDBFF) {
1045		/* This is valid data (a UTF-16 surrogate pair), but
1046		   we are not able to store this information since our
1047		   Py_UNICODE type only has 16 bits... this might
1048		   change someday, even though it's unlikely. */
1049		errmsg = "code pairs are not supported";
1050		goto utf16Error;
1051	    }
1052	    else
1053		continue;
1054	}
1055	errmsg = "illegal encoding";
1056	/* Fall through to report the error */
1057
1058    utf16Error:
1059	if (utf16_decoding_error(&q, &p, errors, errmsg))
1060	    goto onError;
1061    }
1062
1063    if (byteorder)
1064        *byteorder = bo;
1065
1066    /* Adjust length */
1067    if (_PyUnicode_Resize(&unicode, p - unicode->str))
1068        goto onError;
1069
1070    return (PyObject *)unicode;
1071
1072onError:
1073    Py_DECREF(unicode);
1074    return NULL;
1075}
1076
1077#undef UTF16_ERROR
1078
1079PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1080				int size,
1081				const char *errors,
1082				int byteorder)
1083{
1084    PyObject *v;
1085    Py_UNICODE *p;
1086    char *q;
1087
1088    /* We don't create UTF-16 pairs... */
1089    v = PyString_FromStringAndSize(NULL,
1090			sizeof(Py_UNICODE) * (size + (byteorder == 0)));
1091    if (v == NULL)
1092        return NULL;
1093
1094    q = PyString_AS_STRING(v);
1095    p = (Py_UNICODE *)q;
1096    if (byteorder == 0)
1097	*p++ = 0xFEFF;
1098    if (size == 0)
1099        return v;
1100    if (byteorder == 0 ||
1101#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1102	byteorder == -1
1103#else
1104	byteorder == 1
1105#endif
1106	)
1107	Py_UNICODE_COPY(p, s, size);
1108    else
1109	while (size-- > 0) {
1110	    Py_UNICODE ch = *s++;
1111	    *p++ = (ch >> 8) | (ch << 8);
1112	}
1113    return v;
1114}
1115
1116PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1117{
1118    if (!PyUnicode_Check(unicode)) {
1119        PyErr_BadArgument();
1120        return NULL;
1121    }
1122    return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1123				 PyUnicode_GET_SIZE(unicode),
1124				 NULL,
1125				 0);
1126}
1127
1128/* --- Unicode Escape Codec ----------------------------------------------- */
1129
1130static
1131int unicodeescape_decoding_error(const char **source,
1132                                 Py_UNICODE *x,
1133                                 const char *errors,
1134                                 const char *details)
1135{
1136    if ((errors == NULL) ||
1137        (strcmp(errors,"strict") == 0)) {
1138        PyErr_Format(PyExc_UnicodeError,
1139                     "Unicode-Escape decoding error: %.400s",
1140                     details);
1141        return -1;
1142    }
1143    else if (strcmp(errors,"ignore") == 0) {
1144        return 0;
1145    }
1146    else if (strcmp(errors,"replace") == 0) {
1147        *x = Py_UNICODE_REPLACEMENT_CHARACTER;
1148        return 0;
1149    }
1150    else {
1151        PyErr_Format(PyExc_ValueError,
1152                     "Unicode-Escape decoding error; "
1153                     "unknown error handling code: %.400s",
1154                     errors);
1155        return -1;
1156    }
1157}
1158
1159static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
1160
1161PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1162					int size,
1163					const char *errors)
1164{
1165    PyUnicodeObject *v;
1166    Py_UNICODE *p, *buf;
1167    const char *end;
1168    char* message;
1169    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1170
1171    /* Escaped strings will always be longer than the resulting
1172       Unicode string, so we start with size here and then reduce the
1173       length after conversion to the true value. */
1174    v = _PyUnicode_New(size);
1175    if (v == NULL)
1176        goto onError;
1177    if (size == 0)
1178        return (PyObject *)v;
1179
1180    p = buf = PyUnicode_AS_UNICODE(v);
1181    end = s + size;
1182
1183    while (s < end) {
1184        unsigned char c;
1185        Py_UNICODE x;
1186        int i, digits;
1187
1188        /* Non-escape characters are interpreted as Unicode ordinals */
1189        if (*s != '\\') {
1190            *p++ = (unsigned char) *s++;
1191            continue;
1192        }
1193
1194        /* \ - Escapes */
1195        s++;
1196        switch (*s++) {
1197
1198        /* \x escapes */
1199        case '\n': break;
1200        case '\\': *p++ = '\\'; break;
1201        case '\'': *p++ = '\''; break;
1202        case '\"': *p++ = '\"'; break;
1203        case 'b': *p++ = '\b'; break;
1204        case 'f': *p++ = '\014'; break; /* FF */
1205        case 't': *p++ = '\t'; break;
1206        case 'n': *p++ = '\n'; break;
1207        case 'r': *p++ = '\r'; break;
1208        case 'v': *p++ = '\013'; break; /* VT */
1209        case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1210
1211        /* \OOO (octal) escapes */
1212        case '0': case '1': case '2': case '3':
1213        case '4': case '5': case '6': case '7':
1214            x = s[-1] - '0';
1215            if ('0' <= *s && *s <= '7') {
1216                x = (x<<3) + *s++ - '0';
1217                if ('0' <= *s && *s <= '7')
1218                    x = (x<<3) + *s++ - '0';
1219            }
1220            *p++ = x;
1221            break;
1222
1223        /* hex escapes */
1224        /* \xXX */
1225        case 'x':
1226            digits = 2;
1227            message = "truncated \\xXX escape";
1228            goto hexescape;
1229
1230        /* \uXXXX */
1231        case 'u':
1232            digits = 4;
1233            message = "truncated \\uXXXX escape";
1234            goto hexescape;
1235
1236        /* \UXXXXXXXX */
1237        case 'U':
1238            digits = 8;
1239            message = "truncated \\UXXXXXXXX escape";
1240        hexescape:
1241            chr = 0;
1242            for (i = 0; i < digits; i++) {
1243                c = (unsigned char) s[i];
1244                if (!isxdigit(c)) {
1245                    if (unicodeescape_decoding_error(&s, &x, errors, message))
1246                        goto onError;
1247                    chr = x;
1248                    i++;
1249                    break;
1250                }
1251                chr = (chr<<4) & ~0xF;
1252                if (c >= '0' && c <= '9')
1253                    chr += c - '0';
1254                else if (c >= 'a' && c <= 'f')
1255                    chr += 10 + c - 'a';
1256                else
1257                    chr += 10 + c - 'A';
1258            }
1259            s += i;
1260        store:
1261            /* when we get here, chr is a 32-bit unicode character */
1262            if (chr <= 0xffff)
1263                /* UCS-2 character */
1264                *p++ = (Py_UNICODE) chr;
1265            else if (chr <= 0x10ffff) {
1266                /* UCS-4 character.  store as two surrogate characters */
1267                chr -= 0x10000L;
1268                *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1269                *p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
1270            } else {
1271                if (unicodeescape_decoding_error(
1272                    &s, &x, errors,
1273                    "illegal Unicode character")
1274                    )
1275                    goto onError;
1276                *p++ = x; /* store replacement character */
1277            }
1278            break;
1279
1280        /* \N{name} */
1281        case 'N':
1282            message = "malformed \\N character escape";
1283            if (ucnhash_CAPI == NULL) {
1284                /* load the unicode data module */
1285                PyObject *m, *v;
1286                m = PyImport_ImportModule("unicodedata");
1287                if (m == NULL)
1288                    goto ucnhashError;
1289                v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1290                Py_DECREF(m);
1291                if (v == NULL)
1292                    goto ucnhashError;
1293                ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1294                Py_DECREF(v);
1295                if (ucnhash_CAPI == NULL)
1296                    goto ucnhashError;
1297            }
1298            if (*s == '{') {
1299                const char *start = s+1;
1300                /* look for the closing brace */
1301                while (*s != '}' && s < end)
1302                    s++;
1303                if (s > start && s < end && *s == '}') {
1304                    /* found a name.  look it up in the unicode database */
1305                    message = "unknown Unicode character name";
1306                    s++;
1307                    if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1308                        goto store;
1309                }
1310            }
1311            if (unicodeescape_decoding_error(&s, &x, errors, message))
1312                goto onError;
1313            *p++ = x;
1314            break;
1315
1316        default:
1317            *p++ = '\\';
1318            *p++ = (unsigned char)s[-1];
1319            break;
1320        }
1321    }
1322    if (_PyUnicode_Resize(&v, (int)(p - buf)))
1323		goto onError;
1324    return (PyObject *)v;
1325
1326ucnhashError:
1327    PyErr_SetString(
1328        PyExc_UnicodeError,
1329        "\\N escapes not supported (can't load unicodedata module)"
1330        );
1331    return NULL;
1332
1333onError:
1334    Py_XDECREF(v);
1335    return NULL;
1336}
1337
1338/* Return a Unicode-Escape string version of the Unicode object.
1339
1340   If quotes is true, the string is enclosed in u"" or u'' quotes as
1341   appropriate.
1342
1343*/
1344
1345static const Py_UNICODE *findchar(const Py_UNICODE *s,
1346				  int size,
1347				  Py_UNICODE ch);
1348
1349static
1350PyObject *unicodeescape_string(const Py_UNICODE *s,
1351                               int size,
1352                               int quotes)
1353{
1354    PyObject *repr;
1355    char *p;
1356    char *q;
1357
1358    static const char *hexdigit = "0123456789abcdef";
1359
1360    repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1361    if (repr == NULL)
1362        return NULL;
1363
1364    p = q = PyString_AS_STRING(repr);
1365
1366    if (quotes) {
1367        *p++ = 'u';
1368        *p++ = (findchar(s, size, '\'') &&
1369                !findchar(s, size, '"')) ? '"' : '\'';
1370    }
1371    while (size-- > 0) {
1372        Py_UNICODE ch = *s++;
1373        /* Escape quotes */
1374        if (quotes && (ch == q[1] || ch == '\\')) {
1375            *p++ = '\\';
1376            *p++ = (char) ch;
1377        }
1378        /* Map 16-bit characters to '\uxxxx' */
1379        else if (ch >= 256) {
1380            *p++ = '\\';
1381            *p++ = 'u';
1382            *p++ = hexdigit[(ch >> 12) & 0xf];
1383            *p++ = hexdigit[(ch >> 8) & 0xf];
1384            *p++ = hexdigit[(ch >> 4) & 0xf];
1385            *p++ = hexdigit[ch & 15];
1386        }
1387        /* Map special whitespace to '\t', \n', '\r' */
1388        else if (ch == '\t') {
1389            *p++ = '\\';
1390            *p++ = 't';
1391        }
1392        else if (ch == '\n') {
1393            *p++ = '\\';
1394            *p++ = 'n';
1395        }
1396        else if (ch == '\r') {
1397            *p++ = '\\';
1398            *p++ = 'r';
1399        }
1400        /* Map non-printable US ASCII to '\xhh' */
1401        else if (ch < ' ' || ch >= 128) {
1402            *p++ = '\\';
1403            *p++ = 'x';
1404            *p++ = hexdigit[(ch >> 4) & 0xf];
1405            *p++ = hexdigit[ch & 15];
1406        }
1407        /* Copy everything else as-is */
1408        else
1409            *p++ = (char) ch;
1410    }
1411    if (quotes)
1412        *p++ = q[1];
1413
1414    *p = '\0';
1415    if (_PyString_Resize(&repr, p - q))
1416	goto onError;
1417
1418    return repr;
1419
1420 onError:
1421    Py_DECREF(repr);
1422    return NULL;
1423}
1424
1425PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1426					int size)
1427{
1428    return unicodeescape_string(s, size, 0);
1429}
1430
1431PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1432{
1433    if (!PyUnicode_Check(unicode)) {
1434        PyErr_BadArgument();
1435        return NULL;
1436    }
1437    return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1438					 PyUnicode_GET_SIZE(unicode));
1439}
1440
1441/* --- Raw Unicode Escape Codec ------------------------------------------- */
1442
1443PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1444					   int size,
1445					   const char *errors)
1446{
1447    PyUnicodeObject *v;
1448    Py_UNICODE *p, *buf;
1449    const char *end;
1450    const char *bs;
1451
1452    /* Escaped strings will always be longer than the resulting
1453       Unicode string, so we start with size here and then reduce the
1454       length after conversion to the true value. */
1455    v = _PyUnicode_New(size);
1456    if (v == NULL)
1457	goto onError;
1458    if (size == 0)
1459	return (PyObject *)v;
1460    p = buf = PyUnicode_AS_UNICODE(v);
1461    end = s + size;
1462    while (s < end) {
1463	unsigned char c;
1464	Py_UNICODE x;
1465	int i;
1466
1467	/* Non-escape characters are interpreted as Unicode ordinals */
1468	if (*s != '\\') {
1469	    *p++ = (unsigned char)*s++;
1470	    continue;
1471	}
1472
1473	/* \u-escapes are only interpreted iff the number of leading
1474	   backslashes if odd */
1475	bs = s;
1476	for (;s < end;) {
1477	    if (*s != '\\')
1478		break;
1479	    *p++ = (unsigned char)*s++;
1480	}
1481	if (((s - bs) & 1) == 0 ||
1482	    s >= end ||
1483	    *s != 'u') {
1484	    continue;
1485	}
1486	p--;
1487	s++;
1488
1489	/* \uXXXX with 4 hex digits */
1490	for (x = 0, i = 0; i < 4; i++) {
1491	    c = (unsigned char)s[i];
1492	    if (!isxdigit(c)) {
1493		if (unicodeescape_decoding_error(&s, &x, errors,
1494						 "truncated \\uXXXX"))
1495		    goto onError;
1496		i++;
1497		break;
1498	    }
1499	    x = (x<<4) & ~0xF;
1500	    if (c >= '0' && c <= '9')
1501		x += c - '0';
1502	    else if (c >= 'a' && c <= 'f')
1503		x += 10 + c - 'a';
1504	    else
1505		x += 10 + c - 'A';
1506	}
1507	s += i;
1508	*p++ = x;
1509    }
1510    if (_PyUnicode_Resize(&v, (int)(p - buf)))
1511	goto onError;
1512    return (PyObject *)v;
1513
1514 onError:
1515    Py_XDECREF(v);
1516    return NULL;
1517}
1518
1519PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1520					   int size)
1521{
1522    PyObject *repr;
1523    char *p;
1524    char *q;
1525
1526    static const char *hexdigit = "0123456789abcdef";
1527
1528    repr = PyString_FromStringAndSize(NULL, 6 * size);
1529    if (repr == NULL)
1530        return NULL;
1531    if (size == 0)
1532	return repr;
1533
1534    p = q = PyString_AS_STRING(repr);
1535    while (size-- > 0) {
1536        Py_UNICODE ch = *s++;
1537	/* Map 16-bit characters to '\uxxxx' */
1538	if (ch >= 256) {
1539            *p++ = '\\';
1540            *p++ = 'u';
1541            *p++ = hexdigit[(ch >> 12) & 0xf];
1542            *p++ = hexdigit[(ch >> 8) & 0xf];
1543            *p++ = hexdigit[(ch >> 4) & 0xf];
1544            *p++ = hexdigit[ch & 15];
1545        }
1546	/* Copy everything else as-is */
1547	else
1548            *p++ = (char) ch;
1549    }
1550    *p = '\0';
1551    if (_PyString_Resize(&repr, p - q))
1552	goto onError;
1553
1554    return repr;
1555
1556 onError:
1557    Py_DECREF(repr);
1558    return NULL;
1559}
1560
1561PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1562{
1563    if (!PyUnicode_Check(unicode)) {
1564	PyErr_BadArgument();
1565	return NULL;
1566    }
1567    return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1568					    PyUnicode_GET_SIZE(unicode));
1569}
1570
1571/* --- Latin-1 Codec ------------------------------------------------------ */
1572
1573PyObject *PyUnicode_DecodeLatin1(const char *s,
1574				 int size,
1575				 const char *errors)
1576{
1577    PyUnicodeObject *v;
1578    Py_UNICODE *p;
1579
1580    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1581    if (size == 1 && *(unsigned char*)s < 256) {
1582	Py_UNICODE r = *(unsigned char*)s;
1583	return PyUnicode_FromUnicode(&r, 1);
1584    }
1585
1586    v = _PyUnicode_New(size);
1587    if (v == NULL)
1588	goto onError;
1589    if (size == 0)
1590	return (PyObject *)v;
1591    p = PyUnicode_AS_UNICODE(v);
1592    while (size-- > 0)
1593	*p++ = (unsigned char)*s++;
1594    return (PyObject *)v;
1595
1596 onError:
1597    Py_XDECREF(v);
1598    return NULL;
1599}
1600
1601static
1602int latin1_encoding_error(const Py_UNICODE **source,
1603			  char **dest,
1604			  const char *errors,
1605			  const char *details)
1606{
1607    if ((errors == NULL) ||
1608	(strcmp(errors,"strict") == 0)) {
1609	PyErr_Format(PyExc_UnicodeError,
1610		     "Latin-1 encoding error: %.400s",
1611		     details);
1612	return -1;
1613    }
1614    else if (strcmp(errors,"ignore") == 0) {
1615	return 0;
1616    }
1617    else if (strcmp(errors,"replace") == 0) {
1618	**dest = '?';
1619	(*dest)++;
1620	return 0;
1621    }
1622    else {
1623	PyErr_Format(PyExc_ValueError,
1624		     "Latin-1 encoding error; "
1625		     "unknown error handling code: %.400s",
1626		     errors);
1627	return -1;
1628    }
1629}
1630
1631PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1632				 int size,
1633				 const char *errors)
1634{
1635    PyObject *repr;
1636    char *s, *start;
1637
1638    repr = PyString_FromStringAndSize(NULL, size);
1639    if (repr == NULL)
1640        return NULL;
1641    if (size == 0)
1642	return repr;
1643
1644    s = PyString_AS_STRING(repr);
1645    start = s;
1646    while (size-- > 0) {
1647        Py_UNICODE ch = *p++;
1648	if (ch >= 256) {
1649	    if (latin1_encoding_error(&p, &s, errors,
1650				      "ordinal not in range(256)"))
1651		goto onError;
1652	}
1653	else
1654            *s++ = (char)ch;
1655    }
1656    /* Resize if error handling skipped some characters */
1657    if (s - start < PyString_GET_SIZE(repr))
1658	if (_PyString_Resize(&repr, s - start))
1659	    goto onError;
1660    return repr;
1661
1662 onError:
1663    Py_DECREF(repr);
1664    return NULL;
1665}
1666
1667PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1668{
1669    if (!PyUnicode_Check(unicode)) {
1670	PyErr_BadArgument();
1671	return NULL;
1672    }
1673    return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1674				  PyUnicode_GET_SIZE(unicode),
1675				  NULL);
1676}
1677
1678/* --- 7-bit ASCII Codec -------------------------------------------------- */
1679
1680static
1681int ascii_decoding_error(const char **source,
1682			 Py_UNICODE **dest,
1683			 const char *errors,
1684			 const char *details)
1685{
1686    if ((errors == NULL) ||
1687	(strcmp(errors,"strict") == 0)) {
1688	PyErr_Format(PyExc_UnicodeError,
1689		     "ASCII decoding error: %.400s",
1690		     details);
1691	return -1;
1692    }
1693    else if (strcmp(errors,"ignore") == 0) {
1694	return 0;
1695    }
1696    else if (strcmp(errors,"replace") == 0) {
1697	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1698	(*dest)++;
1699	return 0;
1700    }
1701    else {
1702	PyErr_Format(PyExc_ValueError,
1703		     "ASCII decoding error; "
1704		     "unknown error handling code: %.400s",
1705		     errors);
1706	return -1;
1707    }
1708}
1709
1710PyObject *PyUnicode_DecodeASCII(const char *s,
1711				int size,
1712				const char *errors)
1713{
1714    PyUnicodeObject *v;
1715    Py_UNICODE *p;
1716
1717    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1718    if (size == 1 && *(unsigned char*)s < 128) {
1719	Py_UNICODE r = *(unsigned char*)s;
1720	return PyUnicode_FromUnicode(&r, 1);
1721    }
1722
1723    v = _PyUnicode_New(size);
1724    if (v == NULL)
1725	goto onError;
1726    if (size == 0)
1727	return (PyObject *)v;
1728    p = PyUnicode_AS_UNICODE(v);
1729    while (size-- > 0) {
1730	register unsigned char c;
1731
1732	c = (unsigned char)*s++;
1733	if (c < 128)
1734	    *p++ = c;
1735	else if (ascii_decoding_error(&s, &p, errors,
1736				      "ordinal not in range(128)"))
1737		goto onError;
1738    }
1739    if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
1740	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
1741	    goto onError;
1742    return (PyObject *)v;
1743
1744 onError:
1745    Py_XDECREF(v);
1746    return NULL;
1747}
1748
1749static
1750int ascii_encoding_error(const Py_UNICODE **source,
1751			 char **dest,
1752			 const char *errors,
1753			 const char *details)
1754{
1755    if ((errors == NULL) ||
1756	(strcmp(errors,"strict") == 0)) {
1757	PyErr_Format(PyExc_UnicodeError,
1758		     "ASCII encoding error: %.400s",
1759		     details);
1760	return -1;
1761    }
1762    else if (strcmp(errors,"ignore") == 0) {
1763	return 0;
1764    }
1765    else if (strcmp(errors,"replace") == 0) {
1766	**dest = '?';
1767	(*dest)++;
1768	return 0;
1769    }
1770    else {
1771	PyErr_Format(PyExc_ValueError,
1772		     "ASCII encoding error; "
1773		     "unknown error handling code: %.400s",
1774		     errors);
1775	return -1;
1776    }
1777}
1778
1779PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1780				int size,
1781				const char *errors)
1782{
1783    PyObject *repr;
1784    char *s, *start;
1785
1786    repr = PyString_FromStringAndSize(NULL, size);
1787    if (repr == NULL)
1788        return NULL;
1789    if (size == 0)
1790	return repr;
1791
1792    s = PyString_AS_STRING(repr);
1793    start = s;
1794    while (size-- > 0) {
1795        Py_UNICODE ch = *p++;
1796	if (ch >= 128) {
1797	    if (ascii_encoding_error(&p, &s, errors,
1798				      "ordinal not in range(128)"))
1799		goto onError;
1800	}
1801	else
1802            *s++ = (char)ch;
1803    }
1804    /* Resize if error handling skipped some characters */
1805    if (s - start < PyString_GET_SIZE(repr))
1806	if (_PyString_Resize(&repr, s - start))
1807	    goto onError;
1808    return repr;
1809
1810 onError:
1811    Py_DECREF(repr);
1812    return NULL;
1813}
1814
1815PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1816{
1817    if (!PyUnicode_Check(unicode)) {
1818	PyErr_BadArgument();
1819	return NULL;
1820    }
1821    return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1822				 PyUnicode_GET_SIZE(unicode),
1823				 NULL);
1824}
1825
1826#ifdef MS_WIN32
1827
1828/* --- MBCS codecs for Windows -------------------------------------------- */
1829
1830PyObject *PyUnicode_DecodeMBCS(const char *s,
1831				int size,
1832				const char *errors)
1833{
1834    PyUnicodeObject *v;
1835    Py_UNICODE *p;
1836
1837    /* First get the size of the result */
1838    DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
1839    if (size > 0 && usize==0)
1840        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1841
1842    v = _PyUnicode_New(usize);
1843    if (v == NULL)
1844        return NULL;
1845    if (usize == 0)
1846	return (PyObject *)v;
1847    p = PyUnicode_AS_UNICODE(v);
1848    if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1849        Py_DECREF(v);
1850        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1851    }
1852
1853    return (PyObject *)v;
1854}
1855
1856PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1857				int size,
1858				const char *errors)
1859{
1860    PyObject *repr;
1861    char *s;
1862    DWORD mbcssize;
1863
1864    /* If there are no characters, bail now! */
1865    if (size==0)
1866	    return PyString_FromString("");
1867
1868    /* First get the size of the result */
1869    mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
1870    if (mbcssize==0)
1871        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1872
1873    repr = PyString_FromStringAndSize(NULL, mbcssize);
1874    if (repr == NULL)
1875        return NULL;
1876    if (mbcssize == 0)
1877        return repr;
1878
1879    /* Do the conversion */
1880    s = PyString_AS_STRING(repr);
1881    if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1882        Py_DECREF(repr);
1883        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1884    }
1885    return repr;
1886}
1887
1888#endif /* MS_WIN32 */
1889
1890/* --- Character Mapping Codec -------------------------------------------- */
1891
1892static
1893int charmap_decoding_error(const char **source,
1894			 Py_UNICODE **dest,
1895			 const char *errors,
1896			 const char *details)
1897{
1898    if ((errors == NULL) ||
1899	(strcmp(errors,"strict") == 0)) {
1900	PyErr_Format(PyExc_UnicodeError,
1901		     "charmap decoding error: %.400s",
1902		     details);
1903	return -1;
1904    }
1905    else if (strcmp(errors,"ignore") == 0) {
1906	return 0;
1907    }
1908    else if (strcmp(errors,"replace") == 0) {
1909	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1910	(*dest)++;
1911	return 0;
1912    }
1913    else {
1914	PyErr_Format(PyExc_ValueError,
1915		     "charmap decoding error; "
1916		     "unknown error handling code: %.400s",
1917		     errors);
1918	return -1;
1919    }
1920}
1921
1922PyObject *PyUnicode_DecodeCharmap(const char *s,
1923				  int size,
1924				  PyObject *mapping,
1925				  const char *errors)
1926{
1927    PyUnicodeObject *v;
1928    Py_UNICODE *p;
1929    int extrachars = 0;
1930
1931    /* Default to Latin-1 */
1932    if (mapping == NULL)
1933	return PyUnicode_DecodeLatin1(s, size, errors);
1934
1935    v = _PyUnicode_New(size);
1936    if (v == NULL)
1937	goto onError;
1938    if (size == 0)
1939	return (PyObject *)v;
1940    p = PyUnicode_AS_UNICODE(v);
1941    while (size-- > 0) {
1942	unsigned char ch = *s++;
1943	PyObject *w, *x;
1944
1945	/* Get mapping (char ordinal -> integer, Unicode char or None) */
1946	w = PyInt_FromLong((long)ch);
1947	if (w == NULL)
1948	    goto onError;
1949	x = PyObject_GetItem(mapping, w);
1950	Py_DECREF(w);
1951	if (x == NULL) {
1952	    if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1953		/* No mapping found means: mapping is undefined. */
1954		PyErr_Clear();
1955		x = Py_None;
1956		Py_INCREF(x);
1957	    } else
1958		goto onError;
1959	}
1960
1961	/* Apply mapping */
1962	if (PyInt_Check(x)) {
1963	    long value = PyInt_AS_LONG(x);
1964	    if (value < 0 || value > 65535) {
1965		PyErr_SetString(PyExc_TypeError,
1966				"character mapping must be in range(65536)");
1967		Py_DECREF(x);
1968		goto onError;
1969	    }
1970	    *p++ = (Py_UNICODE)value;
1971	}
1972	else if (x == Py_None) {
1973	    /* undefined mapping */
1974	    if (charmap_decoding_error(&s, &p, errors,
1975				       "character maps to <undefined>")) {
1976		Py_DECREF(x);
1977		goto onError;
1978	    }
1979	}
1980	else if (PyUnicode_Check(x)) {
1981	    int targetsize = PyUnicode_GET_SIZE(x);
1982
1983	    if (targetsize == 1)
1984		/* 1-1 mapping */
1985		*p++ = *PyUnicode_AS_UNICODE(x);
1986
1987	    else if (targetsize > 1) {
1988		/* 1-n mapping */
1989		if (targetsize > extrachars) {
1990		    /* resize first */
1991		    int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
1992		    int needed = (targetsize - extrachars) + \
1993			         (targetsize << 2);
1994		    extrachars += needed;
1995		    if (_PyUnicode_Resize(&v,
1996					 PyUnicode_GET_SIZE(v) + needed)) {
1997			Py_DECREF(x);
1998			goto onError;
1999		    }
2000		    p = PyUnicode_AS_UNICODE(v) + oldpos;
2001		}
2002		Py_UNICODE_COPY(p,
2003				PyUnicode_AS_UNICODE(x),
2004				targetsize);
2005		p += targetsize;
2006		extrachars -= targetsize;
2007	    }
2008	    /* 1-0 mapping: skip the character */
2009	}
2010	else {
2011	    /* wrong return value */
2012	    PyErr_SetString(PyExc_TypeError,
2013		  "character mapping must return integer, None or unicode");
2014	    Py_DECREF(x);
2015	    goto onError;
2016	}
2017	Py_DECREF(x);
2018    }
2019    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2020	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2021	    goto onError;
2022    return (PyObject *)v;
2023
2024 onError:
2025    Py_XDECREF(v);
2026    return NULL;
2027}
2028
2029static
2030int charmap_encoding_error(const Py_UNICODE **source,
2031			   char **dest,
2032			   const char *errors,
2033			   const char *details)
2034{
2035    if ((errors == NULL) ||
2036	(strcmp(errors,"strict") == 0)) {
2037	PyErr_Format(PyExc_UnicodeError,
2038		     "charmap encoding error: %.400s",
2039		     details);
2040	return -1;
2041    }
2042    else if (strcmp(errors,"ignore") == 0) {
2043	return 0;
2044    }
2045    else if (strcmp(errors,"replace") == 0) {
2046	**dest = '?';
2047	(*dest)++;
2048	return 0;
2049    }
2050    else {
2051	PyErr_Format(PyExc_ValueError,
2052		     "charmap encoding error; "
2053		     "unknown error handling code: %.400s",
2054		     errors);
2055	return -1;
2056    }
2057}
2058
2059PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2060				  int size,
2061				  PyObject *mapping,
2062				  const char *errors)
2063{
2064    PyObject *v;
2065    char *s;
2066    int extrachars = 0;
2067
2068    /* Default to Latin-1 */
2069    if (mapping == NULL)
2070	return PyUnicode_EncodeLatin1(p, size, errors);
2071
2072    v = PyString_FromStringAndSize(NULL, size);
2073    if (v == NULL)
2074        return NULL;
2075    if (size == 0)
2076	return v;
2077    s = PyString_AS_STRING(v);
2078    while (size-- > 0) {
2079	Py_UNICODE ch = *p++;
2080	PyObject *w, *x;
2081
2082	/* Get mapping (Unicode ordinal -> string char, integer or None) */
2083	w = PyInt_FromLong((long)ch);
2084	if (w == NULL)
2085	    goto onError;
2086	x = PyObject_GetItem(mapping, w);
2087	Py_DECREF(w);
2088	if (x == NULL) {
2089	    if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2090		/* No mapping found means: mapping is undefined. */
2091		PyErr_Clear();
2092		x = Py_None;
2093		Py_INCREF(x);
2094	    } else
2095		goto onError;
2096	}
2097
2098	/* Apply mapping */
2099	if (PyInt_Check(x)) {
2100	    long value = PyInt_AS_LONG(x);
2101	    if (value < 0 || value > 255) {
2102		PyErr_SetString(PyExc_TypeError,
2103				"character mapping must be in range(256)");
2104		Py_DECREF(x);
2105		goto onError;
2106	    }
2107	    *s++ = (char)value;
2108	}
2109	else if (x == Py_None) {
2110	    /* undefined mapping */
2111	    if (charmap_encoding_error(&p, &s, errors,
2112				       "character maps to <undefined>")) {
2113		Py_DECREF(x);
2114		goto onError;
2115	    }
2116	}
2117	else if (PyString_Check(x)) {
2118	    int targetsize = PyString_GET_SIZE(x);
2119
2120	    if (targetsize == 1)
2121		/* 1-1 mapping */
2122		*s++ = *PyString_AS_STRING(x);
2123
2124	    else if (targetsize > 1) {
2125		/* 1-n mapping */
2126		if (targetsize > extrachars) {
2127		    /* resize first */
2128		    int oldpos = (int)(s - PyString_AS_STRING(v));
2129		    int needed = (targetsize - extrachars) + \
2130			         (targetsize << 2);
2131		    extrachars += needed;
2132		    if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
2133			Py_DECREF(x);
2134			goto onError;
2135		    }
2136		    s = PyString_AS_STRING(v) + oldpos;
2137		}
2138		memcpy(s, PyString_AS_STRING(x), targetsize);
2139		s += targetsize;
2140		extrachars -= targetsize;
2141	    }
2142	    /* 1-0 mapping: skip the character */
2143	}
2144	else {
2145	    /* wrong return value */
2146	    PyErr_SetString(PyExc_TypeError,
2147		  "character mapping must return integer, None or unicode");
2148	    Py_DECREF(x);
2149	    goto onError;
2150	}
2151	Py_DECREF(x);
2152    }
2153    if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2154	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2155	    goto onError;
2156    return v;
2157
2158 onError:
2159    Py_DECREF(v);
2160    return NULL;
2161}
2162
2163PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2164				    PyObject *mapping)
2165{
2166    if (!PyUnicode_Check(unicode) || mapping == NULL) {
2167	PyErr_BadArgument();
2168	return NULL;
2169    }
2170    return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2171				   PyUnicode_GET_SIZE(unicode),
2172				   mapping,
2173				   NULL);
2174}
2175
2176static
2177int translate_error(const Py_UNICODE **source,
2178		    Py_UNICODE **dest,
2179		    const char *errors,
2180		    const char *details)
2181{
2182    if ((errors == NULL) ||
2183	(strcmp(errors,"strict") == 0)) {
2184	PyErr_Format(PyExc_UnicodeError,
2185		     "translate error: %.400s",
2186		     details);
2187	return -1;
2188    }
2189    else if (strcmp(errors,"ignore") == 0) {
2190	return 0;
2191    }
2192    else if (strcmp(errors,"replace") == 0) {
2193	**dest = '?';
2194	(*dest)++;
2195	return 0;
2196    }
2197    else {
2198	PyErr_Format(PyExc_ValueError,
2199		     "translate error; "
2200		     "unknown error handling code: %.400s",
2201		     errors);
2202	return -1;
2203    }
2204}
2205
2206PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2207				     int size,
2208				     PyObject *mapping,
2209				     const char *errors)
2210{
2211    PyUnicodeObject *v;
2212    Py_UNICODE *p;
2213
2214    if (mapping == NULL) {
2215	PyErr_BadArgument();
2216	return NULL;
2217    }
2218
2219    /* Output will never be longer than input */
2220    v = _PyUnicode_New(size);
2221    if (v == NULL)
2222	goto onError;
2223    if (size == 0)
2224	goto done;
2225    p = PyUnicode_AS_UNICODE(v);
2226    while (size-- > 0) {
2227	Py_UNICODE ch = *s++;
2228	PyObject *w, *x;
2229
2230	/* Get mapping */
2231	w = PyInt_FromLong(ch);
2232	if (w == NULL)
2233	    goto onError;
2234	x = PyObject_GetItem(mapping, w);
2235	Py_DECREF(w);
2236	if (x == NULL) {
2237	    if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2238		/* No mapping found: default to 1-1 mapping */
2239		PyErr_Clear();
2240		*p++ = ch;
2241		continue;
2242	    }
2243	    goto onError;
2244	}
2245
2246	/* Apply mapping */
2247	if (PyInt_Check(x))
2248	    *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2249	else if (x == Py_None) {
2250	    /* undefined mapping */
2251	    if (translate_error(&s, &p, errors,
2252				"character maps to <undefined>")) {
2253		Py_DECREF(x);
2254		goto onError;
2255	    }
2256	}
2257	else if (PyUnicode_Check(x)) {
2258	    if (PyUnicode_GET_SIZE(x) != 1) {
2259		/* 1-n mapping */
2260		PyErr_SetString(PyExc_NotImplementedError,
2261				"1-n mappings are currently not implemented");
2262		Py_DECREF(x);
2263		goto onError;
2264	    }
2265	    *p++ = *PyUnicode_AS_UNICODE(x);
2266	}
2267	else {
2268	    /* wrong return value */
2269	    PyErr_SetString(PyExc_TypeError,
2270		  "translate mapping must return integer, None or unicode");
2271	    Py_DECREF(x);
2272	    goto onError;
2273	}
2274	Py_DECREF(x);
2275    }
2276    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2277	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2278	    goto onError;
2279
2280 done:
2281    return (PyObject *)v;
2282
2283 onError:
2284    Py_XDECREF(v);
2285    return NULL;
2286}
2287
2288PyObject *PyUnicode_Translate(PyObject *str,
2289			      PyObject *mapping,
2290			      const char *errors)
2291{
2292    PyObject *result;
2293
2294    str = PyUnicode_FromObject(str);
2295    if (str == NULL)
2296	goto onError;
2297    result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2298					PyUnicode_GET_SIZE(str),
2299					mapping,
2300					errors);
2301    Py_DECREF(str);
2302    return result;
2303
2304 onError:
2305    Py_XDECREF(str);
2306    return NULL;
2307}
2308
2309/* --- Decimal Encoder ---------------------------------------------------- */
2310
2311int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2312			    int length,
2313			    char *output,
2314			    const char *errors)
2315{
2316    Py_UNICODE *p, *end;
2317
2318    if (output == NULL) {
2319	PyErr_BadArgument();
2320	return -1;
2321    }
2322
2323    p = s;
2324    end = s + length;
2325    while (p < end) {
2326	register Py_UNICODE ch = *p++;
2327	int decimal;
2328
2329	if (Py_UNICODE_ISSPACE(ch)) {
2330	    *output++ = ' ';
2331	    continue;
2332	}
2333	decimal = Py_UNICODE_TODECIMAL(ch);
2334	if (decimal >= 0) {
2335	    *output++ = '0' + decimal;
2336	    continue;
2337	}
2338	if (0 < ch && ch < 256) {
2339	    *output++ = (char)ch;
2340	    continue;
2341	}
2342	/* All other characters are considered invalid */
2343	if (errors == NULL || strcmp(errors, "strict") == 0) {
2344	    PyErr_SetString(PyExc_ValueError,
2345			    "invalid decimal Unicode string");
2346	    goto onError;
2347	}
2348	else if (strcmp(errors, "ignore") == 0)
2349	    continue;
2350	else if (strcmp(errors, "replace") == 0) {
2351	    *output++ = '?';
2352	    continue;
2353	}
2354    }
2355    /* 0-terminate the output string */
2356    *output++ = '\0';
2357    return 0;
2358
2359 onError:
2360    return -1;
2361}
2362
2363/* --- Helpers ------------------------------------------------------------ */
2364
2365static
2366int count(PyUnicodeObject *self,
2367	  int start,
2368	  int end,
2369	  PyUnicodeObject *substring)
2370{
2371    int count = 0;
2372
2373    if (start < 0)
2374        start += self->length;
2375    if (start < 0)
2376        start = 0;
2377    if (end > self->length)
2378        end = self->length;
2379    if (end < 0)
2380        end += self->length;
2381    if (end < 0)
2382        end = 0;
2383
2384    if (substring->length == 0)
2385	return (end - start + 1);
2386
2387    end -= substring->length;
2388
2389    while (start <= end)
2390        if (Py_UNICODE_MATCH(self, start, substring)) {
2391            count++;
2392            start += substring->length;
2393        } else
2394            start++;
2395
2396    return count;
2397}
2398
2399int PyUnicode_Count(PyObject *str,
2400		    PyObject *substr,
2401		    int start,
2402		    int end)
2403{
2404    int result;
2405
2406    str = PyUnicode_FromObject(str);
2407    if (str == NULL)
2408	return -1;
2409    substr = PyUnicode_FromObject(substr);
2410    if (substr == NULL) {
2411	Py_DECREF(str);
2412	return -1;
2413    }
2414
2415    result = count((PyUnicodeObject *)str,
2416		   start, end,
2417		   (PyUnicodeObject *)substr);
2418
2419    Py_DECREF(str);
2420    Py_DECREF(substr);
2421    return result;
2422}
2423
2424static
2425int findstring(PyUnicodeObject *self,
2426	       PyUnicodeObject *substring,
2427	       int start,
2428	       int end,
2429	       int direction)
2430{
2431    if (start < 0)
2432        start += self->length;
2433    if (start < 0)
2434        start = 0;
2435
2436    if (substring->length == 0)
2437        return start;
2438
2439    if (end > self->length)
2440        end = self->length;
2441    if (end < 0)
2442        end += self->length;
2443    if (end < 0)
2444        end = 0;
2445
2446    end -= substring->length;
2447
2448    if (direction < 0) {
2449        for (; end >= start; end--)
2450            if (Py_UNICODE_MATCH(self, end, substring))
2451                return end;
2452    } else {
2453        for (; start <= end; start++)
2454            if (Py_UNICODE_MATCH(self, start, substring))
2455                return start;
2456    }
2457
2458    return -1;
2459}
2460
2461int PyUnicode_Find(PyObject *str,
2462		   PyObject *substr,
2463		   int start,
2464		   int end,
2465		   int direction)
2466{
2467    int result;
2468
2469    str = PyUnicode_FromObject(str);
2470    if (str == NULL)
2471	return -1;
2472    substr = PyUnicode_FromObject(substr);
2473    if (substr == NULL) {
2474	Py_DECREF(substr);
2475	return -1;
2476    }
2477
2478    result = findstring((PyUnicodeObject *)str,
2479			(PyUnicodeObject *)substr,
2480			start, end, direction);
2481    Py_DECREF(str);
2482    Py_DECREF(substr);
2483    return result;
2484}
2485
2486static
2487int tailmatch(PyUnicodeObject *self,
2488	      PyUnicodeObject *substring,
2489	      int start,
2490	      int end,
2491	      int direction)
2492{
2493    if (start < 0)
2494        start += self->length;
2495    if (start < 0)
2496        start = 0;
2497
2498    if (substring->length == 0)
2499        return 1;
2500
2501    if (end > self->length)
2502        end = self->length;
2503    if (end < 0)
2504        end += self->length;
2505    if (end < 0)
2506        end = 0;
2507
2508    end -= substring->length;
2509    if (end < start)
2510	return 0;
2511
2512    if (direction > 0) {
2513	if (Py_UNICODE_MATCH(self, end, substring))
2514	    return 1;
2515    } else {
2516        if (Py_UNICODE_MATCH(self, start, substring))
2517	    return 1;
2518    }
2519
2520    return 0;
2521}
2522
2523int PyUnicode_Tailmatch(PyObject *str,
2524			PyObject *substr,
2525			int start,
2526			int end,
2527			int direction)
2528{
2529    int result;
2530
2531    str = PyUnicode_FromObject(str);
2532    if (str == NULL)
2533	return -1;
2534    substr = PyUnicode_FromObject(substr);
2535    if (substr == NULL) {
2536	Py_DECREF(substr);
2537	return -1;
2538    }
2539
2540    result = tailmatch((PyUnicodeObject *)str,
2541		       (PyUnicodeObject *)substr,
2542		       start, end, direction);
2543    Py_DECREF(str);
2544    Py_DECREF(substr);
2545    return result;
2546}
2547
2548static
2549const Py_UNICODE *findchar(const Py_UNICODE *s,
2550		     int size,
2551		     Py_UNICODE ch)
2552{
2553    /* like wcschr, but doesn't stop at NULL characters */
2554
2555    while (size-- > 0) {
2556        if (*s == ch)
2557            return s;
2558        s++;
2559    }
2560
2561    return NULL;
2562}
2563
2564/* Apply fixfct filter to the Unicode object self and return a
2565   reference to the modified object */
2566
2567static
2568PyObject *fixup(PyUnicodeObject *self,
2569		int (*fixfct)(PyUnicodeObject *s))
2570{
2571
2572    PyUnicodeObject *u;
2573
2574    u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
2575    if (u == NULL)
2576	return NULL;
2577
2578    Py_UNICODE_COPY(u->str, self->str, self->length);
2579
2580    if (!fixfct(u)) {
2581	/* fixfct should return TRUE if it modified the buffer. If
2582	   FALSE, return a reference to the original buffer instead
2583	   (to save space, not time) */
2584	Py_INCREF(self);
2585	Py_DECREF(u);
2586	return (PyObject*) self;
2587    }
2588    return (PyObject*) u;
2589}
2590
2591static
2592int fixupper(PyUnicodeObject *self)
2593{
2594    int len = self->length;
2595    Py_UNICODE *s = self->str;
2596    int status = 0;
2597
2598    while (len-- > 0) {
2599	register Py_UNICODE ch;
2600
2601	ch = Py_UNICODE_TOUPPER(*s);
2602	if (ch != *s) {
2603            status = 1;
2604	    *s = ch;
2605	}
2606        s++;
2607    }
2608
2609    return status;
2610}
2611
2612static
2613int fixlower(PyUnicodeObject *self)
2614{
2615    int len = self->length;
2616    Py_UNICODE *s = self->str;
2617    int status = 0;
2618
2619    while (len-- > 0) {
2620	register Py_UNICODE ch;
2621
2622	ch = Py_UNICODE_TOLOWER(*s);
2623	if (ch != *s) {
2624            status = 1;
2625	    *s = ch;
2626	}
2627        s++;
2628    }
2629
2630    return status;
2631}
2632
2633static
2634int fixswapcase(PyUnicodeObject *self)
2635{
2636    int len = self->length;
2637    Py_UNICODE *s = self->str;
2638    int status = 0;
2639
2640    while (len-- > 0) {
2641        if (Py_UNICODE_ISUPPER(*s)) {
2642            *s = Py_UNICODE_TOLOWER(*s);
2643            status = 1;
2644        } else if (Py_UNICODE_ISLOWER(*s)) {
2645            *s = Py_UNICODE_TOUPPER(*s);
2646            status = 1;
2647        }
2648        s++;
2649    }
2650
2651    return status;
2652}
2653
2654static
2655int fixcapitalize(PyUnicodeObject *self)
2656{
2657    int len = self->length;
2658    Py_UNICODE *s = self->str;
2659    int status = 0;
2660
2661    if (len == 0)
2662	return 0;
2663    if (Py_UNICODE_ISLOWER(*s)) {
2664	*s = Py_UNICODE_TOUPPER(*s);
2665	status = 1;
2666    }
2667    s++;
2668    while (--len > 0) {
2669        if (Py_UNICODE_ISUPPER(*s)) {
2670            *s = Py_UNICODE_TOLOWER(*s);
2671            status = 1;
2672        }
2673        s++;
2674    }
2675    return status;
2676}
2677
2678static
2679int fixtitle(PyUnicodeObject *self)
2680{
2681    register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2682    register Py_UNICODE *e;
2683    int previous_is_cased;
2684
2685    /* Shortcut for single character strings */
2686    if (PyUnicode_GET_SIZE(self) == 1) {
2687	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2688	if (*p != ch) {
2689	    *p = ch;
2690	    return 1;
2691	}
2692	else
2693	    return 0;
2694    }
2695
2696    e = p + PyUnicode_GET_SIZE(self);
2697    previous_is_cased = 0;
2698    for (; p < e; p++) {
2699	register const Py_UNICODE ch = *p;
2700
2701	if (previous_is_cased)
2702	    *p = Py_UNICODE_TOLOWER(ch);
2703	else
2704	    *p = Py_UNICODE_TOTITLE(ch);
2705
2706	if (Py_UNICODE_ISLOWER(ch) ||
2707	    Py_UNICODE_ISUPPER(ch) ||
2708	    Py_UNICODE_ISTITLE(ch))
2709	    previous_is_cased = 1;
2710	else
2711	    previous_is_cased = 0;
2712    }
2713    return 1;
2714}
2715
2716PyObject *PyUnicode_Join(PyObject *separator,
2717			 PyObject *seq)
2718{
2719    Py_UNICODE *sep;
2720    int seplen;
2721    PyUnicodeObject *res = NULL;
2722    int reslen = 0;
2723    Py_UNICODE *p;
2724    int seqlen = 0;
2725    int sz = 100;
2726    int i;
2727    PyObject *it;
2728
2729    it = PyObject_GetIter(seq);
2730    if (it == NULL)
2731        return NULL;
2732
2733    if (separator == NULL) {
2734	Py_UNICODE blank = ' ';
2735	sep = &blank;
2736	seplen = 1;
2737    }
2738    else {
2739	separator = PyUnicode_FromObject(separator);
2740	if (separator == NULL)
2741	    goto onError;
2742	sep = PyUnicode_AS_UNICODE(separator);
2743	seplen = PyUnicode_GET_SIZE(separator);
2744    }
2745
2746    res = _PyUnicode_New(sz);
2747    if (res == NULL)
2748	goto onError;
2749    p = PyUnicode_AS_UNICODE(res);
2750    reslen = 0;
2751
2752    for (i = 0; ; ++i) {
2753	int itemlen;
2754	PyObject *item = PyIter_Next(it);
2755	if (item == NULL) {
2756	    if (PyErr_Occurred())
2757		goto onError;
2758	    break;
2759	}
2760	if (!PyUnicode_Check(item)) {
2761	    PyObject *v;
2762	    v = PyUnicode_FromObject(item);
2763	    Py_DECREF(item);
2764	    item = v;
2765	    if (item == NULL)
2766		goto onError;
2767	}
2768	itemlen = PyUnicode_GET_SIZE(item);
2769	while (reslen + itemlen + seplen >= sz) {
2770	    if (_PyUnicode_Resize(&res, sz*2))
2771		goto onError;
2772	    sz *= 2;
2773	    p = PyUnicode_AS_UNICODE(res) + reslen;
2774	}
2775	if (i > 0) {
2776	    Py_UNICODE_COPY(p, sep, seplen);
2777	    p += seplen;
2778	    reslen += seplen;
2779	}
2780	Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
2781	p += itemlen;
2782	reslen += itemlen;
2783	Py_DECREF(item);
2784    }
2785    if (_PyUnicode_Resize(&res, reslen))
2786	goto onError;
2787
2788    Py_XDECREF(separator);
2789    Py_DECREF(it);
2790    return (PyObject *)res;
2791
2792 onError:
2793    Py_XDECREF(separator);
2794    Py_XDECREF(res);
2795    Py_DECREF(it);
2796    return NULL;
2797}
2798
2799static
2800PyUnicodeObject *pad(PyUnicodeObject *self,
2801		     int left,
2802		     int right,
2803		     Py_UNICODE fill)
2804{
2805    PyUnicodeObject *u;
2806
2807    if (left < 0)
2808        left = 0;
2809    if (right < 0)
2810        right = 0;
2811
2812    if (left == 0 && right == 0) {
2813        Py_INCREF(self);
2814        return self;
2815    }
2816
2817    u = _PyUnicode_New(left + self->length + right);
2818    if (u) {
2819        if (left)
2820            Py_UNICODE_FILL(u->str, fill, left);
2821        Py_UNICODE_COPY(u->str + left, self->str, self->length);
2822        if (right)
2823            Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2824    }
2825
2826    return u;
2827}
2828
/* Append a new Unicode object holding data[left:right] to `list`.

   The expansion relies on names from the calling function: a
   `PyObject *str` scratch variable, the target `list`, and an
   `onError` label that is jumped to when creating or appending the
   object fails.  The temporary reference in `str` is always released.

   Wrapped in do { } while (0) so that the macro behaves like a single
   statement; invoke it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left),                    \
                                    (right) - (left));                  \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
2839
2840static
2841PyObject *split_whitespace(PyUnicodeObject *self,
2842			   PyObject *list,
2843			   int maxcount)
2844{
2845    register int i;
2846    register int j;
2847    int len = self->length;
2848    PyObject *str;
2849
2850    for (i = j = 0; i < len; ) {
2851	/* find a token */
2852	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2853	    i++;
2854	j = i;
2855	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2856	    i++;
2857	if (j < i) {
2858	    if (maxcount-- <= 0)
2859		break;
2860	    SPLIT_APPEND(self->str, j, i);
2861	    while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2862		i++;
2863	    j = i;
2864	}
2865    }
2866    if (j < len) {
2867	SPLIT_APPEND(self->str, j, len);
2868    }
2869    return list;
2870
2871 onError:
2872    Py_DECREF(list);
2873    return NULL;
2874}
2875
2876PyObject *PyUnicode_Splitlines(PyObject *string,
2877			       int keepends)
2878{
2879    register int i;
2880    register int j;
2881    int len;
2882    PyObject *list;
2883    PyObject *str;
2884    Py_UNICODE *data;
2885
2886    string = PyUnicode_FromObject(string);
2887    if (string == NULL)
2888	return NULL;
2889    data = PyUnicode_AS_UNICODE(string);
2890    len = PyUnicode_GET_SIZE(string);
2891
2892    list = PyList_New(0);
2893    if (!list)
2894        goto onError;
2895
2896    for (i = j = 0; i < len; ) {
2897	int eol;
2898
2899	/* Find a line and append it */
2900	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2901	    i++;
2902
2903	/* Skip the line break reading CRLF as one line break */
2904	eol = i;
2905	if (i < len) {
2906	    if (data[i] == '\r' && i + 1 < len &&
2907		data[i+1] == '\n')
2908		i += 2;
2909	    else
2910		i++;
2911	    if (keepends)
2912		eol = i;
2913	}
2914	SPLIT_APPEND(data, j, eol);
2915	j = i;
2916    }
2917    if (j < len) {
2918	SPLIT_APPEND(data, j, len);
2919    }
2920
2921    Py_DECREF(string);
2922    return list;
2923
2924 onError:
2925    Py_DECREF(list);
2926    Py_DECREF(string);
2927    return NULL;
2928}
2929
2930static
2931PyObject *split_char(PyUnicodeObject *self,
2932		     PyObject *list,
2933		     Py_UNICODE ch,
2934		     int maxcount)
2935{
2936    register int i;
2937    register int j;
2938    int len = self->length;
2939    PyObject *str;
2940
2941    for (i = j = 0; i < len; ) {
2942	if (self->str[i] == ch) {
2943	    if (maxcount-- <= 0)
2944		break;
2945	    SPLIT_APPEND(self->str, j, i);
2946	    i = j = i + 1;
2947	} else
2948	    i++;
2949    }
2950    if (j <= len) {
2951	SPLIT_APPEND(self->str, j, len);
2952    }
2953    return list;
2954
2955 onError:
2956    Py_DECREF(list);
2957    return NULL;
2958}
2959
2960static
2961PyObject *split_substring(PyUnicodeObject *self,
2962			  PyObject *list,
2963			  PyUnicodeObject *substring,
2964			  int maxcount)
2965{
2966    register int i;
2967    register int j;
2968    int len = self->length;
2969    int sublen = substring->length;
2970    PyObject *str;
2971
2972    for (i = j = 0; i <= len - sublen; ) {
2973	if (Py_UNICODE_MATCH(self, i, substring)) {
2974	    if (maxcount-- <= 0)
2975		break;
2976	    SPLIT_APPEND(self->str, j, i);
2977	    i = j = i + sublen;
2978	} else
2979	    i++;
2980    }
2981    if (j <= len) {
2982	SPLIT_APPEND(self->str, j, len);
2983    }
2984    return list;
2985
2986 onError:
2987    Py_DECREF(list);
2988    return NULL;
2989}
2990
2991#undef SPLIT_APPEND
2992
2993static
2994PyObject *split(PyUnicodeObject *self,
2995		PyUnicodeObject *substring,
2996		int maxcount)
2997{
2998    PyObject *list;
2999
3000    if (maxcount < 0)
3001        maxcount = INT_MAX;
3002
3003    list = PyList_New(0);
3004    if (!list)
3005        return NULL;
3006
3007    if (substring == NULL)
3008	return split_whitespace(self,list,maxcount);
3009
3010    else if (substring->length == 1)
3011	return split_char(self,list,substring->str[0],maxcount);
3012
3013    else if (substring->length == 0) {
3014	Py_DECREF(list);
3015	PyErr_SetString(PyExc_ValueError, "empty separator");
3016	return NULL;
3017    }
3018    else
3019	return split_substring(self,list,substring,maxcount);
3020}
3021
3022static
3023PyObject *strip(PyUnicodeObject *self,
3024		int left,
3025		int right)
3026{
3027    Py_UNICODE *p = self->str;
3028    int start = 0;
3029    int end = self->length;
3030
3031    if (left)
3032        while (start < end && Py_UNICODE_ISSPACE(p[start]))
3033            start++;
3034
3035    if (right)
3036        while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3037            end--;
3038
3039    if (start == 0 && end == self->length) {
3040        /* couldn't strip anything off, return original string */
3041        Py_INCREF(self);
3042        return (PyObject*) self;
3043    }
3044
3045    return (PyObject*) PyUnicode_FromUnicode(
3046        self->str + start,
3047        end - start
3048        );
3049}
3050
3051static
3052PyObject *replace(PyUnicodeObject *self,
3053		  PyUnicodeObject *str1,
3054		  PyUnicodeObject *str2,
3055		  int maxcount)
3056{
3057    PyUnicodeObject *u;
3058
3059    if (maxcount < 0)
3060	maxcount = INT_MAX;
3061
3062    if (str1->length == 1 && str2->length == 1) {
3063        int i;
3064
3065        /* replace characters */
3066        if (!findchar(self->str, self->length, str1->str[0])) {
3067            /* nothing to replace, return original string */
3068            Py_INCREF(self);
3069            u = self;
3070        } else {
3071	    Py_UNICODE u1 = str1->str[0];
3072	    Py_UNICODE u2 = str2->str[0];
3073
3074            u = (PyUnicodeObject*) PyUnicode_FromUnicode(
3075                NULL,
3076                self->length
3077                );
3078            if (u != NULL) {
3079		Py_UNICODE_COPY(u->str, self->str,
3080				self->length);
3081                for (i = 0; i < u->length; i++)
3082                    if (u->str[i] == u1) {
3083                        if (--maxcount < 0)
3084                            break;
3085                        u->str[i] = u2;
3086                    }
3087        }
3088        }
3089
3090    } else {
3091        int n, i;
3092        Py_UNICODE *p;
3093
3094        /* replace strings */
3095        n = count(self, 0, self->length, str1);
3096        if (n > maxcount)
3097            n = maxcount;
3098        if (n == 0) {
3099            /* nothing to replace, return original string */
3100            Py_INCREF(self);
3101            u = self;
3102        } else {
3103            u = _PyUnicode_New(
3104                self->length + n * (str2->length - str1->length));
3105            if (u) {
3106                i = 0;
3107                p = u->str;
3108                while (i <= self->length - str1->length)
3109                    if (Py_UNICODE_MATCH(self, i, str1)) {
3110                        /* replace string segment */
3111                        Py_UNICODE_COPY(p, str2->str, str2->length);
3112                        p += str2->length;
3113                        i += str1->length;
3114                        if (--n <= 0) {
3115                            /* copy remaining part */
3116                            Py_UNICODE_COPY(p, self->str+i, self->length-i);
3117                            break;
3118                        }
3119                    } else
3120                        *p++ = self->str[i++];
3121            }
3122        }
3123    }
3124
3125    return (PyObject *) u;
3126}
3127
3128/* --- Unicode Object Methods --------------------------------------------- */
3129
3130static char title__doc__[] =
3131"S.title() -> unicode\n\
3132\n\
3133Return a titlecased version of S, i.e. words start with title case\n\
3134characters, all remaining cased characters have lower case.";
3135
3136static PyObject*
3137unicode_title(PyUnicodeObject *self, PyObject *args)
3138{
3139    if (!PyArg_NoArgs(args))
3140        return NULL;
3141    return fixup(self, fixtitle);
3142}
3143
3144static char capitalize__doc__[] =
3145"S.capitalize() -> unicode\n\
3146\n\
3147Return a capitalized version of S, i.e. make the first character\n\
3148have upper case.";
3149
3150static PyObject*
3151unicode_capitalize(PyUnicodeObject *self, PyObject *args)
3152{
3153    if (!PyArg_NoArgs(args))
3154        return NULL;
3155    return fixup(self, fixcapitalize);
3156}
3157
#if 0
/* Disabled: this method is compiled out. */
static char capwords__doc__[] =
    "S.capwords() -> unicode\n"
    "\n"
    "Apply .capitalize() to all words in S and return the result with\n"
    "normalized whitespace (all whitespace strings are replaced by ' ').";

/* S.capwords(): split at whitespace, capitalize each word via
   fixup(), and join the words again with single spaces. */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *word;
    PyObject *result = NULL;
    int i;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entries in place */
    for (i = 0; i < PyList_GET_SIZE(words); i++) {
        word = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, i),
                     fixcapitalize);
        if (word == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, i));
        PyList_SET_ITEM(words, i, word);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
3198
3199static char center__doc__[] =
3200"S.center(width) -> unicode\n\
3201\n\
3202Return S centered in a Unicode string of length width. Padding is done\n\
3203using spaces.";
3204
3205static PyObject *
3206unicode_center(PyUnicodeObject *self, PyObject *args)
3207{
3208    int marg, left;
3209    int width;
3210
3211    if (!PyArg_ParseTuple(args, "i:center", &width))
3212        return NULL;
3213
3214    if (self->length >= width) {
3215        Py_INCREF(self);
3216        return (PyObject*) self;
3217    }
3218
3219    marg = width - self->length;
3220    left = marg / 2 + (marg & width & 1);
3221
3222    return (PyObject*) pad(self, left, marg - left, ' ');
3223}
3224
3225#if 0
3226
3227/* This code should go into some future Unicode collation support
3228   module. The basic comparison should compare ordinals on a naive
3229   basis (this is what Java does and thus JPython too). */
3230
3231/* speedy UTF-16 code point order comparison */
3232/* gleaned from: */
3233/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3234
3235static short utf16Fixup[32] =
3236{
3237    0, 0, 0, 0, 0, 0, 0, 0,
3238    0, 0, 0, 0, 0, 0, 0, 0,
3239    0, 0, 0, 0, 0, 0, 0, 0,
3240    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
3241};
3242
3243static int
3244unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3245{
3246    int len1, len2;
3247
3248    Py_UNICODE *s1 = str1->str;
3249    Py_UNICODE *s2 = str2->str;
3250
3251    len1 = str1->length;
3252    len2 = str2->length;
3253
3254    while (len1 > 0 && len2 > 0) {
3255        Py_UNICODE c1, c2;
3256	long diff;
3257
3258        c1 = *s1++;
3259        c2 = *s2++;
3260	if (c1 > (1<<11) * 26)
3261	    c1 += utf16Fixup[c1>>11];
3262	if (c2 > (1<<11) * 26)
3263            c2 += utf16Fixup[c2>>11];
3264
3265        /* now c1 and c2 are in UTF-32-compatible order */
3266        diff = (long)c1 - (long)c2;
3267        if (diff)
3268            return (diff < 0) ? -1 : (diff != 0);
3269        len1--; len2--;
3270    }
3271
3272    return (len1 < len2) ? -1 : (len1 != len2);
3273}
3274
3275#else
3276
3277static int
3278unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3279{
3280    register int len1, len2;
3281
3282    Py_UNICODE *s1 = str1->str;
3283    Py_UNICODE *s2 = str2->str;
3284
3285    len1 = str1->length;
3286    len2 = str2->length;
3287
3288    while (len1 > 0 && len2 > 0) {
3289	register long diff;
3290
3291        diff = (long)*s1++ - (long)*s2++;
3292        if (diff)
3293            return (diff < 0) ? -1 : (diff != 0);
3294        len1--; len2--;
3295    }
3296
3297    return (len1 < len2) ? -1 : (len1 != len2);
3298}
3299
3300#endif
3301
3302int PyUnicode_Compare(PyObject *left,
3303		      PyObject *right)
3304{
3305    PyUnicodeObject *u = NULL, *v = NULL;
3306    int result;
3307
3308    /* Coerce the two arguments */
3309    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3310    if (u == NULL)
3311	goto onError;
3312    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3313    if (v == NULL)
3314	goto onError;
3315
3316    /* Shortcut for empty or interned objects */
3317    if (v == u) {
3318	Py_DECREF(u);
3319	Py_DECREF(v);
3320	return 0;
3321    }
3322
3323    result = unicode_compare(u, v);
3324
3325    Py_DECREF(u);
3326    Py_DECREF(v);
3327    return result;
3328
3329onError:
3330    Py_XDECREF(u);
3331    Py_XDECREF(v);
3332    return -1;
3333}
3334
3335int PyUnicode_Contains(PyObject *container,
3336		       PyObject *element)
3337{
3338    PyUnicodeObject *u = NULL, *v = NULL;
3339    int result;
3340    register const Py_UNICODE *p, *e;
3341    register Py_UNICODE ch;
3342
3343    /* Coerce the two arguments */
3344    v = (PyUnicodeObject *)PyUnicode_FromObject(element);
3345    if (v == NULL) {
3346	PyErr_SetString(PyExc_TypeError,
3347	    "'in <string>' requires character as left operand");
3348	goto onError;
3349    }
3350    u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3351    if (u == NULL) {
3352	Py_DECREF(v);
3353	goto onError;
3354    }
3355
3356    /* Check v in u */
3357    if (PyUnicode_GET_SIZE(v) != 1) {
3358	PyErr_SetString(PyExc_TypeError,
3359	    "'in <string>' requires character as left operand");
3360	goto onError;
3361    }
3362    ch = *PyUnicode_AS_UNICODE(v);
3363    p = PyUnicode_AS_UNICODE(u);
3364    e = p + PyUnicode_GET_SIZE(u);
3365    result = 0;
3366    while (p < e) {
3367	if (*p++ == ch) {
3368	    result = 1;
3369	    break;
3370	}
3371    }
3372
3373    Py_DECREF(u);
3374    Py_DECREF(v);
3375    return result;
3376
3377onError:
3378    Py_XDECREF(u);
3379    Py_XDECREF(v);
3380    return -1;
3381}
3382
3383/* Concat to string or Unicode object giving a new Unicode object. */
3384
3385PyObject *PyUnicode_Concat(PyObject *left,
3386			   PyObject *right)
3387{
3388    PyUnicodeObject *u = NULL, *v = NULL, *w;
3389
3390    /* Coerce the two arguments */
3391    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3392    if (u == NULL)
3393	goto onError;
3394    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3395    if (v == NULL)
3396	goto onError;
3397
3398    /* Shortcuts */
3399    if (v == unicode_empty) {
3400	Py_DECREF(v);
3401	return (PyObject *)u;
3402    }
3403    if (u == unicode_empty) {
3404	Py_DECREF(u);
3405	return (PyObject *)v;
3406    }
3407
3408    /* Concat the two Unicode strings */
3409    w = _PyUnicode_New(u->length + v->length);
3410    if (w == NULL)
3411	goto onError;
3412    Py_UNICODE_COPY(w->str, u->str, u->length);
3413    Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3414
3415    Py_DECREF(u);
3416    Py_DECREF(v);
3417    return (PyObject *)w;
3418
3419onError:
3420    Py_XDECREF(u);
3421    Py_XDECREF(v);
3422    return NULL;
3423}
3424
3425static char count__doc__[] =
3426"S.count(sub[, start[, end]]) -> int\n\
3427\n\
3428Return the number of occurrences of substring sub in Unicode string\n\
3429S[start:end].  Optional arguments start and end are\n\
3430interpreted as in slice notation.";
3431
3432static PyObject *
3433unicode_count(PyUnicodeObject *self, PyObject *args)
3434{
3435    PyUnicodeObject *substring;
3436    int start = 0;
3437    int end = INT_MAX;
3438    PyObject *result;
3439
3440    if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3441		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3442        return NULL;
3443
3444    substring = (PyUnicodeObject *)PyUnicode_FromObject(
3445						(PyObject *)substring);
3446    if (substring == NULL)
3447	return NULL;
3448
3449    if (start < 0)
3450        start += self->length;
3451    if (start < 0)
3452        start = 0;
3453    if (end > self->length)
3454        end = self->length;
3455    if (end < 0)
3456        end += self->length;
3457    if (end < 0)
3458        end = 0;
3459
3460    result = PyInt_FromLong((long) count(self, start, end, substring));
3461
3462    Py_DECREF(substring);
3463    return result;
3464}
3465
3466static char encode__doc__[] =
3467"S.encode([encoding[,errors]]) -> string\n\
3468\n\
3469Return an encoded string version of S. Default encoding is the current\n\
3470default string encoding. errors may be given to set a different error\n\
3471handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3472a ValueError. Other possible values are 'ignore' and 'replace'.";
3473
3474static PyObject *
3475unicode_encode(PyUnicodeObject *self, PyObject *args)
3476{
3477    char *encoding = NULL;
3478    char *errors = NULL;
3479    if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3480        return NULL;
3481    return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3482}
3483
3484static char expandtabs__doc__[] =
3485"S.expandtabs([tabsize]) -> unicode\n\
3486\n\
3487Return a copy of S where all tab characters are expanded using spaces.\n\
3488If tabsize is not given, a tab size of 8 characters is assumed.";
3489
3490static PyObject*
3491unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3492{
3493    Py_UNICODE *e;
3494    Py_UNICODE *p;
3495    Py_UNICODE *q;
3496    int i, j;
3497    PyUnicodeObject *u;
3498    int tabsize = 8;
3499
3500    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3501	return NULL;
3502
3503    /* First pass: determine size of output string */
3504    i = j = 0;
3505    e = self->str + self->length;
3506    for (p = self->str; p < e; p++)
3507        if (*p == '\t') {
3508	    if (tabsize > 0)
3509		j += tabsize - (j % tabsize);
3510	}
3511        else {
3512            j++;
3513            if (*p == '\n' || *p == '\r') {
3514                i += j;
3515                j = 0;
3516            }
3517        }
3518
3519    /* Second pass: create output string and fill it */
3520    u = _PyUnicode_New(i + j);
3521    if (!u)
3522        return NULL;
3523
3524    j = 0;
3525    q = u->str;
3526
3527    for (p = self->str; p < e; p++)
3528        if (*p == '\t') {
3529	    if (tabsize > 0) {
3530		i = tabsize - (j % tabsize);
3531		j += i;
3532		while (i--)
3533		    *q++ = ' ';
3534	    }
3535	}
3536	else {
3537            j++;
3538	    *q++ = *p;
3539            if (*p == '\n' || *p == '\r')
3540                j = 0;
3541        }
3542
3543    return (PyObject*) u;
3544}
3545
3546static char find__doc__[] =
3547"S.find(sub [,start [,end]]) -> int\n\
3548\n\
3549Return the lowest index in S where substring sub is found,\n\
3550such that sub is contained within s[start,end].  Optional\n\
3551arguments start and end are interpreted as in slice notation.\n\
3552\n\
3553Return -1 on failure.";
3554
3555static PyObject *
3556unicode_find(PyUnicodeObject *self, PyObject *args)
3557{
3558    PyUnicodeObject *substring;
3559    int start = 0;
3560    int end = INT_MAX;
3561    PyObject *result;
3562
3563    if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3564		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3565        return NULL;
3566    substring = (PyUnicodeObject *)PyUnicode_FromObject(
3567						(PyObject *)substring);
3568    if (substring == NULL)
3569	return NULL;
3570
3571    result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3572
3573    Py_DECREF(substring);
3574    return result;
3575}
3576
3577static PyObject *
3578unicode_getitem(PyUnicodeObject *self, int index)
3579{
3580    if (index < 0 || index >= self->length) {
3581        PyErr_SetString(PyExc_IndexError, "string index out of range");
3582        return NULL;
3583    }
3584
3585    return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3586}
3587
3588static long
3589unicode_hash(PyUnicodeObject *self)
3590{
3591    /* Since Unicode objects compare equal to their ASCII string
3592       counterparts, they should use the individual character values
3593       as basis for their hash value.  This is needed to assure that
3594       strings and Unicode objects behave in the same way as
3595       dictionary keys. */
3596
3597    register int len;
3598    register Py_UNICODE *p;
3599    register long x;
3600
3601    if (self->hash != -1)
3602	return self->hash;
3603    len = PyUnicode_GET_SIZE(self);
3604    p = PyUnicode_AS_UNICODE(self);
3605    x = *p << 7;
3606    while (--len >= 0)
3607	x = (1000003*x) ^ *p++;
3608    x ^= PyUnicode_GET_SIZE(self);
3609    if (x == -1)
3610	x = -2;
3611    self->hash = x;
3612    return x;
3613}
3614
3615static char index__doc__[] =
3616"S.index(sub [,start [,end]]) -> int\n\
3617\n\
3618Like S.find() but raise ValueError when the substring is not found.";
3619
3620static PyObject *
3621unicode_index(PyUnicodeObject *self, PyObject *args)
3622{
3623    int result;
3624    PyUnicodeObject *substring;
3625    int start = 0;
3626    int end = INT_MAX;
3627
3628    if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3629		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3630        return NULL;
3631
3632    substring = (PyUnicodeObject *)PyUnicode_FromObject(
3633						(PyObject *)substring);
3634    if (substring == NULL)
3635	return NULL;
3636
3637    result = findstring(self, substring, start, end, 1);
3638
3639    Py_DECREF(substring);
3640    if (result < 0) {
3641        PyErr_SetString(PyExc_ValueError, "substring not found");
3642        return NULL;
3643    }
3644    return PyInt_FromLong(result);
3645}
3646
3647static char islower__doc__[] =
3648"S.islower() -> int\n\
3649\n\
3650Return 1 if  all cased characters in S are lowercase and there is\n\
3651at least one cased character in S, 0 otherwise.";
3652
3653static PyObject*
3654unicode_islower(PyUnicodeObject *self, PyObject *args)
3655{
3656    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3657    register const Py_UNICODE *e;
3658    int cased;
3659
3660    if (!PyArg_NoArgs(args))
3661        return NULL;
3662
3663    /* Shortcut for single character strings */
3664    if (PyUnicode_GET_SIZE(self) == 1)
3665	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3666
3667    /* Special case for empty strings */
3668    if (PyString_GET_SIZE(self) == 0)
3669	return PyInt_FromLong(0);
3670
3671    e = p + PyUnicode_GET_SIZE(self);
3672    cased = 0;
3673    for (; p < e; p++) {
3674	register const Py_UNICODE ch = *p;
3675
3676	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3677	    return PyInt_FromLong(0);
3678	else if (!cased && Py_UNICODE_ISLOWER(ch))
3679	    cased = 1;
3680    }
3681    return PyInt_FromLong(cased);
3682}
3683
3684static char isupper__doc__[] =
3685"S.isupper() -> int\n\
3686\n\
3687Return 1 if  all cased characters in S are uppercase and there is\n\
3688at least one cased character in S, 0 otherwise.";
3689
3690static PyObject*
3691unicode_isupper(PyUnicodeObject *self, PyObject *args)
3692{
3693    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3694    register const Py_UNICODE *e;
3695    int cased;
3696
3697    if (!PyArg_NoArgs(args))
3698        return NULL;
3699
3700    /* Shortcut for single character strings */
3701    if (PyUnicode_GET_SIZE(self) == 1)
3702	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3703
3704    /* Special case for empty strings */
3705    if (PyString_GET_SIZE(self) == 0)
3706	return PyInt_FromLong(0);
3707
3708    e = p + PyUnicode_GET_SIZE(self);
3709    cased = 0;
3710    for (; p < e; p++) {
3711	register const Py_UNICODE ch = *p;
3712
3713	if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3714	    return PyInt_FromLong(0);
3715	else if (!cased && Py_UNICODE_ISUPPER(ch))
3716	    cased = 1;
3717    }
3718    return PyInt_FromLong(cased);
3719}
3720
3721static char istitle__doc__[] =
3722"S.istitle() -> int\n\
3723\n\
3724Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3725may only follow uncased characters and lowercase characters only cased\n\
3726ones. Return 0 otherwise.";
3727
3728static PyObject*
3729unicode_istitle(PyUnicodeObject *self, PyObject *args)
3730{
3731    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3732    register const Py_UNICODE *e;
3733    int cased, previous_is_cased;
3734
3735    if (!PyArg_NoArgs(args))
3736        return NULL;
3737
3738    /* Shortcut for single character strings */
3739    if (PyUnicode_GET_SIZE(self) == 1)
3740	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3741			      (Py_UNICODE_ISUPPER(*p) != 0));
3742
3743    /* Special case for empty strings */
3744    if (PyString_GET_SIZE(self) == 0)
3745	return PyInt_FromLong(0);
3746
3747    e = p + PyUnicode_GET_SIZE(self);
3748    cased = 0;
3749    previous_is_cased = 0;
3750    for (; p < e; p++) {
3751	register const Py_UNICODE ch = *p;
3752
3753	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3754	    if (previous_is_cased)
3755		return PyInt_FromLong(0);
3756	    previous_is_cased = 1;
3757	    cased = 1;
3758	}
3759	else if (Py_UNICODE_ISLOWER(ch)) {
3760	    if (!previous_is_cased)
3761		return PyInt_FromLong(0);
3762	    previous_is_cased = 1;
3763	    cased = 1;
3764	}
3765	else
3766	    previous_is_cased = 0;
3767    }
3768    return PyInt_FromLong(cased);
3769}
3770
3771static char isspace__doc__[] =
3772"S.isspace() -> int\n\
3773\n\
3774Return 1 if there are only whitespace characters in S,\n\
37750 otherwise.";
3776
3777static PyObject*
3778unicode_isspace(PyUnicodeObject *self, PyObject *args)
3779{
3780    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3781    register const Py_UNICODE *e;
3782
3783    if (!PyArg_NoArgs(args))
3784        return NULL;
3785
3786    /* Shortcut for single character strings */
3787    if (PyUnicode_GET_SIZE(self) == 1 &&
3788	Py_UNICODE_ISSPACE(*p))
3789	return PyInt_FromLong(1);
3790
3791    /* Special case for empty strings */
3792    if (PyString_GET_SIZE(self) == 0)
3793	return PyInt_FromLong(0);
3794
3795    e = p + PyUnicode_GET_SIZE(self);
3796    for (; p < e; p++) {
3797	if (!Py_UNICODE_ISSPACE(*p))
3798	    return PyInt_FromLong(0);
3799    }
3800    return PyInt_FromLong(1);
3801}
3802
3803static char isalpha__doc__[] =
3804"S.isalpha() -> int\n\
3805\n\
3806Return 1 if  all characters in S are alphabetic\n\
3807and there is at least one character in S, 0 otherwise.";
3808
3809static PyObject*
3810unicode_isalpha(PyUnicodeObject *self, PyObject *args)
3811{
3812    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3813    register const Py_UNICODE *e;
3814
3815    if (!PyArg_NoArgs(args))
3816        return NULL;
3817
3818    /* Shortcut for single character strings */
3819    if (PyUnicode_GET_SIZE(self) == 1 &&
3820	Py_UNICODE_ISALPHA(*p))
3821	return PyInt_FromLong(1);
3822
3823    /* Special case for empty strings */
3824    if (PyString_GET_SIZE(self) == 0)
3825	return PyInt_FromLong(0);
3826
3827    e = p + PyUnicode_GET_SIZE(self);
3828    for (; p < e; p++) {
3829	if (!Py_UNICODE_ISALPHA(*p))
3830	    return PyInt_FromLong(0);
3831    }
3832    return PyInt_FromLong(1);
3833}
3834
3835static char isalnum__doc__[] =
3836"S.isalnum() -> int\n\
3837\n\
3838Return 1 if  all characters in S are alphanumeric\n\
3839and there is at least one character in S, 0 otherwise.";
3840
3841static PyObject*
3842unicode_isalnum(PyUnicodeObject *self, PyObject *args)
3843{
3844    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3845    register const Py_UNICODE *e;
3846
3847    if (!PyArg_NoArgs(args))
3848        return NULL;
3849
3850    /* Shortcut for single character strings */
3851    if (PyUnicode_GET_SIZE(self) == 1 &&
3852	Py_UNICODE_ISALNUM(*p))
3853	return PyInt_FromLong(1);
3854
3855    /* Special case for empty strings */
3856    if (PyString_GET_SIZE(self) == 0)
3857	return PyInt_FromLong(0);
3858
3859    e = p + PyUnicode_GET_SIZE(self);
3860    for (; p < e; p++) {
3861	if (!Py_UNICODE_ISALNUM(*p))
3862	    return PyInt_FromLong(0);
3863    }
3864    return PyInt_FromLong(1);
3865}
3866
3867static char isdecimal__doc__[] =
3868"S.isdecimal() -> int\n\
3869\n\
3870Return 1 if there are only decimal characters in S,\n\
38710 otherwise.";
3872
3873static PyObject*
3874unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3875{
3876    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3877    register const Py_UNICODE *e;
3878
3879    if (!PyArg_NoArgs(args))
3880        return NULL;
3881
3882    /* Shortcut for single character strings */
3883    if (PyUnicode_GET_SIZE(self) == 1 &&
3884	Py_UNICODE_ISDECIMAL(*p))
3885	return PyInt_FromLong(1);
3886
3887    /* Special case for empty strings */
3888    if (PyString_GET_SIZE(self) == 0)
3889	return PyInt_FromLong(0);
3890
3891    e = p + PyUnicode_GET_SIZE(self);
3892    for (; p < e; p++) {
3893	if (!Py_UNICODE_ISDECIMAL(*p))
3894	    return PyInt_FromLong(0);
3895    }
3896    return PyInt_FromLong(1);
3897}
3898
3899static char isdigit__doc__[] =
3900"S.isdigit() -> int\n\
3901\n\
3902Return 1 if there are only digit characters in S,\n\
39030 otherwise.";
3904
3905static PyObject*
3906unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3907{
3908    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3909    register const Py_UNICODE *e;
3910
3911    if (!PyArg_NoArgs(args))
3912        return NULL;
3913
3914    /* Shortcut for single character strings */
3915    if (PyUnicode_GET_SIZE(self) == 1 &&
3916	Py_UNICODE_ISDIGIT(*p))
3917	return PyInt_FromLong(1);
3918
3919    /* Special case for empty strings */
3920    if (PyString_GET_SIZE(self) == 0)
3921	return PyInt_FromLong(0);
3922
3923    e = p + PyUnicode_GET_SIZE(self);
3924    for (; p < e; p++) {
3925	if (!Py_UNICODE_ISDIGIT(*p))
3926	    return PyInt_FromLong(0);
3927    }
3928    return PyInt_FromLong(1);
3929}
3930
3931static char isnumeric__doc__[] =
3932"S.isnumeric() -> int\n\
3933\n\
3934Return 1 if there are only numeric characters in S,\n\
39350 otherwise.";
3936
3937static PyObject*
3938unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3939{
3940    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3941    register const Py_UNICODE *e;
3942
3943    if (!PyArg_NoArgs(args))
3944        return NULL;
3945
3946    /* Shortcut for single character strings */
3947    if (PyUnicode_GET_SIZE(self) == 1 &&
3948	Py_UNICODE_ISNUMERIC(*p))
3949	return PyInt_FromLong(1);
3950
3951    /* Special case for empty strings */
3952    if (PyString_GET_SIZE(self) == 0)
3953	return PyInt_FromLong(0);
3954
3955    e = p + PyUnicode_GET_SIZE(self);
3956    for (; p < e; p++) {
3957	if (!Py_UNICODE_ISNUMERIC(*p))
3958	    return PyInt_FromLong(0);
3959    }
3960    return PyInt_FromLong(1);
3961}
3962
3963static char join__doc__[] =
3964"S.join(sequence) -> unicode\n\
3965\n\
3966Return a string which is the concatenation of the strings in the\n\
3967sequence.  The separator between elements is S.";
3968
3969static PyObject*
3970unicode_join(PyUnicodeObject *self, PyObject *args)
3971{
3972    PyObject *data;
3973    if (!PyArg_ParseTuple(args, "O:join", &data))
3974        return NULL;
3975
3976    return PyUnicode_Join((PyObject *)self, data);
3977}
3978
3979static int
3980unicode_length(PyUnicodeObject *self)
3981{
3982    return self->length;
3983}
3984
3985static char ljust__doc__[] =
3986"S.ljust(width) -> unicode\n\
3987\n\
3988Return S left justified in a Unicode string of length width. Padding is\n\
3989done using spaces.";
3990
3991static PyObject *
3992unicode_ljust(PyUnicodeObject *self, PyObject *args)
3993{
3994    int width;
3995    if (!PyArg_ParseTuple(args, "i:ljust", &width))
3996        return NULL;
3997
3998    if (self->length >= width) {
3999        Py_INCREF(self);
4000        return (PyObject*) self;
4001    }
4002
4003    return (PyObject*) pad(self, 0, width - self->length, ' ');
4004}
4005
4006static char lower__doc__[] =
4007"S.lower() -> unicode\n\
4008\n\
4009Return a copy of the string S converted to lowercase.";
4010
4011static PyObject*
4012unicode_lower(PyUnicodeObject *self, PyObject *args)
4013{
4014    if (!PyArg_NoArgs(args))
4015        return NULL;
4016    return fixup(self, fixlower);
4017}
4018
4019static char lstrip__doc__[] =
4020"S.lstrip() -> unicode\n\
4021\n\
4022Return a copy of the string S with leading whitespace removed.";
4023
4024static PyObject *
4025unicode_lstrip(PyUnicodeObject *self, PyObject *args)
4026{
4027    if (!PyArg_NoArgs(args))
4028        return NULL;
4029    return strip(self, 1, 0);
4030}
4031
4032static PyObject*
4033unicode_repeat(PyUnicodeObject *str, int len)
4034{
4035    PyUnicodeObject *u;
4036    Py_UNICODE *p;
4037    int nchars;
4038    size_t nbytes;
4039
4040    if (len < 0)
4041        len = 0;
4042
4043    if (len == 1) {
4044        /* no repeat, return original string */
4045        Py_INCREF(str);
4046        return (PyObject*) str;
4047    }
4048
4049    /* ensure # of chars needed doesn't overflow int and # of bytes
4050     * needed doesn't overflow size_t
4051     */
4052    nchars = len * str->length;
4053    if (len && nchars / len != str->length) {
4054        PyErr_SetString(PyExc_OverflowError,
4055                        "repeated string is too long");
4056        return NULL;
4057    }
4058    nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4059    if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4060        PyErr_SetString(PyExc_OverflowError,
4061                        "repeated string is too long");
4062        return NULL;
4063    }
4064    u = _PyUnicode_New(nchars);
4065    if (!u)
4066        return NULL;
4067
4068    p = u->str;
4069
4070    while (len-- > 0) {
4071        Py_UNICODE_COPY(p, str->str, str->length);
4072        p += str->length;
4073    }
4074
4075    return (PyObject*) u;
4076}
4077
4078PyObject *PyUnicode_Replace(PyObject *obj,
4079			    PyObject *subobj,
4080			    PyObject *replobj,
4081			    int maxcount)
4082{
4083    PyObject *self;
4084    PyObject *str1;
4085    PyObject *str2;
4086    PyObject *result;
4087
4088    self = PyUnicode_FromObject(obj);
4089    if (self == NULL)
4090	return NULL;
4091    str1 = PyUnicode_FromObject(subobj);
4092    if (str1 == NULL) {
4093	Py_DECREF(self);
4094	return NULL;
4095    }
4096    str2 = PyUnicode_FromObject(replobj);
4097    if (str2 == NULL) {
4098	Py_DECREF(self);
4099	Py_DECREF(str1);
4100	return NULL;
4101    }
4102    result = replace((PyUnicodeObject *)self,
4103		     (PyUnicodeObject *)str1,
4104		     (PyUnicodeObject *)str2,
4105		     maxcount);
4106    Py_DECREF(self);
4107    Py_DECREF(str1);
4108    Py_DECREF(str2);
4109    return result;
4110}
4111
4112static char replace__doc__[] =
4113"S.replace (old, new[, maxsplit]) -> unicode\n\
4114\n\
4115Return a copy of S with all occurrences of substring\n\
4116old replaced by new.  If the optional argument maxsplit is\n\
4117given, only the first maxsplit occurrences are replaced.";
4118
4119static PyObject*
4120unicode_replace(PyUnicodeObject *self, PyObject *args)
4121{
4122    PyUnicodeObject *str1;
4123    PyUnicodeObject *str2;
4124    int maxcount = -1;
4125    PyObject *result;
4126
4127    if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4128        return NULL;
4129    str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4130    if (str1 == NULL)
4131	return NULL;
4132    str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4133    if (str2 == NULL)
4134	return NULL;
4135
4136    result = replace(self, str1, str2, maxcount);
4137
4138    Py_DECREF(str1);
4139    Py_DECREF(str2);
4140    return result;
4141}
4142
4143static
4144PyObject *unicode_repr(PyObject *unicode)
4145{
4146    return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4147				PyUnicode_GET_SIZE(unicode),
4148				1);
4149}
4150
4151static char rfind__doc__[] =
4152"S.rfind(sub [,start [,end]]) -> int\n\
4153\n\
4154Return the highest index in S where substring sub is found,\n\
4155such that sub is contained within s[start,end].  Optional\n\
4156arguments start and end are interpreted as in slice notation.\n\
4157\n\
4158Return -1 on failure.";
4159
4160static PyObject *
4161unicode_rfind(PyUnicodeObject *self, PyObject *args)
4162{
4163    PyUnicodeObject *substring;
4164    int start = 0;
4165    int end = INT_MAX;
4166    PyObject *result;
4167
4168    if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4169		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4170        return NULL;
4171    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4172						(PyObject *)substring);
4173    if (substring == NULL)
4174	return NULL;
4175
4176    result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4177
4178    Py_DECREF(substring);
4179    return result;
4180}
4181
4182static char rindex__doc__[] =
4183"S.rindex(sub [,start [,end]]) -> int\n\
4184\n\
4185Like S.rfind() but raise ValueError when the substring is not found.";
4186
4187static PyObject *
4188unicode_rindex(PyUnicodeObject *self, PyObject *args)
4189{
4190    int result;
4191    PyUnicodeObject *substring;
4192    int start = 0;
4193    int end = INT_MAX;
4194
4195    if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4196		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4197        return NULL;
4198    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4199						(PyObject *)substring);
4200    if (substring == NULL)
4201	return NULL;
4202
4203    result = findstring(self, substring, start, end, -1);
4204
4205    Py_DECREF(substring);
4206    if (result < 0) {
4207        PyErr_SetString(PyExc_ValueError, "substring not found");
4208        return NULL;
4209    }
4210    return PyInt_FromLong(result);
4211}
4212
4213static char rjust__doc__[] =
4214"S.rjust(width) -> unicode\n\
4215\n\
4216Return S right justified in a Unicode string of length width. Padding is\n\
4217done using spaces.";
4218
4219static PyObject *
4220unicode_rjust(PyUnicodeObject *self, PyObject *args)
4221{
4222    int width;
4223    if (!PyArg_ParseTuple(args, "i:rjust", &width))
4224        return NULL;
4225
4226    if (self->length >= width) {
4227        Py_INCREF(self);
4228        return (PyObject*) self;
4229    }
4230
4231    return (PyObject*) pad(self, width - self->length, 0, ' ');
4232}
4233
4234static char rstrip__doc__[] =
4235"S.rstrip() -> unicode\n\
4236\n\
4237Return a copy of the string S with trailing whitespace removed.";
4238
4239static PyObject *
4240unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4241{
4242    if (!PyArg_NoArgs(args))
4243        return NULL;
4244    return strip(self, 0, 1);
4245}
4246
4247static PyObject*
4248unicode_slice(PyUnicodeObject *self, int start, int end)
4249{
4250    /* standard clamping */
4251    if (start < 0)
4252        start = 0;
4253    if (end < 0)
4254        end = 0;
4255    if (end > self->length)
4256        end = self->length;
4257    if (start == 0 && end == self->length) {
4258        /* full slice, return original string */
4259        Py_INCREF(self);
4260        return (PyObject*) self;
4261    }
4262    if (start > end)
4263        start = end;
4264    /* copy slice */
4265    return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4266					     end - start);
4267}
4268
4269PyObject *PyUnicode_Split(PyObject *s,
4270			  PyObject *sep,
4271			  int maxsplit)
4272{
4273    PyObject *result;
4274
4275    s = PyUnicode_FromObject(s);
4276    if (s == NULL)
4277	return NULL;
4278    if (sep != NULL) {
4279	sep = PyUnicode_FromObject(sep);
4280	if (sep == NULL) {
4281	    Py_DECREF(s);
4282	    return NULL;
4283	}
4284    }
4285
4286    result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4287
4288    Py_DECREF(s);
4289    Py_XDECREF(sep);
4290    return result;
4291}
4292
4293static char split__doc__[] =
4294"S.split([sep [,maxsplit]]) -> list of strings\n\
4295\n\
4296Return a list of the words in S, using sep as the\n\
4297delimiter string.  If maxsplit is given, at most maxsplit\n\
4298splits are done. If sep is not specified, any whitespace string\n\
4299is a separator.";
4300
4301static PyObject*
4302unicode_split(PyUnicodeObject *self, PyObject *args)
4303{
4304    PyObject *substring = Py_None;
4305    int maxcount = -1;
4306
4307    if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4308        return NULL;
4309
4310    if (substring == Py_None)
4311	return split(self, NULL, maxcount);
4312    else if (PyUnicode_Check(substring))
4313	return split(self, (PyUnicodeObject *)substring, maxcount);
4314    else
4315	return PyUnicode_Split((PyObject *)self, substring, maxcount);
4316}
4317
4318static char splitlines__doc__[] =
4319"S.splitlines([keepends]]) -> list of strings\n\
4320\n\
4321Return a list of the lines in S, breaking at line boundaries.\n\
4322Line breaks are not included in the resulting list unless keepends\n\
4323is given and true.";
4324
4325static PyObject*
4326unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4327{
4328    int keepends = 0;
4329
4330    if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
4331        return NULL;
4332
4333    return PyUnicode_Splitlines((PyObject *)self, keepends);
4334}
4335
4336static
4337PyObject *unicode_str(PyUnicodeObject *self)
4338{
4339    return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
4340}
4341
4342static char strip__doc__[] =
4343"S.strip() -> unicode\n\
4344\n\
4345Return a copy of S with leading and trailing whitespace removed.";
4346
4347static PyObject *
4348unicode_strip(PyUnicodeObject *self, PyObject *args)
4349{
4350    if (!PyArg_NoArgs(args))
4351        return NULL;
4352    return strip(self, 1, 1);
4353}
4354
4355static char swapcase__doc__[] =
4356"S.swapcase() -> unicode\n\
4357\n\
4358Return a copy of S with uppercase characters converted to lowercase\n\
4359and vice versa.";
4360
4361static PyObject*
4362unicode_swapcase(PyUnicodeObject *self, PyObject *args)
4363{
4364    if (!PyArg_NoArgs(args))
4365        return NULL;
4366    return fixup(self, fixswapcase);
4367}
4368
4369static char translate__doc__[] =
4370"S.translate(table) -> unicode\n\
4371\n\
4372Return a copy of the string S, where all characters have been mapped\n\
4373through the given translation table, which must be a mapping of\n\
4374Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4375are left untouched. Characters mapped to None are deleted.";
4376
4377static PyObject*
4378unicode_translate(PyUnicodeObject *self, PyObject *args)
4379{
4380    PyObject *table;
4381
4382    if (!PyArg_ParseTuple(args, "O:translate", &table))
4383	return NULL;
4384    return PyUnicode_TranslateCharmap(self->str,
4385				      self->length,
4386				      table,
4387				      "ignore");
4388}
4389
4390static char upper__doc__[] =
4391"S.upper() -> unicode\n\
4392\n\
4393Return a copy of S converted to uppercase.";
4394
4395static PyObject*
4396unicode_upper(PyUnicodeObject *self, PyObject *args)
4397{
4398    if (!PyArg_NoArgs(args))
4399        return NULL;
4400    return fixup(self, fixupper);
4401}
4402
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* Method S.zfill(width) (currently compiled out).

   Left-pads self with '0' up to width via pad().  If the original text
   started with '+' or '-', the sign is moved in front of the padding.
   A string already at least width long is returned unchanged with a
   new reference.  Returns NULL if argument parsing or pad() fails. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    if (u == NULL)
        /* must not touch u->str when padding failed */
        return NULL;

    /* u->str[fill] is the first character of the original text */
    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4438
#if 0
/* Debugging helper (currently compiled out): takes no arguments and
   returns the value of unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    if (PyArg_NoArgs(args))
        return PyInt_FromLong(unicode_freelist_size);
    return NULL;
}
#endif
4448
4449static char startswith__doc__[] =
4450"S.startswith(prefix[, start[, end]]) -> int\n\
4451\n\
4452Return 1 if S starts with the specified prefix, otherwise return 0.  With\n\
4453optional start, test S beginning at that position.  With optional end, stop\n\
4454comparing S at that position.";
4455
4456static PyObject *
4457unicode_startswith(PyUnicodeObject *self,
4458		   PyObject *args)
4459{
4460    PyUnicodeObject *substring;
4461    int start = 0;
4462    int end = INT_MAX;
4463    PyObject *result;
4464
4465    if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4466		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4467	return NULL;
4468    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4469						(PyObject *)substring);
4470    if (substring == NULL)
4471	return NULL;
4472
4473    result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4474
4475    Py_DECREF(substring);
4476    return result;
4477}
4478
4479
4480static char endswith__doc__[] =
4481"S.endswith(suffix[, start[, end]]) -> int\n\
4482\n\
4483Return 1 if S ends with the specified suffix, otherwise return 0.  With\n\
4484optional start, test S beginning at that position.  With optional end, stop\n\
4485comparing S at that position.";
4486
4487static PyObject *
4488unicode_endswith(PyUnicodeObject *self,
4489		 PyObject *args)
4490{
4491    PyUnicodeObject *substring;
4492    int start = 0;
4493    int end = INT_MAX;
4494    PyObject *result;
4495
4496    if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4497		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4498	return NULL;
4499    substring = (PyUnicodeObject *)PyUnicode_FromObject(
4500						(PyObject *)substring);
4501    if (substring == NULL)
4502	return NULL;
4503
4504    result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4505
4506    Py_DECREF(substring);
4507    return result;
4508}
4509
4510
/* Method table for Unicode objects; unicode_getattr() looks names up
   in it through Py_FindMethod().

   Each entry is {name, C function, flag, doc string}.  For the
   functions visible in this part of the file, entries with flag 1
   parse their arguments with PyArg_ParseTuple() and entries with
   flag 0 check them with PyArg_NoArgs().  The table ends with a
   {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
    {"split", (PyCFunction) unicode_split, 1, split__doc__},
    {"join", (PyCFunction) unicode_join, 1, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, 0, title__doc__},
    {"center", (PyCFunction) unicode_center, 1, center__doc__},
    {"count", (PyCFunction) unicode_count, 1, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, 1, find__doc__},
    {"index", (PyCFunction) unicode_index, 1, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
#if 0
    /* Disabled entries; unicode_zfill itself is also compiled out. */
    {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
    {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
#endif

    {NULL, NULL}
};
4563
4564static PyObject *
4565unicode_getattr(PyUnicodeObject *self, char *name)
4566{
4567    return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4568}
4569
/* Sequence protocol slots for Unicode objects.  Both assignment slots
   are 0, i.e. no item or slice assignment handler is installed. */
static PySequenceMethods unicode_as_sequence = {
    (inquiry) unicode_length, 		/* sq_length */
    (binaryfunc) PyUnicode_Concat, 	/* sq_concat */
    (intargfunc) unicode_repeat, 	/* sq_repeat */
    (intargfunc) unicode_getitem, 	/* sq_item */
    (intintargfunc) unicode_slice, 	/* sq_slice */
    0, 					/* sq_ass_item */
    0, 					/* sq_ass_slice */
    (objobjproc)PyUnicode_Contains, 	/*sq_contains*/
};
4580
4581static int
4582unicode_buffer_getreadbuf(PyUnicodeObject *self,
4583			  int index,
4584			  const void **ptr)
4585{
4586    if (index != 0) {
4587        PyErr_SetString(PyExc_SystemError,
4588			"accessing non-existent unicode segment");
4589        return -1;
4590    }
4591    *ptr = (void *) self->str;
4592    return PyUnicode_GET_DATA_SIZE(self);
4593}
4594
4595static int
4596unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4597			   const void **ptr)
4598{
4599    PyErr_SetString(PyExc_TypeError,
4600		    "cannot use unicode as modifyable buffer");
4601    return -1;
4602}
4603
4604static int
4605unicode_buffer_getsegcount(PyUnicodeObject *self,
4606			   int *lenp)
4607{
4608    if (lenp)
4609        *lenp = PyUnicode_GET_DATA_SIZE(self);
4610    return 1;
4611}
4612
4613static int
4614unicode_buffer_getcharbuf(PyUnicodeObject *self,
4615			  int index,
4616			  const void **ptr)
4617{
4618    PyObject *str;
4619
4620    if (index != 0) {
4621        PyErr_SetString(PyExc_SystemError,
4622			"accessing non-existent unicode segment");
4623        return -1;
4624    }
4625    str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
4626    if (str == NULL)
4627	return -1;
4628    *ptr = (void *) PyString_AS_STRING(str);
4629    return PyString_GET_SIZE(str);
4630}
4631
4632/* Helpers for PyUnicode_Format() */
4633
4634static PyObject *
4635getnextarg(PyObject *args, int arglen, int *p_argidx)
4636{
4637    int argidx = *p_argidx;
4638    if (argidx < arglen) {
4639	(*p_argidx)++;
4640	if (arglen < 0)
4641	    return args;
4642	else
4643	    return PyTuple_GetItem(args, argidx);
4644    }
4645    PyErr_SetString(PyExc_TypeError,
4646		    "not enough arguments for format string");
4647    return NULL;
4648}
4649
/* Conversion flags collected by PyUnicode_Format() while it parses a
   format specifier.  Each flag is a distinct bit so they can be OR-ed
   together into a single int. */
#define F_LJUST (1<<0)	/* '-' flag; also set for a negative '*' width */
#define F_SIGN	(1<<1)	/* '+' flag */
#define F_BLANK (1<<2)	/* ' ' flag */
#define F_ALT	(1<<3)	/* '#' flag */
#define F_ZERO	(1<<4)	/* '0' flag */
4655
4656static
4657int usprintf(register Py_UNICODE *buffer, char *format, ...)
4658{
4659    register int i;
4660    int len;
4661    va_list va;
4662    char *charbuffer;
4663    va_start(va, format);
4664
4665    /* First, format the string as char array, then expand to Py_UNICODE
4666       array. */
4667    charbuffer = (char *)buffer;
4668    len = vsprintf(charbuffer, format, va);
4669    for (i = len - 1; i >= 0; i--)
4670	buffer[i] = (Py_UNICODE) charbuffer[i];
4671
4672    va_end(va);
4673    return len;
4674}
4675
4676static int
4677formatfloat(Py_UNICODE *buf,
4678	    size_t buflen,
4679	    int flags,
4680	    int prec,
4681	    int type,
4682	    PyObject *v)
4683{
4684    /* fmt = '%#.' + `prec` + `type`
4685       worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
4686    char fmt[20];
4687    double x;
4688
4689    x = PyFloat_AsDouble(v);
4690    if (x == -1.0 && PyErr_Occurred())
4691	return -1;
4692    if (prec < 0)
4693	prec = 6;
4694    if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4695	type = 'g';
4696    sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
4697    /* worst case length calc to ensure no buffer overrun:
4698         fmt = %#.<prec>g
4699         buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4700            for any double rep.)
4701         len = 1 + prec + 1 + 2 + 5 = 9 + prec
4702       If prec=0 the effective precision is 1 (the leading digit is
4703       always given), therefore increase by one to 10+prec. */
4704    if (buflen <= (size_t)10 + (size_t)prec) {
4705	PyErr_SetString(PyExc_OverflowError,
4706	    "formatted float is too long (precision too long?)");
4707	return -1;
4708    }
4709    return usprintf(buf, fmt, x);
4710}
4711
4712static PyObject*
4713formatlong(PyObject *val, int flags, int prec, int type)
4714{
4715	char *buf;
4716	int i, len;
4717	PyObject *str; /* temporary string object. */
4718	PyUnicodeObject *result;
4719
4720	str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
4721	if (!str)
4722		return NULL;
4723	result = _PyUnicode_New(len);
4724	for (i = 0; i < len; i++)
4725		result->str[i] = buf[i];
4726	result->str[len] = 0;
4727	Py_DECREF(str);
4728	return (PyObject*)result;
4729}
4730
4731static int
4732formatint(Py_UNICODE *buf,
4733	  size_t buflen,
4734	  int flags,
4735	  int prec,
4736	  int type,
4737	  PyObject *v)
4738{
4739    /* fmt = '%#.' + `prec` + 'l' + `type`
4740       worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
4741       + 1 + 1 = 24*/
4742    char fmt[64]; /* plenty big enough! */
4743    long x;
4744    int use_native_c_format = 1;
4745
4746    x = PyInt_AsLong(v);
4747    if (x == -1 && PyErr_Occurred())
4748	return -1;
4749    if (prec < 0)
4750	prec = 1;
4751    /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4752       worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4753    if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4754        PyErr_SetString(PyExc_OverflowError,
4755            "formatted integer is too long (precision too long?)");
4756        return -1;
4757    }
4758    /* When converting 0 under %#x or %#X, C leaves off the base marker,
4759     * but we want it (for consistency with other %#x conversions, and
4760     * for consistency with Python's hex() function).
4761     * BUG 28-Apr-2001 tim:  At least two platform Cs (Metrowerks &
4762     * Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
4763     * So add it only if the platform doesn't already.
4764     */
4765    if (x == 0 && (flags & F_ALT) && (type == 'x' || type == 'X')) {
4766        /* Only way to know what the platform does is to try it. */
4767        sprintf(fmt, type == 'x' ? "%#x" : "%#X", 0);
4768        if (fmt[1] != (char)type) {
4769            /* Supply our own leading 0x/0X -- needed under std C */
4770            use_native_c_format = 0;
4771            sprintf(fmt, "0%c%%#.%dl%c", type, prec, type);
4772        }
4773    }
4774    if (use_native_c_format)
4775         sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4776    return usprintf(buf, fmt, x);
4777}
4778
4779static int
4780formatchar(Py_UNICODE *buf,
4781           size_t buflen,
4782           PyObject *v)
4783{
4784    /* presume that the buffer is at least 2 characters long */
4785    if (PyUnicode_Check(v)) {
4786	if (PyUnicode_GET_SIZE(v) != 1)
4787	    goto onError;
4788	buf[0] = PyUnicode_AS_UNICODE(v)[0];
4789    }
4790
4791    else if (PyString_Check(v)) {
4792	if (PyString_GET_SIZE(v) != 1)
4793	    goto onError;
4794	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4795    }
4796
4797    else {
4798	/* Integer input truncated to a character */
4799        long x;
4800	x = PyInt_AsLong(v);
4801	if (x == -1 && PyErr_Occurred())
4802	    goto onError;
4803	buf[0] = (char) x;
4804    }
4805    buf[1] = '\0';
4806    return 1;
4807
4808 onError:
4809    PyErr_SetString(PyExc_TypeError,
4810		    "%c requires int or char");
4811    return -1;
4812}
4813
4814/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
4815
4816   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
4817   chars are formatted. XXX This is a magic number. Each formatting
4818   routine does bounds checking to ensure no overflow, but a better
4819   solution may be to malloc a buffer of appropriate size for each
4820   format. For now, the current solution is sufficient.
4821*/
4822#define FORMATBUFLEN (size_t)120
4823
4824PyObject *PyUnicode_Format(PyObject *format,
4825			   PyObject *args)
4826{
4827    Py_UNICODE *fmt, *res;
4828    int fmtcnt, rescnt, reslen, arglen, argidx;
4829    int args_owned = 0;
4830    PyUnicodeObject *result = NULL;
4831    PyObject *dict = NULL;
4832    PyObject *uformat;
4833
4834    if (format == NULL || args == NULL) {
4835	PyErr_BadInternalCall();
4836	return NULL;
4837    }
4838    uformat = PyUnicode_FromObject(format);
4839    if (uformat == NULL)
4840	return NULL;
4841    fmt = PyUnicode_AS_UNICODE(uformat);
4842    fmtcnt = PyUnicode_GET_SIZE(uformat);
4843
4844    reslen = rescnt = fmtcnt + 100;
4845    result = _PyUnicode_New(reslen);
4846    if (result == NULL)
4847	goto onError;
4848    res = PyUnicode_AS_UNICODE(result);
4849
4850    if (PyTuple_Check(args)) {
4851	arglen = PyTuple_Size(args);
4852	argidx = 0;
4853    }
4854    else {
4855	arglen = -1;
4856	argidx = -2;
4857    }
4858    if (args->ob_type->tp_as_mapping)
4859	dict = args;
4860
4861    while (--fmtcnt >= 0) {
4862	if (*fmt != '%') {
4863	    if (--rescnt < 0) {
4864		rescnt = fmtcnt + 100;
4865		reslen += rescnt;
4866		if (_PyUnicode_Resize(&result, reslen) < 0)
4867		    return NULL;
4868		res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4869		--rescnt;
4870	    }
4871	    *res++ = *fmt++;
4872	}
4873	else {
4874	    /* Got a format specifier */
4875	    int flags = 0;
4876	    int width = -1;
4877	    int prec = -1;
4878	    Py_UNICODE c = '\0';
4879	    Py_UNICODE fill;
4880	    PyObject *v = NULL;
4881	    PyObject *temp = NULL;
4882	    Py_UNICODE *pbuf;
4883	    Py_UNICODE sign;
4884	    int len;
4885	    Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
4886
4887	    fmt++;
4888	    if (*fmt == '(') {
4889		Py_UNICODE *keystart;
4890		int keylen;
4891		PyObject *key;
4892		int pcount = 1;
4893
4894		if (dict == NULL) {
4895		    PyErr_SetString(PyExc_TypeError,
4896				    "format requires a mapping");
4897		    goto onError;
4898		}
4899		++fmt;
4900		--fmtcnt;
4901		keystart = fmt;
4902		/* Skip over balanced parentheses */
4903		while (pcount > 0 && --fmtcnt >= 0) {
4904		    if (*fmt == ')')
4905			--pcount;
4906		    else if (*fmt == '(')
4907			++pcount;
4908		    fmt++;
4909		}
4910		keylen = fmt - keystart - 1;
4911		if (fmtcnt < 0 || pcount > 0) {
4912		    PyErr_SetString(PyExc_ValueError,
4913				    "incomplete format key");
4914		    goto onError;
4915		}
4916		/* keys are converted to strings using UTF-8 and
4917		   then looked up since Python uses strings to hold
4918		   variables names etc. in its namespaces and we
4919		   wouldn't want to break common idioms. */
4920		key = PyUnicode_EncodeUTF8(keystart,
4921					   keylen,
4922					   NULL);
4923		if (key == NULL)
4924		    goto onError;
4925		if (args_owned) {
4926		    Py_DECREF(args);
4927		    args_owned = 0;
4928		}
4929		args = PyObject_GetItem(dict, key);
4930		Py_DECREF(key);
4931		if (args == NULL) {
4932		    goto onError;
4933		}
4934		args_owned = 1;
4935		arglen = -1;
4936		argidx = -2;
4937	    }
4938	    while (--fmtcnt >= 0) {
4939		switch (c = *fmt++) {
4940		case '-': flags |= F_LJUST; continue;
4941		case '+': flags |= F_SIGN; continue;
4942		case ' ': flags |= F_BLANK; continue;
4943		case '#': flags |= F_ALT; continue;
4944		case '0': flags |= F_ZERO; continue;
4945		}
4946		break;
4947	    }
4948	    if (c == '*') {
4949		v = getnextarg(args, arglen, &argidx);
4950		if (v == NULL)
4951		    goto onError;
4952		if (!PyInt_Check(v)) {
4953		    PyErr_SetString(PyExc_TypeError,
4954				    "* wants int");
4955		    goto onError;
4956		}
4957		width = PyInt_AsLong(v);
4958		if (width < 0) {
4959		    flags |= F_LJUST;
4960		    width = -width;
4961		}
4962		if (--fmtcnt >= 0)
4963		    c = *fmt++;
4964	    }
4965	    else if (c >= '0' && c <= '9') {
4966		width = c - '0';
4967		while (--fmtcnt >= 0) {
4968		    c = *fmt++;
4969		    if (c < '0' || c > '9')
4970			break;
4971		    if ((width*10) / 10 != width) {
4972			PyErr_SetString(PyExc_ValueError,
4973					"width too big");
4974			goto onError;
4975		    }
4976		    width = width*10 + (c - '0');
4977		}
4978	    }
4979	    if (c == '.') {
4980		prec = 0;
4981		if (--fmtcnt >= 0)
4982		    c = *fmt++;
4983		if (c == '*') {
4984		    v = getnextarg(args, arglen, &argidx);
4985		    if (v == NULL)
4986			goto onError;
4987		    if (!PyInt_Check(v)) {
4988			PyErr_SetString(PyExc_TypeError,
4989					"* wants int");
4990			goto onError;
4991		    }
4992		    prec = PyInt_AsLong(v);
4993		    if (prec < 0)
4994			prec = 0;
4995		    if (--fmtcnt >= 0)
4996			c = *fmt++;
4997		}
4998		else if (c >= '0' && c <= '9') {
4999		    prec = c - '0';
5000		    while (--fmtcnt >= 0) {
5001			c = Py_CHARMASK(*fmt++);
5002			if (c < '0' || c > '9')
5003			    break;
5004			if ((prec*10) / 10 != prec) {
5005			    PyErr_SetString(PyExc_ValueError,
5006					    "prec too big");
5007			    goto onError;
5008			}
5009			prec = prec*10 + (c - '0');
5010		    }
5011		}
5012	    } /* prec */
5013	    if (fmtcnt >= 0) {
5014		if (c == 'h' || c == 'l' || c == 'L') {
5015		    if (--fmtcnt >= 0)
5016			c = *fmt++;
5017		}
5018	    }
5019	    if (fmtcnt < 0) {
5020		PyErr_SetString(PyExc_ValueError,
5021				"incomplete format");
5022		goto onError;
5023	    }
5024	    if (c != '%') {
5025		v = getnextarg(args, arglen, &argidx);
5026		if (v == NULL)
5027		    goto onError;
5028	    }
5029	    sign = 0;
5030	    fill = ' ';
5031	    switch (c) {
5032
5033	    case '%':
5034		pbuf = formatbuf;
5035		/* presume that buffer length is at least 1 */
5036		pbuf[0] = '%';
5037		len = 1;
5038		break;
5039
5040	    case 's':
5041	    case 'r':
5042		if (PyUnicode_Check(v) && c == 's') {
5043		    temp = v;
5044		    Py_INCREF(temp);
5045		}
5046		else {
5047		    PyObject *unicode;
5048		    if (c == 's')
5049			temp = PyObject_Str(v);
5050		    else
5051			temp = PyObject_Repr(v);
5052		    if (temp == NULL)
5053			goto onError;
5054		    if (!PyString_Check(temp)) {
5055			/* XXX Note: this should never happen, since
5056   			       PyObject_Repr() and PyObject_Str() assure
5057			       this */
5058			Py_DECREF(temp);
5059			PyErr_SetString(PyExc_TypeError,
5060					"%s argument has non-string str()");
5061			goto onError;
5062		    }
5063		    unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
5064						   PyString_GET_SIZE(temp),
5065					       NULL,
5066						   "strict");
5067		    Py_DECREF(temp);
5068		    temp = unicode;
5069		    if (temp == NULL)
5070			goto onError;
5071		}
5072		pbuf = PyUnicode_AS_UNICODE(temp);
5073		len = PyUnicode_GET_SIZE(temp);
5074		if (prec >= 0 && len > prec)
5075		    len = prec;
5076		break;
5077
5078	    case 'i':
5079	    case 'd':
5080	    case 'u':
5081	    case 'o':
5082	    case 'x':
5083	    case 'X':
5084		if (c == 'i')
5085		    c = 'd';
5086		if (PyLong_Check(v)) {
5087		    temp = formatlong(v, flags, prec, c);
5088		    if (!temp)
5089			goto onError;
5090		    pbuf = PyUnicode_AS_UNICODE(temp);
5091		    len = PyUnicode_GET_SIZE(temp);
5092		    /* unbounded ints can always produce
5093		       a sign character! */
5094		    sign = 1;
5095		}
5096		else {
5097		    pbuf = formatbuf;
5098		    len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5099				    flags, prec, c, v);
5100		    if (len < 0)
5101			goto onError;
5102		    /* only d conversion is signed */
5103		    sign = c == 'd';
5104		}
5105		if (flags & F_ZERO)
5106		    fill = '0';
5107		break;
5108
5109	    case 'e':
5110	    case 'E':
5111	    case 'f':
5112	    case 'g':
5113	    case 'G':
5114		pbuf = formatbuf;
5115		len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5116			flags, prec, c, v);
5117		if (len < 0)
5118		    goto onError;
5119		sign = 1;
5120		if (flags & F_ZERO)
5121		    fill = '0';
5122		break;
5123
5124	    case 'c':
5125		pbuf = formatbuf;
5126		len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
5127		if (len < 0)
5128		    goto onError;
5129		break;
5130
5131	    default:
5132		PyErr_Format(PyExc_ValueError,
5133			     "unsupported format character '%c' (0x%x) "
5134			     "at index %i",
5135			     (31<=c && c<=126) ? c : '?',
5136                             c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
5137		goto onError;
5138	    }
5139	    if (sign) {
5140		if (*pbuf == '-' || *pbuf == '+') {
5141		    sign = *pbuf++;
5142		    len--;
5143		}
5144		else if (flags & F_SIGN)
5145		    sign = '+';
5146		else if (flags & F_BLANK)
5147		    sign = ' ';
5148		else
5149		    sign = 0;
5150	    }
5151	    if (width < len)
5152		width = len;
5153	    if (rescnt < width + (sign != 0)) {
5154		reslen -= rescnt;
5155		rescnt = width + fmtcnt + 100;
5156		reslen += rescnt;
5157		if (_PyUnicode_Resize(&result, reslen) < 0)
5158		    return NULL;
5159		res = PyUnicode_AS_UNICODE(result)
5160		    + reslen - rescnt;
5161	    }
5162	    if (sign) {
5163		if (fill != ' ')
5164		    *res++ = sign;
5165		rescnt--;
5166		if (width > len)
5167		    width--;
5168	    }
5169	    if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5170		assert(pbuf[0] == '0');
5171		assert(pbuf[1] == c);
5172		if (fill != ' ') {
5173		    *res++ = *pbuf++;
5174		    *res++ = *pbuf++;
5175		}
5176		rescnt -= 2;
5177		width -= 2;
5178		if (width < 0)
5179		    width = 0;
5180		len -= 2;
5181	    }
5182	    if (width > len && !(flags & F_LJUST)) {
5183		do {
5184		    --rescnt;
5185		    *res++ = fill;
5186		} while (--width > len);
5187	    }
5188	    if (fill == ' ') {
5189		if (sign)
5190		    *res++ = sign;
5191		if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5192		    assert(pbuf[0] == '0');
5193		    assert(pbuf[1] == c);
5194		    *res++ = *pbuf++;
5195		    *res++ = *pbuf++;
5196		}
5197	    }
5198	    Py_UNICODE_COPY(res, pbuf, len);
5199	    res += len;
5200	    rescnt -= len;
5201	    while (--width >= len) {
5202		--rescnt;
5203		*res++ = ' ';
5204	    }
5205	    if (dict && (argidx < arglen) && c != '%') {
5206		PyErr_SetString(PyExc_TypeError,
5207				"not all arguments converted");
5208		goto onError;
5209	    }
5210	    Py_XDECREF(temp);
5211	} /* '%' */
5212    } /* until end */
5213    if (argidx < arglen && !dict) {
5214	PyErr_SetString(PyExc_TypeError,
5215			"not all arguments converted");
5216	goto onError;
5217    }
5218
5219    if (args_owned) {
5220	Py_DECREF(args);
5221    }
5222    Py_DECREF(uformat);
5223    if (_PyUnicode_Resize(&result, reslen - rescnt))
5224	goto onError;
5225    return (PyObject *)result;
5226
5227 onError:
5228    Py_XDECREF(result);
5229    Py_DECREF(uformat);
5230    if (args_owned) {
5231	Py_DECREF(args);
5232    }
5233    return NULL;
5234}
5235
/* Buffer protocol slots for unicode objects, in the order read buffer,
   write buffer, segment count, character buffer.  The write-buffer
   handler always fails with TypeError. */
static PyBufferProcs unicode_as_buffer = {
    (getreadbufferproc) unicode_buffer_getreadbuf,
    (getwritebufferproc) unicode_buffer_getwritebuf,
    (getsegcountproc) unicode_buffer_getsegcount,
    (getcharbufferproc) unicode_buffer_getcharbuf,
};
5242
/* The type object for unicode.  It wires up the slot implementations
   from this file: deallocation, attribute lookup (unicode_getattr),
   comparison, repr/str, hashing, and the sequence and buffer protocol
   tables.  The print, setattr, number, mapping, call and
   getattro/setattro slots are left empty. */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0, 					/* ob_size */
    "unicode", 				/* tp_name */
    sizeof(PyUnicodeObject), 		/* tp_size */
    0, 					/* tp_itemsize */
    /* Slots */
    (destructor)_PyUnicode_Free, 	/* tp_dealloc */
    0, 					/* tp_print */
    (getattrfunc)unicode_getattr, 	/* tp_getattr */
    0, 					/* tp_setattr */
    (cmpfunc) unicode_compare, 		/* tp_compare */
    (reprfunc) unicode_repr, 		/* tp_repr */
    0, 					/* tp_as_number */
    &unicode_as_sequence, 		/* tp_as_sequence */
    0, 					/* tp_as_mapping */
    (hashfunc) unicode_hash, 		/* tp_hash*/
    0, 					/* tp_call*/
    (reprfunc) unicode_str,	 	/* tp_str */
    (getattrofunc) NULL, 		/* tp_getattro */
    (setattrofunc) NULL, 		/* tp_setattro */
    &unicode_as_buffer,			/* tp_as_buffer */
    Py_TPFLAGS_DEFAULT,			/* tp_flags */
};
5267
5268/* Initialize the Unicode implementation */
5269
5270void _PyUnicode_Init(void)
5271{
5272    int i;
5273
5274    /* Doublecheck the configuration... */
5275    if (sizeof(Py_UNICODE) != 2)
5276        Py_FatalError("Unicode configuration error: "
5277		      "sizeof(Py_UNICODE) != 2 bytes");
5278
5279    /* Init the implementation */
5280    unicode_freelist = NULL;
5281    unicode_freelist_size = 0;
5282    unicode_empty = _PyUnicode_New(0);
5283    strcpy(unicode_default_encoding, "ascii");
5284    for (i = 0; i < 256; i++)
5285	unicode_latin1[i] = NULL;
5286}
5287
5288/* Finalize the Unicode implementation */
5289
5290void
5291_PyUnicode_Fini(void)
5292{
5293    PyUnicodeObject *u;
5294    int i;
5295
5296    Py_XDECREF(unicode_empty);
5297    unicode_empty = NULL;
5298
5299    for (i = 0; i < 256; i++) {
5300	if (unicode_latin1[i]) {
5301	    Py_DECREF(unicode_latin1[i]);
5302	    unicode_latin1[i] = NULL;
5303	}
5304    }
5305
5306    for (u = unicode_freelist; u != NULL;) {
5307	PyUnicodeObject *v = u;
5308	u = *(PyUnicodeObject **)u;
5309	if (v->str)
5310	    PyMem_DEL(v->str);
5311	Py_XDECREF(v->defenc);
5312	PyObject_DEL(v);
5313    }
5314    unicode_freelist = NULL;
5315    unicode_freelist_size = 0;
5316}
5317